|
From: Eric A. <de...@us...> - 2004-03-16 23:28:06
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7756/lib/Sprawler

Modified Files:
  Client.pm
Log Message:
Added more functions to client.pm (indexes more types of data)

Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** Client.pm 10 Mar 2004 05:42:27 -0000 1.35
--- Client.pm 16 Mar 2004 23:18:31 -0000 1.36
***************
*** 337,340 ****
--- 337,367 ----
  @{$self->{TITLEWORDS}}=@titlewords;

+ @header=undef;
+ @{$self->{HEADERWORDS}}=undef;
+ $parser->set_tag('h1'); #header words
+ @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+ @{$self->{HEADERWORDS}}=@headerwords;
+ $parser->set_tag('h2');
+ @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+ push(@{$self->{HEADERWORDS}},@headerwords);
+
+ @marqueewords=undef;
+ @{$self->{MARQUEEWORDS}}=undef;
+ $parser->set_tag('marquee'); #marquee words
+ @marqueewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+ @{$self->{MARQUEEWORDS}}=@marqueewords;
+
+ @prewords=undef;
+ @{$self->{PREWORDS}}=undef;
+ $parser->set_tag('pre'); #"pre"formatted words
+ @prewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+ @{$self->{PREWORDS}}=@prewords;
+
+ @liwords=undef;
+ @{$self->{LIWORDS}}=undef;
+ $parser->set_tag('li'); #"li"ne words
+ @liwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+ @{$self->{LIWORDS}}=@liwords;
+
  @boldwords=undef;
  @{$self->{BOLDWORDS}}=undef;
***************
*** 364,368 ****


! my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS","URLS","EMAILS");
  ${$self->{URL_DB}}{URL} = $document;
  foreach my $indextype (@indextypes) {
--- 391,395 ----


! my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS");
  ${$self->{URL_DB}}{URL} = $document;
  foreach my $indextype (@indextypes) {
|