|
From: Eric A. <de...@us...> - 2004-03-25 05:17:29
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30378/lib/Sprawler

Modified Files:
  Client.pm
Log Message:
- added title logging, fixed some items that were missed.

Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** Client.pm 25 Mar 2004 04:40:36 -0000 1.37
--- Client.pm 25 Mar 2004 05:06:45 -0000 1.38
***************
*** 74,82 ****
  $self->{CONTENT_TYPE}=undef;
  $self->{TITLEWORDS}=[];
  $self->{BOLDWORDS}=[];
  $self->{ITALICWORDS}=[];
  $self->{HTMLWORDS}=[];
  $self->{SHORTWORDS}=[];
! $self->{STOPWORDS}={};
--- 74,89 ----
  $self->{CONTENT_TYPE}=undef;
  $self->{TITLEWORDS}=[];
+ $self->{TITLE}=undef;
  $self->{BOLDWORDS}=[];
  $self->{ITALICWORDS}=[];
  $self->{HTMLWORDS}=[];
  $self->{SHORTWORDS}=[];
! $self->{URLS}=[];
! $self->{EMAILS}=[];
! $self->{PREWORDS}=[];
! $self->{LIWORDS}=[];
! $self->{HEADERWORDS}=[];
! $self->{MARQUEEWORDS}=[];
! $self->{STOPWORDS}={};
***************
*** 335,343 ****
  }
  $parser->set_tag('title');
! my $title = $parser -> extract_text ("$doctext");
  @titlewords=undef;
  @{$self->{TITLEWORDS}}=undef;
! @titlewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
  @{$self->{TITLEWORDS}}=@titlewords;
--- 342,351 ----
  }
  $parser->set_tag('title');
! my $title = $parser->extract_text("$doctext");
  @titlewords=undef;
  @{$self->{TITLEWORDS}}=undef;
! $self->{TITLE}=$title;
! @titlewords=$self->clean($parser->get_words($self->{TITLE}));
  @{$self->{TITLEWORDS}}=@titlewords;
***************
*** 398,401 ****
--- 406,410 ----
  my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS");
  ${$self->{URL_DB}}{URL} = $document;
+ ${$self->{URL_DB}}{TITLE} = $self->{TITLE};
  foreach my $indextype (@indextypes) {
  ${$self->{URL_DB}}{$indextype} = undef;
|