From: Bill M. <whm...@us...> - 2003-08-30 05:03:00
|
Update of /cvsroot/swishe/swish-e/prog-bin In directory sc8-pr-cvs1:/tmp/cvs-serv11756/prog-bin Modified Files: SwishSpiderConfig.pl spider.pl.in Log Message: I thought I checked this in already. These are updates to swishspider and spider.pl to use the OO interface of SWISH::Filter. Index: SwishSpiderConfig.pl =================================================================== RCS file: /cvsroot/swishe/swish-e/prog-bin/SwishSpiderConfig.pl,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- SwishSpiderConfig.pl 13 Apr 2003 19:08:03 -0000 1.12 +++ SwishSpiderConfig.pl 30 Aug 2003 05:02:55 -0000 1.13 @@ -233,56 +233,24 @@ # If not filtered return false and doc will be ignored (not indexed) - return unless $filter->filter( + my $doc $filter->content( document => $content_ref, name => $response->base, content_type => $content_type, ); + return unless $doc; + # return unless $doc->was_filtered # could do this since checking for text/* above + return if $doc->is_binary; # nicer to use **char... - $$content_ref = ${$filter->fetch_doc}; + $$content_ref = ${$doc->fetch_doc}; # let's see if we can set the parser. - $server->{parser_type} = $filter->swish_parser_type || ''; + $server->{parser_type} = $doc->swish_parser_type || ''; return 1; } - - - -# Here's othre ways to filter. - -# This converts PDF files into HTML. 
The second parameter of -# pdf2html tells which pfd info filed to set as <title> - -use pdf2html; # included example pdf converter module -sub pdf { - my ( $uri, $server, $response, $content_ref ) = @_; - - return 1 unless $response->content_type eq 'application/pdf'; - - # for logging counts - $server->{counts}{'PDF transformed'}++; - - $$content_ref = ${pdf2html( $content_ref, 'title' )}; - $$content_ref =~ tr/ / /s; - return 1; -} - -use doc2txt; # included example pdf converter module -sub doc { - my ( $uri, $server, $response, $content_ref ) = @_; - - return 1 unless $response->content_type eq 'application/msword'; - - # for logging counts - $server->{counts}{'DOC transformed'}++; - - $$content_ref = ${doc2txt( $content_ref )}; - $$content_ref =~ tr/ / /s; - return 1; -} # Must return true... Index: spider.pl.in =================================================================== RCS file: /cvsroot/swishe/swish-e/prog-bin/spider.pl.in,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- spider.pl.in 28 Jul 2003 23:44:50 -0000 1.6 +++ spider.pl.in 30 Aug 2003 05:02:55 -0000 1.7 @@ -641,6 +641,7 @@ } + # Check for meta refresh # requires that $ua->parse_head() is enabled (the default) @@ -658,7 +659,7 @@ } - return unless $content; # $$$ any reason to index empty files? +# return unless $content; # $$$ any reason to index empty files? 
# make sure content is unique - probably better to chunk into an MD5 object above @@ -1046,21 +1047,26 @@ my ( $uri, $server, $response, $content_ref ) = @_; my $content_type = $response->content_type; - # Ignore text/* content type -- no need to filter return 1 if !$content_type || $content_type =~ m!^text/!; - return unless $filter->filter( + my $doc = $filter->convert( document => $content_ref, name => $response->base, content_type => $content_type, ); + return 1 unless $doc; # so just proceed as if not using filter + + if ( $doc->is_binary ) { # ignore "binary" files (not text/* mime type) + die "Skipping " . $response->base . " due to content type: " . $doc->content_type ." may be binary\n"; + } + # nicer to use **char... - $$content_ref = ${$filter->fetch_doc}; + $$content_ref = ${$doc->fetch_doc}; # let's see if we can set the parser. - $server->{parser_type} = $filter->swish_parser_type || ''; + $server->{parser_type} = $doc->swish_parser_type || ''; return 1; } @@ -1086,7 +1092,7 @@ email => 'sw...@us...valid', link_tags => [qw/ a frame /], keep_alive => 1, - test_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i }, + test_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i }, test_response => $response_sub, filter_content => $filter_sub, validate_links => $validate, |