CVS: swish-e/prog-bin SwishSpiderConfig.pl,1.12,1.13 spider.pl.in,1.6,1.7

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/swishe/swish-e/prog-bin
In directory sc8-pr-cvs1:/tmp/cvs-serv11756/prog-bin

Modified Files:
	SwishSpiderConfig.pl spider.pl.in 
Log Message:
I thought I checked this in already.

These are updates to swishspider and spider.pl to use the OO interface
of SWISH::Filter.



Index: SwishSpiderConfig.pl
===================================================================
RCS file: /cvsroot/swishe/swish-e/prog-bin/SwishSpiderConfig.pl,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -r1.12 -r1.13

--- SwishSpiderConfig.pl	13 Apr 2003 19:08:03 -0000	1.12
+++ SwishSpiderConfig.pl	30 Aug 2003 05:02:55 -0000	1.13
@@ -233,56 +233,24 @@
 
     # If not filtered return false and doc will be ignored (not indexed)
     
-    return unless $filter->filter(
+    my $doc $filter->content(
         document => $content_ref,
         name     => $response->base,
         content_type => $content_type,
     );
+    return unless $doc;
+    # return unless $doc->was_filtered # could do this since checking for text/* above
+    return if $doc->is_binary;
 
     # nicer to use **char...
-    $$content_ref = ${$filter->fetch_doc};
+    $$content_ref = ${$doc->fetch_doc};
 
     # let's see if we can set the parser.
-    $server->{parser_type} = $filter->swish_parser_type || '';
+    $server->{parser_type} = $doc->swish_parser_type || '';
 
     return 1;
 }
 
-
-    
-
-# Here's othre ways to filter.
-
-# This converts PDF files into HTML.  The second parameter of
-# pdf2html tells which pfd info filed to set as <title>
-
-use pdf2html;  # included example pdf converter module
-sub pdf {
-   my ( $uri, $server, $response, $content_ref ) = @_;
-
-   return 1 unless $response->content_type eq 'application/pdf';
-
-   # for logging counts
-   $server->{counts}{'PDF transformed'}++;
-
-   $$content_ref = ${pdf2html( $content_ref, 'title' )};
-   $$content_ref =~ tr/ / /s;
-   return 1;
-}
-
-use doc2txt;  # included example pdf converter module
-sub doc {
-   my ( $uri, $server, $response, $content_ref ) = @_;
-
-   return 1 unless $response->content_type eq 'application/msword';
-
-   # for logging counts
-   $server->{counts}{'DOC transformed'}++;
-
-   $$content_ref = ${doc2txt( $content_ref )};
-   $$content_ref =~ tr/ / /s;
-   return 1;
-}
 
 # Must return true...
 

Index: spider.pl.in
===================================================================
RCS file: /cvsroot/swishe/swish-e/prog-bin/spider.pl.in,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- spider.pl.in	28 Jul 2003 23:44:50 -0000	1.6
+++ spider.pl.in	30 Aug 2003 05:02:55 -0000	1.7
@@ -641,6 +641,7 @@
     }
 
 
+
     # Check for meta refresh
     # requires that $ua->parse_head() is enabled (the default)
 
@@ -658,7 +659,7 @@
     }
 
 
-    return unless $content;  # $$$ any reason to index empty files?
+#    return unless $content;  # $$$ any reason to index empty files?
     
 
     # make sure content is unique - probably better to chunk into an MD5 object above
@@ -1046,21 +1047,26 @@
             my ( $uri, $server, $response, $content_ref ) = @_;
 
             my $content_type = $response->content_type;
-
             # Ignore text/* content type -- no need to filter
             return 1 if !$content_type || $content_type =~ m!^text/!;
 
-            return unless $filter->filter(
+            my $doc = $filter->convert(
                 document     => $content_ref,
                 name         => $response->base,
                 content_type => $content_type,
             );
 
+            return 1 unless $doc; # so just proceed as if not using filter
+
+            if ( $doc->is_binary ) {  # ignore "binary" files (not text/* mime type)
+                die "Skipping " . $response->base . " due to content type: " . $doc->content_type ." may be binary\n";
+            }
+
             # nicer to use **char...
-            $$content_ref = ${$filter->fetch_doc};
+            $$content_ref = ${$doc->fetch_doc};
 
             # let's see if we can set the parser.
-            $server->{parser_type} = $filter->swish_parser_type || '';
+            $server->{parser_type} = $doc->swish_parser_type || '';
 
             return 1;            
         }
@@ -1086,7 +1092,7 @@
             email           => 'sw...@us...valid',
             link_tags       => [qw/ a frame /],
             keep_alive      => 1,
-            test_url        => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i },
+            test_url        => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png)$/i },
             test_response   => $response_sub,
             filter_content  => $filter_sub,
             validate_links  => $validate,