From: Bill M. <whm...@us...> - 2003-08-29 13:47:37
|
Update of /cvsroot/swishe/swish-e/filters In directory sc8-pr-cvs1:/tmp/cvs-serv12796 Modified Files: README swish-filter-test.in Log Message: suppressed warning in XLtoHTML, and updated swish-filter-test to work with OO interface of SWISH::Filter Argh -- A reason not to build in the source directory -- I edited swish-filter-test instead of swish-filter-test.in Index: README =================================================================== RCS file: /cvsroot/swishe/swish-e/filters/README,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- README 13 May 2003 06:11:33 -0000 1.4 +++ README 29 Aug 2003 13:47:02 -0000 1.5 @@ -40,14 +40,37 @@ Documentation for SWISH::Filter can also be found in the html directory and at http://swish-e.org. -When first testing you may also wish to set - - export FILTER_DEBUG=1 - Swish-e has another filter system. The FileFilter directive can be used to filter documents through an external program while indexing. That system requires a separate filter setup for each type of document. See the SWISH-CONFIG page for information on that type of filtering. + + +Testing SWISH::Filter +--------------------- + +The program swish-filter-test is installed by default (in the same location as +the swish-e binary). This program can be used to test SWISH::Filter. For example, +run the command: + + $ swish-filter-test foo.pdf foo.txt + + Document foo.pdf was filtered. + Document: foo.pdf + Content-Type: text/html (initial was application/pdf) + Parser type: HTML* + + Document foo.txt was not filtered. + Document: foo.txt + Content-Type: text/plain (initial was text/plain) + Parser type: TXT* + +Run the command + + $ swish-filter-test -man + +for documentation. 
+ Current filters distributed with Swish-e: ----------------------------------------- Index: swish-filter-test.in =================================================================== RCS file: /cvsroot/swishe/swish-e/filters/swish-filter-test.in,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- swish-filter-test.in 24 Aug 2003 00:52:42 -0000 1.1 +++ swish-filter-test.in 29 Aug 2003 13:47:03 -0000 1.2 @@ -1,4 +1,4 @@ -#!@@perlbinary@@ -w +#!/@@perlbinary@@ -w use strict; # This is set to where Swish-e's "make install" installed the helper modules. @@ -6,7 +6,7 @@ ################################################################################### # -# swish.cgi $Revision$ Copyright (C) 2001 Bill Moseley swi...@ha... +# Copyright (C) 2001 Bill Moseley swi...@ha... # Program to test the SWISH::Filter module # # This program is free software; you can redistribute it and/or @@ -34,7 +34,7 @@ use constant DEBUG => 1; use constant INFO => 2; -my ( $verbose, $show_content, @file, @url, $help, $man, $quiet, $headers); +my ( $verbose, $show_content, @file, @url, $help, $man, $quiet, $headers, $path, $depreciated); my $skip_binary = 1; @@ -51,11 +51,18 @@ 'man' => \$man, 'headers' => \$headers, 'skip_binary!' => \$skip_binary, + 'path' => \$path, + 'depreciated'=> \$depreciated, ) || pod2usage(2); pod2usage( -verbose => 1 ) if $help; pod2usage( -verbose => 2 ) if $man; +if ( $path ) { + print "/home/moseley/swish_test/lib/swish-e/perl\n"; + exit; +} + pod2usage( -verbose => 0, @@ -75,7 +82,7 @@ my $return = 0; for my $doc ( @ARGV ) { - eval { process_doc( $doc ) }; + eval { $depreciated ? process_doc_old( $doc ) : process_doc( $doc ) }; $return = 1 if $@; warn "** $0:\n $@\n" if $@; # always warn on die } @@ -91,6 +98,69 @@ my %config = !$@ && $uri->scheme ? 
fetch_url( $file ) : fetch_file( $file); + my $doc = $filter->convert( + %config, + name => $file, + ); + + die "Failed to process document [$file]\n" unless $doc; + + my $content_type = $doc->content_type || "unknown"; + my $parser_type = $doc->swish_parser_type || ''; + + my $binary = $doc->is_binary; + + my $msg = $doc->was_filtered ? '' : 'not'; + +my $name = $doc->name; + + + msg(DEBUG, <<EOF ); + +Document $file was $msg filtered. + Document: $file ($name) + Content-Type: $content_type + Parser type: $parser_type +EOF + + if ( my $filters_used = $doc->filters_used ) { + for my $filter ( @$filters_used ) { + msg( DEBUG, " >Filter used: $filter->{name} ( $filter->{start_content_type} -> $filter->{end_content_type} )" ); + } + } + + + if ( !$binary ) { + my @doc = split /\n/, substr( ${$doc->fetch_doc}, 0, $max_chars ); + $lines = @doc-1 if $lines >= @doc; + msg(INFO, join "\n", '-- Output Content Sample --', @doc[0..$lines],'','-- end --','' ); + } + + + die "Skipping binary [$file]\n" if $binary && $skip_binary; + + if ($headers ) { + my $len = length ${$doc->fetch_doc}; + + print "Path-Name: $file\nContent-Length: $len\n"; + print "Document-Type: $parser_type\n" if $parser_type; + print "\n"; + } + + + print ${$doc->fetch_doc} if $show_content; +} + + +sub process_doc_old { + my ($file) = @_; + + + my $uri; + eval { $uri = URI->new( $file ) }; + my %config = !$@ && $uri->scheme ? 
fetch_url( $file ) : fetch_file( $file); + + + my $was_filtered = $filter->filter( %config, name => $file, @@ -196,6 +266,7 @@ -(no)skip_binary skip output of binary files (default) -lines <num> Number of lines of content to display to stderr if verbose -headers output with headers for swish-e -S prog method + -path output @INC path to SWISH::Filter module -help brief help message -man full documentation @@ -261,7 +332,15 @@ swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1 - +=item B<-path> + +Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB. +Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with +swishspider. + +For example: + + PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost =item B<-help> |