From: <ik...@us...> - 2009-08-26 21:27:23
|
Revision: 35 http://webfetch.svn.sourceforge.net/webfetch/?rev=35&view=rev Author: ikluft Date: 2009-08-26 21:27:09 +0000 (Wed, 26 Aug 2009) Log Message: ----------- expand get and save functions, clean up Modified Paths: -------------- branches/v0.11/lib/WebFetch.pm Modified: branches/v0.11/lib/WebFetch.pm =================================================================== --- branches/v0.11/lib/WebFetch.pm 2009-08-25 22:19:47 UTC (rev 34) +++ branches/v0.11/lib/WebFetch.pm 2009-08-26 21:27:09 UTC (rev 35) @@ -113,6 +113,7 @@ use Getopt::Long; use LWP::UserAgent; use HTTP::Request; +use Date::Calc; use Data::Dumper; # define exceptions/errors @@ -143,7 +144,7 @@ 'WebFetch::Exception::NoSave' => { isa => 'WebFetch::Exception', alias => 'throw_no_save', - description => "unable to save data because of no data or nowhere to save it", + description => "unable to save: no data or nowhere to save it", }, 'WebFetch::Exception::NoInputHandler' => { @@ -183,6 +184,7 @@ ); +# initialize class variables our $VERSION = '0.12'; our %default_modules = ( "input" => { @@ -221,8 +223,8 @@ The @capabilities array is any number of strings as needed to list the capabilities which the module performs for the WebFetch API. The currently-recognized capabilities are "cmdline", "input" and "output". -However, the function will save all the capability names that the module -provides. +"config" and "storage" are reserved for future use. The function will save +all the capability names that the module provides. =cut @@ -449,6 +451,7 @@ "source_format:s", "dest=s", "dest_format:s", + "fetch_urls", "quiet", "debug", @mod_options ) }; @@ -459,7 +462,8 @@ ."[--group group] [--mode mode] " ."[--source file] [--source_format fmt-string] " ."[--dest file] [--dest_format fmt-string] " - ."[--quiet] ".join( " ", @mod_usage )); + ."[--fetch_urls] [--quiet] " + .join( " ", @mod_usage )); } # set debugging mode @@ -470,7 +474,7 @@ # if either source/input or dest/output formats were not provided, - # check if only one handler is defined - if so that's the one to use + # check if only one handler is registered - if so that's the default if ( !exists $options{source_format}) { if ( my $fmt = singular_handler( "input" )) { $options{source_format} = $fmt; @@ -550,7 +554,7 @@ # go fetch the data # this function must be provided by a derived module - if (( ! defined $self->{no_fetch}) or ! $self->{no_fetch}) { + if (( ! exists $self->{no_fetch}) or ! $self->{no_fetch}) { $self->fetch(); } @@ -657,34 +661,6 @@ my $options_ref = shift; my $obj; - #my ( $obj, $dir, $group, $mode, - # $dest, $dest_format, - # $quiet, $source, $source_format ); - #my $result = GetOptions ( - # "dir=s" => \$dir, - # "group:s" => \$group, - # "mode:s" => \$mode, - # "dest:s" => \$dest, - # "dest_format:s" => \$dest_format, - # "source:s" => \$source, - # "source_format:s" => \$source_format, - # "quiet" => \$quiet, - # "debug" => \$debug, - # ( eval "defined \@".$run_pkg."::Options" ) - # ? eval "\@".$run_pkg."::Options" - # : ()); - #if ( ! $result ) { - # print STDERR "usage: $0 --dir dirpath " - # ."[--group group] [--mode mode] " - # ."[--source file] [--source_format fmt-string] " - # ."[--dest file] [--dest_format fmt-string] " - # ."[--quiet]\n"; - # if ( eval "defined \$".$run_pkg."::Usage" ) { - # print STDERR " " - # .( eval "\$".$run_pkg."::Usage" )."\n"; - # } - # exit 1; - #} $debug and print STDERR "debug: entered run for $run_pkg\n"; # make sure we have the run package loaded @@ -711,12 +687,15 @@ # then data processing is external to the fetch routine # (This externalizes the data for other software to capture it.) $debug and print STDERR "run before output\n"; - my $dest_format = $options_ref->{dest_format}; - if (( defined $obj->{data}) and ( defined $obj->{actions})) { - if ( defined $obj->{dest}) { - ( defined $obj->{actions}) or $obj->{actions} = {}; - ( defined $obj->{actions}{$dest_format}) - or $obj->{actions}{$dest_format} = []; + my $dest_format = $obj->{dest_format}; + if ( !exists $obj->{actions}) { + $obj->{actions} = {}; + } + if (( exists $obj->{data})) { + if ( exists $obj->{dest}) { + if ( !exists $obj->{actions}{$dest_format}) { + $obj->{actions}{$dest_format} = []; + } push @{$obj->{actions}{$dest_format}}, [ $obj->{dest} ]; } @@ -728,18 +707,21 @@ $debug and print STDERR "run before save\n"; my $result = $obj->save(); - if ( ! $result ) { - my $savable; - foreach $savable ( @{$obj->{savable}}) { - (ref $savable eq "HASH") or next; - if ( defined $savable->{error}) { - throw_save_error( "error saving in " - .$obj->{dir} - ."file: ".$savable->{file} - ."error: " .$savable->{error} ); - } - } - } + + # Old WebFetch pre-0.9 API code, should not be needed any more + #if ( ! $result ) { + # my $savable; + # foreach $savable ( @{$obj->{savable}}) { + # (ref $savable eq "HASH") or next; + # if ( exists $savable->{error}) { + # throw_save_error( "error saving in " + # .$obj->{dir} + # ."file: ".$savable->{file} + # ."error: " .$savable->{error} ); + # } + # } + #} + return $result ? 0 : 1; } @@ -779,7 +761,7 @@ a one-liner banner or title text (plain text, no HTML tags) -=item source +=item url URL or file path (as appropriate) to the news source @@ -964,7 +946,7 @@ # we *really* need the data and actions to be set! # otherwise assume we're in WebFetch 0.09 compatibility mode and # $self->fetch() better have created its own savables already - if (( !defined $self->{data}) or ( !defined $self->{actions})) { + if (( !exists $self->{data}) or ( !exists $self->{actions})) { return } @@ -1155,16 +1137,19 @@ # utility function to get the contents of a URL sub get { - my ( $self ) = @_; + my ( $self, $source ) = @_; + if ( ! defined $source ) { + $source = $self->{source}; + } if ( $self->{debug}) { - print STDERR "debug: get(".$self->{source}.")\n"; + print STDERR "debug: get(".$source.")\n"; } # send request, capture response my $ua = LWP::UserAgent->new; $ua->agent("WebFetch/$VERSION ".$ua->agent); - my $request = HTTP::Request->new(GET => $self->{source}); + my $request = HTTP::Request->new(GET => $source); my $response = $ua->request($request); # abort on failure @@ -1232,17 +1217,47 @@ { my ( $self, $filename, $content ) = @_; - if ( !defined $self->{savable}) { + if ( !exists $self->{savable}) { $self->{savable} = []; } push ( @{$self->{savable}}, { 'file' => $filename, 'content' => $content, - (( defined $self->{group}) ? ('group' => $self->{group}) : ()), - (( defined $self->{mode}) ? ('mode' => $self->{mode}) : ()) + (( exists $self->{group}) ? ('group' => $self->{group}) : ()), + (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ()) }); } +=item $obj->direct_fetch_savable( $filename, $source ) + +I<This should be used only in format handler functions. +See do_actions() for details.> + +This adds a task for the save function to fetch a URL and save it +verbatim in a file. This can be used to download links contained +in a news feed. + +=cut + +sub direct_fetch_savable +{ + my ( $self, $url ) = @_; + + if ( !exists $self->{savable}) { + $self->{savable} = []; + } + my $filename = $url; + $filename =~ s=[;?].*==; + $filename =~ s=^.*/==; + push ( @{$self->{savable}}, { + 'url' => $url, + 'file' => $filename, + 'index' => 1, + (( exists $self->{group}) ? ('group' => $self->{group}) : ()), + (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ()) + }); +} + =item $obj->save This WebFetch utility function goes through all the entries in the @@ -1284,17 +1299,29 @@ } # check if we have attributes needed to proceed - if ( !defined $self->{"dir"}) { + if ( !exists $self->{"dir"}) { die "WebFetch: directory path missing - " ."required for save\n"; } - if ( !defined $self->{savable}) { + if ( !exists $self->{savable}) { die "WebFetch: nothing to save\n"; } if ( ref($self->{savable}) ne "ARRAY" ) { die "WebFetch: cannot save - savable is not an array\n"; } + # if fetch_urls is defined, turn link fields in the data to savables + if (( exists $self->{fetch_urls}) and $self->{fetch_urls}) { + my $url_fnum = $self->wk2fnum( "url" ); + my $entry; + foreach $entry ( @{$self->{data}{records}}) { + if ( defined $entry->[$url_fnum]) { + $self->direct_fetch_savable( + $entry->[$url_fnum]); + } + } + } + # loop through "savable" (grouped content and filename destination) my $savable; foreach $savable ( @{$self->{savable}}) { @@ -1304,12 +1331,14 @@ } # verify contents of savable record - if ( !defined $savable->{file}) { + if ( !exists $savable->{file}) { $savable->{error} = "missing file name - skipped"; next; } - if ( !defined $savable->{content}) { - $savable->{error} = "missing content text - skipped"; + if (( !exists $savable->{content}) + and ( !exists $savable->{url})) + { + $savable->{error} = "missing content or URL - skipped"; next; } @@ -1327,6 +1356,45 @@ } } + # if a URL was provided and index flag is set, use index file + my %id_index; + my ( $timestamp, $filename ); + my $was_in_index = 0; + if (( exists $savable->{url}) and ( exists $savable->{index})) + { + require DB_File; + tie %id_index, 'DB_File', + $self->{dir}."/id_index.db", + &DB_File::O_CREAT|&DB_File::O_RDWR, 0640; + if ( exists $id_index{$savable->{url}}) { + ( $timestamp, $filename ) = + split /#/, $id_index{$savable->{url}}; + $was_in_index = 1; + } else { + $timestamp = time; + $id_index{$savable->{url}} = + $timestamp."#".$savable->{file}; + } + untie %id_index ; + } + + # For now, we consider it done if the file was in the index. + # Future options would be to check if URL was modified. + if ( $was_in_index ) { + next; + } + + # if a URL was provided and no content, get content from URL + if (( ! exists $savable->{content}) + and ( exists $savable->{url})) + { + $savable->{content} = + eval { ${$self->get($savable->{url})} }; + if ( $@ ) { + next; + } + } + # write content to the "new content" file if ( ! open ( new_content, ">$new_content" )) { $savable->{error} = "cannot open " @@ -1366,7 +1434,7 @@ } # chgrp the "new content" before final installation - if ( defined $savable->{group}) { + if ( exists $savable->{group}) { my $gid = $savable->{group}; if ( $gid !~ /^[0-9]+$/o ) { $gid = (getgrnam($gid))[2]; @@ -1387,7 +1455,7 @@ } # chmod the "new content" before final installation - if ( defined $savable->{mode}) { + if ( exists $savable->{mode}) { if ( ! chmod oct($savable->{mode}), $new_content ) { $savable->{error} = "cannot chmod " .$new_content." to " @@ -1410,7 +1478,7 @@ # loop through savable to report any errors my $err_count = 0; foreach $savable ( @{$self->{savable}}) { - if ( defined $savable->{error}) { + if ( exists $savable->{error}) { print STDERR "WebFetch: failed to save " .$savable->{file}.": " .$savable->{error}."\n"; @@ -1435,7 +1503,7 @@ my ( $self ) = @_; # check if fname2fnum is already initialized - if (( defined $self->{fname2fnum}) + if (( exists $self->{fname2fnum}) and ref $self->{fname2fnum} eq "HASH" ) { # already done - success @@ -1443,8 +1511,8 @@ } # check if prerequisite data exists - if (( ! defined $self->{data} ) - or ( ! defined $self->{data}{fields})) + if (( ! exists $self->{data} ) + or ( ! exists $self->{data}{fields})) { # missing prerequisites - failed return 0; @@ -1470,7 +1538,7 @@ $self->init_fname2fnum() or return 0; # check if wk2fnum is already initialized - if (( defined $self->{wk2fnum}) + if (( exists $self->{wk2fnum}) and ref $self->{wk2fnum} eq "HASH" ) { # already done - success @@ -1478,7 +1546,7 @@ } # check for prerequisite data - if ( ! defined $self->{data}{wk_names}) { + if ( ! exists $self->{data}{wk_names}) { return 0; } @@ -1486,7 +1554,7 @@ $self->{wk2fnum} = {}; foreach $wk_key ( keys %{$self->{data}{wk_names}}) { # perform consistency cross-check between wk_names and fields - if ( !defined $self->{fname2fnum}{$self->{data}{wk_names}{$wk_key}}) + if ( !exists $self->{fname2fnum}{$self->{data}{wk_names}{$wk_key}}) { # wk_names has a bad field name - carp about it! carp "warning: wk_names contains $wk_key"."->" @@ -1511,22 +1579,22 @@ $self->init_fname2fnum() or return undef; # check for prerequisite data - if (( ! defined $self->{data}{wk_names}) - or ( ! defined $self->{data}{wk_names}{$wk})) + if (( ! exists $self->{data}{wk_names}) + or ( ! exists $self->{data}{wk_names}{$wk})) { return undef; } # double check that the field exists before pronouncing it OK # (perform consistency cross-check between wk_names and fields) - if ( defined $self->{fname2fnum}{$self->{data}{wk_names}{$wk}}) { + if ( exists $self->{fname2fnum}{$self->{data}{wk_names}{$wk}}) { return $self->{data}{wk_names}{$wk}; } # otherwise, wk_names has a bad field name. # But init_wk2fnum() may have already carped about it # so check whether we need to carp about it or not. - if ( ! defined $self->{wk2fnum}) { + if ( ! exists $self->{wk2fnum}) { carp "warning: wk_names contains $wk"."->" .$self->{data}{wk_names}{$wk} ." but " @@ -1542,7 +1610,8 @@ my ( $self, $fname ) = @_; $self->init_fname2fnum() or return undef; - return $self->{fname2fnum}{$fname}; + return ( exists $self->{fname2fnum}{$fname}) + ? $self->{fname2fnum}{$fname} : undef; } # convert well-known name to field number @@ -1551,14 +1620,15 @@ my ( $self, $wk ) = @_; $self->init_wk2fnum() or return undef; - return $self->{wk2fnum}{$wk}; + return ( exists $self->{wk2fnum}{$wk}) + ? $self->{wk2fnum}{$wk} : undef; } 1; __END__ # remainder of POD docs follow -=head2 WRITING NEW WebFetch-DERIVED MODULES +=head2 WRITING WebFetch-DERIVED MODULES The easiest way to make a new WebFetch-derived module is to start from the module closest to your fetch operation and modify it. @@ -1609,17 +1679,21 @@ Please consider contributing any useful changes back to the WebFetch project at C<ma...@we...>. -=head1 AUTHOR +=head1 ACKNOWLEDGEMENTS WebFetch was written by Ian Kluft Send patches, bug reports, suggestions and questions to C<ma...@we...>. +=head1 LICENSE + WebFetch is Open Source software distributed via the Comprehensive Perl Archive Network (CPAN), a worldwide network of Perl web mirror sites. WebFetch may be copied under the same terms and licensing as Perl itelf. +=head1 SEE ALSO + =for html A current copy of the source code and documentation may be found at <a href="http://www.webfetch.org/">http://www.webfetch.org/</a> @@ -1632,27 +1706,26 @@ A current copy of the source code and documentation may be found at http://www.webfetch.org/ -=head1 SEE ALSO - TODO: fill in these lists =for html <a href="http://www.perl.org/">perl</a>(1), <a href="WebFetch::Input::PerlStruct.html">WebFetch::Input::PerlStruct</a>, <a href="WebFetch::Input::SiteNews.html">WebFetch::Input::SiteNews</a>, +<a href="WebFetch::Input::Atom.html">WebFetch::Input::Atom</a>, <a href="WebFetch::Input::RSS.html">WebFetch::Input::RSS</a>, <a href="WebFetch::Input::Dump.html">WebFetch::Input::Dump</a>, -<a href="WebFetch::Output::RSS.html">WebFetch::Output::RSS</a>, +<a href="WebFetch::Output::TT.html">WebFetch::Output::TT</a>, <a href="WebFetch::Output::Dump.html">WebFetch::Output::Dump</a>, =for text perl(1), WebFetch::Input::PerlStruct, WebFetch::Input::SiteNews, -WebFetch::Input::RSS, WebFetch::Input::Dump, -WebFetch::Output::RSS, WebFetch::Output::Dump +WebFetch::Input::Atom, WebFetch::Input::RSS, WebFetch::Input::Dump, +WebFetch::Output::TT, WebFetch::Output::Dump =for man perl(1), WebFetch::Input::PerlStruct, WebFetch::Input::SiteNews, -WebFetch::Input::RSS, WebFetch::Input::Dump, -WebFetch::Output::RSS, WebFetch::Output::Dump +WebFetch::Input::Atom, WebFetch::Input::RSS, WebFetch::Input::Dump, +WebFetch::Output::TT, WebFetch::Output::Dump =cut This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |