|
From: <ik...@us...> - 2009-08-26 21:27:23
|
Revision: 35
http://webfetch.svn.sourceforge.net/webfetch/?rev=35&view=rev
Author: ikluft
Date: 2009-08-26 21:27:09 +0000 (Wed, 26 Aug 2009)
Log Message:
-----------
expand get and save functions, clean up
Modified Paths:
--------------
branches/v0.11/lib/WebFetch.pm
Modified: branches/v0.11/lib/WebFetch.pm
===================================================================
--- branches/v0.11/lib/WebFetch.pm 2009-08-25 22:19:47 UTC (rev 34)
+++ branches/v0.11/lib/WebFetch.pm 2009-08-26 21:27:09 UTC (rev 35)
@@ -113,6 +113,7 @@
use Getopt::Long;
use LWP::UserAgent;
use HTTP::Request;
+use Date::Calc;
use Data::Dumper;
# define exceptions/errors
@@ -143,7 +144,7 @@
'WebFetch::Exception::NoSave' => {
isa => 'WebFetch::Exception',
alias => 'throw_no_save',
- description => "unable to save data because of no data or nowhere to save it",
+ description => "unable to save: no data or nowhere to save it",
},
'WebFetch::Exception::NoInputHandler' => {
@@ -183,6 +184,7 @@
);
+# initialize class variables
our $VERSION = '0.12';
our %default_modules = (
"input" => {
@@ -221,8 +223,8 @@
The @capabilities array is any number of strings as needed to list the
capabilities which the module performs for the WebFetch API.
The currently-recognized capabilities are "cmdline", "input" and "output".
-However, the function will save all the capability names that the module
-provides.
+"config" and "storage" are reserved for future use. The function will save
+all the capability names that the module provides.
=cut
@@ -449,6 +451,7 @@
"source_format:s",
"dest=s",
"dest_format:s",
+ "fetch_urls",
"quiet",
"debug",
@mod_options ) };
@@ -459,7 +462,8 @@
."[--group group] [--mode mode] "
."[--source file] [--source_format fmt-string] "
."[--dest file] [--dest_format fmt-string] "
- ."[--quiet] ".join( " ", @mod_usage ));
+ ."[--fetch_urls] [--quiet] "
+ .join( " ", @mod_usage ));
}
# set debugging mode
@@ -470,7 +474,7 @@
# if either source/input or dest/output formats were not provided,
- # check if only one handler is defined - if so that's the one to use
+ # check if only one handler is registered - if so that's the default
if ( !exists $options{source_format}) {
if ( my $fmt = singular_handler( "input" )) {
$options{source_format} = $fmt;
@@ -550,7 +554,7 @@
# go fetch the data
# this function must be provided by a derived module
- if (( ! defined $self->{no_fetch}) or ! $self->{no_fetch}) {
+ if (( ! exists $self->{no_fetch}) or ! $self->{no_fetch}) {
$self->fetch();
}
@@ -657,34 +661,6 @@
my $options_ref = shift;
my $obj;
- #my ( $obj, $dir, $group, $mode,
- # $dest, $dest_format,
- # $quiet, $source, $source_format );
- #my $result = GetOptions (
- # "dir=s" => \$dir,
- # "group:s" => \$group,
- # "mode:s" => \$mode,
- # "dest:s" => \$dest,
- # "dest_format:s" => \$dest_format,
- # "source:s" => \$source,
- # "source_format:s" => \$source_format,
- # "quiet" => \$quiet,
- # "debug" => \$debug,
- # ( eval "defined \@".$run_pkg."::Options" )
- # ? eval "\@".$run_pkg."::Options"
- # : ());
- #if ( ! $result ) {
- # print STDERR "usage: $0 --dir dirpath "
- # ."[--group group] [--mode mode] "
- # ."[--source file] [--source_format fmt-string] "
- # ."[--dest file] [--dest_format fmt-string] "
- # ."[--quiet]\n";
- # if ( eval "defined \$".$run_pkg."::Usage" ) {
- # print STDERR " "
- # .( eval "\$".$run_pkg."::Usage" )."\n";
- # }
- # exit 1;
- #}
$debug and print STDERR "debug: entered run for $run_pkg\n";
# make sure we have the run package loaded
@@ -711,12 +687,15 @@
# then data processing is external to the fetch routine
# (This externalizes the data for other software to capture it.)
$debug and print STDERR "run before output\n";
- my $dest_format = $options_ref->{dest_format};
- if (( defined $obj->{data}) and ( defined $obj->{actions})) {
- if ( defined $obj->{dest}) {
- ( defined $obj->{actions}) or $obj->{actions} = {};
- ( defined $obj->{actions}{$dest_format})
- or $obj->{actions}{$dest_format} = [];
+ my $dest_format = $obj->{dest_format};
+ if ( !exists $obj->{actions}) {
+ $obj->{actions} = {};
+ }
+ if (( exists $obj->{data})) {
+ if ( exists $obj->{dest}) {
+ if ( !exists $obj->{actions}{$dest_format}) {
+ $obj->{actions}{$dest_format} = [];
+ }
push @{$obj->{actions}{$dest_format}}, [ $obj->{dest} ];
}
@@ -728,18 +707,21 @@
$debug and print STDERR "run before save\n";
my $result = $obj->save();
- if ( ! $result ) {
- my $savable;
- foreach $savable ( @{$obj->{savable}}) {
- (ref $savable eq "HASH") or next;
- if ( defined $savable->{error}) {
- throw_save_error( "error saving in "
- .$obj->{dir}
- ."file: ".$savable->{file}
- ."error: " .$savable->{error} );
- }
- }
- }
+
+ # Old WebFetch pre-0.9 API code, should not be needed any more
+ #if ( ! $result ) {
+ # my $savable;
+ # foreach $savable ( @{$obj->{savable}}) {
+ # (ref $savable eq "HASH") or next;
+ # if ( exists $savable->{error}) {
+ # throw_save_error( "error saving in "
+ # .$obj->{dir}
+ # ."file: ".$savable->{file}
+ # ."error: " .$savable->{error} );
+ # }
+ # }
+ #}
+
return $result ? 0 : 1;
}
@@ -779,7 +761,7 @@
a one-liner banner or title text
(plain text, no HTML tags)
-=item source
+=item url
URL or file path (as appropriate) to the news source
@@ -964,7 +946,7 @@
# we *really* need the data and actions to be set!
# otherwise assume we're in WebFetch 0.09 compatibility mode and
# $self->fetch() better have created its own savables already
- if (( !defined $self->{data}) or ( !defined $self->{actions})) {
+ if (( !exists $self->{data}) or ( !exists $self->{actions})) {
return
}
@@ -1155,16 +1137,19 @@
# utility function to get the contents of a URL
sub get
{
- my ( $self ) = @_;
+ my ( $self, $source ) = @_;
+ if ( ! defined $source ) {
+ $source = $self->{source};
+ }
if ( $self->{debug}) {
- print STDERR "debug: get(".$self->{source}.")\n";
+ print STDERR "debug: get(".$source.")\n";
}
# send request, capture response
my $ua = LWP::UserAgent->new;
$ua->agent("WebFetch/$VERSION ".$ua->agent);
- my $request = HTTP::Request->new(GET => $self->{source});
+ my $request = HTTP::Request->new(GET => $source);
my $response = $ua->request($request);
# abort on failure
@@ -1232,17 +1217,47 @@
{
my ( $self, $filename, $content ) = @_;
- if ( !defined $self->{savable}) {
+ if ( !exists $self->{savable}) {
$self->{savable} = [];
}
push ( @{$self->{savable}}, {
'file' => $filename,
'content' => $content,
- (( defined $self->{group}) ? ('group' => $self->{group}) : ()),
- (( defined $self->{mode}) ? ('mode' => $self->{mode}) : ())
+ (( exists $self->{group}) ? ('group' => $self->{group}) : ()),
+ (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ())
});
}
+=item $obj->direct_fetch_savable( $filename, $source )
+
+I<This should be used only in format handler functions.
+See do_actions() for details.>
+
+This adds a task for the save function to fetch a URL and save it
+verbatim in a file. This can be used to download links contained
+in a news feed.
+
+=cut
+
+sub direct_fetch_savable
+{
+ my ( $self, $url ) = @_;
+
+ if ( !exists $self->{savable}) {
+ $self->{savable} = [];
+ }
+ my $filename = $url;
+ $filename =~ s=[;?].*==;
+ $filename =~ s=^.*/==;
+ push ( @{$self->{savable}}, {
+ 'url' => $url,
+ 'file' => $filename,
+ 'index' => 1,
+ (( exists $self->{group}) ? ('group' => $self->{group}) : ()),
+ (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ())
+ });
+}
+
=item $obj->save
This WebFetch utility function goes through all the entries in the
@@ -1284,17 +1299,29 @@
}
# check if we have attributes needed to proceed
- if ( !defined $self->{"dir"}) {
+ if ( !exists $self->{"dir"}) {
die "WebFetch: directory path missing - "
."required for save\n";
}
- if ( !defined $self->{savable}) {
+ if ( !exists $self->{savable}) {
die "WebFetch: nothing to save\n";
}
if ( ref($self->{savable}) ne "ARRAY" ) {
die "WebFetch: cannot save - savable is not an array\n";
}
+ # if fetch_urls is defined, turn link fields in the data to savables
+ if (( exists $self->{fetch_urls}) and $self->{fetch_urls}) {
+ my $url_fnum = $self->wk2fnum( "url" );
+ my $entry;
+ foreach $entry ( @{$self->{data}{records}}) {
+ if ( defined $entry->[$url_fnum]) {
+ $self->direct_fetch_savable(
+ $entry->[$url_fnum]);
+ }
+ }
+ }
+
# loop through "savable" (grouped content and filename destination)
my $savable;
foreach $savable ( @{$self->{savable}}) {
@@ -1304,12 +1331,14 @@
}
# verify contents of savable record
- if ( !defined $savable->{file}) {
+ if ( !exists $savable->{file}) {
$savable->{error} = "missing file name - skipped";
next;
}
- if ( !defined $savable->{content}) {
- $savable->{error} = "missing content text - skipped";
+ if (( !exists $savable->{content})
+ and ( !exists $savable->{url}))
+ {
+ $savable->{error} = "missing content or URL - skipped";
next;
}
@@ -1327,6 +1356,45 @@
}
}
+ # if a URL was provided and index flag is set, use index file
+ my %id_index;
+ my ( $timestamp, $filename );
+ my $was_in_index = 0;
+ if (( exists $savable->{url}) and ( exists $savable->{index}))
+ {
+ require DB_File;
+ tie %id_index, 'DB_File',
+ $self->{dir}."/id_index.db",
+ &DB_File::O_CREAT|&DB_File::O_RDWR, 0640;
+ if ( exists $id_index{$savable->{url}}) {
+ ( $timestamp, $filename ) =
+ split /#/, $id_index{$savable->{url}};
+ $was_in_index = 1;
+ } else {
+ $timestamp = time;
+ $id_index{$savable->{url}} =
+ $timestamp."#".$savable->{file};
+ }
+ untie %id_index ;
+ }
+
+ # For now, we consider it done if the file was in the index.
+ # Future options would be to check if URL was modified.
+ if ( $was_in_index ) {
+ next;
+ }
+
+ # if a URL was provided and no content, get content from URL
+ if (( ! exists $savable->{content})
+ and ( exists $savable->{url}))
+ {
+ $savable->{content} =
+ eval { ${$self->get($savable->{url})} };
+ if ( $@ ) {
+ next;
+ }
+ }
+
# write content to the "new content" file
if ( ! open ( new_content, ">$new_content" )) {
$savable->{error} = "cannot open "
@@ -1366,7 +1434,7 @@
}
# chgrp the "new content" before final installation
- if ( defined $savable->{group}) {
+ if ( exists $savable->{group}) {
my $gid = $savable->{group};
if ( $gid !~ /^[0-9]+$/o ) {
$gid = (getgrnam($gid))[2];
@@ -1387,7 +1455,7 @@
}
# chmod the "new content" before final installation
- if ( defined $savable->{mode}) {
+ if ( exists $savable->{mode}) {
if ( ! chmod oct($savable->{mode}), $new_content ) {
$savable->{error} = "cannot chmod "
.$new_content." to "
@@ -1410,7 +1478,7 @@
# loop through savable to report any errors
my $err_count = 0;
foreach $savable ( @{$self->{savable}}) {
- if ( defined $savable->{error}) {
+ if ( exists $savable->{error}) {
print STDERR "WebFetch: failed to save "
.$savable->{file}.": "
.$savable->{error}."\n";
@@ -1435,7 +1503,7 @@
my ( $self ) = @_;
# check if fname2fnum is already initialized
- if (( defined $self->{fname2fnum})
+ if (( exists $self->{fname2fnum})
and ref $self->{fname2fnum} eq "HASH" )
{
# already done - success
@@ -1443,8 +1511,8 @@
}
# check if prerequisite data exists
- if (( ! defined $self->{data} )
- or ( ! defined $self->{data}{fields}))
+ if (( ! exists $self->{data} )
+ or ( ! exists $self->{data}{fields}))
{
# missing prerequisites - failed
return 0;
@@ -1470,7 +1538,7 @@
$self->init_fname2fnum() or return 0;
# check if wk2fnum is already initialized
- if (( defined $self->{wk2fnum})
+ if (( exists $self->{wk2fnum})
and ref $self->{wk2fnum} eq "HASH" )
{
# already done - success
@@ -1478,7 +1546,7 @@
}
# check for prerequisite data
- if ( ! defined $self->{data}{wk_names}) {
+ if ( ! exists $self->{data}{wk_names}) {
return 0;
}
@@ -1486,7 +1554,7 @@
$self->{wk2fnum} = {};
foreach $wk_key ( keys %{$self->{data}{wk_names}}) {
# perform consistency cross-check between wk_names and fields
- if ( !defined $self->{fname2fnum}{$self->{data}{wk_names}{$wk_key}})
+ if ( !exists $self->{fname2fnum}{$self->{data}{wk_names}{$wk_key}})
{
# wk_names has a bad field name - carp about it!
carp "warning: wk_names contains $wk_key"."->"
@@ -1511,22 +1579,22 @@
$self->init_fname2fnum() or return undef;
# check for prerequisite data
- if (( ! defined $self->{data}{wk_names})
- or ( ! defined $self->{data}{wk_names}{$wk}))
+ if (( ! exists $self->{data}{wk_names})
+ or ( ! exists $self->{data}{wk_names}{$wk}))
{
return undef;
}
# double check that the field exists before pronouncing it OK
# (perform consistency cross-check between wk_names and fields)
- if ( defined $self->{fname2fnum}{$self->{data}{wk_names}{$wk}}) {
+ if ( exists $self->{fname2fnum}{$self->{data}{wk_names}{$wk}}) {
return $self->{data}{wk_names}{$wk};
}
# otherwise, wk_names has a bad field name.
# But init_wk2fnum() may have already carped about it
# so check whether we need to carp about it or not.
- if ( ! defined $self->{wk2fnum}) {
+ if ( ! exists $self->{wk2fnum}) {
carp "warning: wk_names contains $wk"."->"
.$self->{data}{wk_names}{$wk}
." but "
@@ -1542,7 +1610,8 @@
my ( $self, $fname ) = @_;
$self->init_fname2fnum() or return undef;
- return $self->{fname2fnum}{$fname};
+ return ( exists $self->{fname2fnum}{$fname})
+ ? $self->{fname2fnum}{$fname} : undef;
}
# convert well-known name to field number
@@ -1551,14 +1620,15 @@
my ( $self, $wk ) = @_;
$self->init_wk2fnum() or return undef;
- return $self->{wk2fnum}{$wk};
+ return ( exists $self->{wk2fnum}{$wk})
+ ? $self->{wk2fnum}{$wk} : undef;
}
1;
__END__
# remainder of POD docs follow
-=head2 WRITING NEW WebFetch-DERIVED MODULES
+=head2 WRITING WebFetch-DERIVED MODULES
The easiest way to make a new WebFetch-derived module is to start
from the module closest to your fetch operation and modify it.
@@ -1609,17 +1679,21 @@
Please consider contributing any useful changes back to the WebFetch
project at C<ma...@we...>.
-=head1 AUTHOR
+=head1 ACKNOWLEDGEMENTS
WebFetch was written by Ian Kluft
Send patches, bug reports, suggestions and questions to
C<ma...@we...>.
+=head1 LICENSE
+
WebFetch is Open Source software distributed via the
Comprehensive Perl Archive Network (CPAN),
a worldwide network of Perl web mirror sites.
WebFetch may be copied under the same terms and licensing as Perl itelf.
+=head1 SEE ALSO
+
=for html
A current copy of the source code and documentation may be found at
<a href="http://www.webfetch.org/">http://www.webfetch.org/</a>
@@ -1632,27 +1706,26 @@
A current copy of the source code and documentation may be found at
http://www.webfetch.org/
-=head1 SEE ALSO
-
TODO: fill in these lists
=for html
<a href="http://www.perl.org/">perl</a>(1),
<a href="WebFetch::Input::PerlStruct.html">WebFetch::Input::PerlStruct</a>,
<a href="WebFetch::Input::SiteNews.html">WebFetch::Input::SiteNews</a>,
+<a href="WebFetch::Input::Atom.html">WebFetch::Input::Atom</a>,
<a href="WebFetch::Input::RSS.html">WebFetch::Input::RSS</a>,
<a href="WebFetch::Input::Dump.html">WebFetch::Input::Dump</a>,
-<a href="WebFetch::Output::RSS.html">WebFetch::Output::RSS</a>,
+<a href="WebFetch::Output::TT.html">WebFetch::Output::TT</a>,
<a href="WebFetch::Output::Dump.html">WebFetch::Output::Dump</a>,
=for text
perl(1), WebFetch::Input::PerlStruct, WebFetch::Input::SiteNews,
-WebFetch::Input::RSS, WebFetch::Input::Dump,
-WebFetch::Output::RSS, WebFetch::Output::Dump
+WebFetch::Input::Atom, WebFetch::Input::RSS, WebFetch::Input::Dump,
+WebFetch::Output::TT, WebFetch::Output::Dump
=for man
perl(1), WebFetch::Input::PerlStruct, WebFetch::Input::SiteNews,
-WebFetch::Input::RSS, WebFetch::Input::Dump,
-WebFetch::Output::RSS, WebFetch::Output::Dump
+WebFetch::Input::Atom, WebFetch::Input::RSS, WebFetch::Input::Dump,
+WebFetch::Output::TT, WebFetch::Output::Dump
=cut
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|