|
From: <ik...@us...> - 2009-09-05 09:35:19
|
Revision: 45
http://webfetch.svn.sourceforge.net/webfetch/?rev=45&view=rev
Author: ikluft
Date: 2009-09-05 09:35:07 +0000 (Sat, 05 Sep 2009)
Log Message:
-----------
expand docs, expand error reporting, streamline debugging, add AUTOLOAD
Modified Paths:
--------------
branches/v0.13/lib/WebFetch.pm
Modified: branches/v0.13/lib/WebFetch.pm
===================================================================
--- branches/v0.13/lib/WebFetch.pm 2009-08-29 00:38:38 UTC (rev 44)
+++ branches/v0.13/lib/WebFetch.pm 2009-09-05 09:35:07 UTC (rev 45)
@@ -19,15 +19,28 @@
=head1 DESCRIPTION
-The WebFetch module is a general framework for downloading and saving
-information from the web, and for display on the web.
-It requires another module to inherit it and fill in the specifics of
-what and how to download.
-WebFetch provides a generalized interface for saving to a file
+The WebFetch module is a framework for downloading information
+from the web, and for saving or re-displaying it.
+It provides a generalized interface for saving to a file
while keeping the previous version as a backup.
-This is expected to be used for periodically-updated information
-which is run as a cron job.
+This is mainly intended for use in a cron-job to acquire
+periodically-updated information.
+WebFetch allows the user to specify a source and destination, and
+the input and output formats. It is possible to write new Perl modules
+to the WebFetch API in order to add more input and output formats.
+
+The currently-provided input formats are Atom, RSS, WebFetch "SiteNews" files
+and raw Perl data structures.
+
+The currently-provided output formats are RSS, WebFetch "SiteNews" files,
+the Perl Template Toolkit, and export into a TWiki site.
+
+Some modules which were specific to pre-RSS/Atom web syndication formats
+have been deprecated. They can still be found in the CPAN archive
+in WebFetch 0.10, but they are not compatible with the current
+WebFetch API.
+
=head1 INSTALLATION
After unpacking the module sources from the tar file, run
@@ -63,41 +76,19 @@
=head2 SETTING UP CRONTAB ENTRIES
-First of all, if you don't have crontab access or don't know what they are,
-contact your site's system administrator(s). Only local help will do any
-good on local-configuration issues. No one on the Internet can help.
-(If you are the administrator for your system, see the crontab(1) and
-crontab(5) manpages and nearly any book on Unix system administration.)
+If needed, see the manual pages for crontab(1), crontab(5) and any
+web sites or books on Unix system administration.
-Since the WebFetch command lines are usually very long, you may prefer
-to make one or more scripts as front-ends so your crontab entries aren't
-so huge.
+Since WebFetch command lines are usually very long, the user may prefer
+to make one or more scripts as front-ends so crontab entries aren't so big.
-Do not run the crontab entries too often - be a good net.citizen and
-do your updates no more often than necessary.
-Popular sites need their users to refrain from making automated
-requests too often because they add up on an enormous scale
-on the Internet.
-Some sites such as Freshmeat prefer no shorter than hourly intervals.
-Slashdot prefers no shorter than half-hourly intervals.
-When in doubt, ask the site maintainers what they prefer.
+Try not to run crontab entries too often - find out whether the site you're
+accessing has any resource constraints, and how often its information
+gets updated. If the site asks users not to access a feed more often
+than a certain interval, respect it. (It isn't hard to find violators
+in server logs.) If in doubt, try every 30 minutes until more information
+becomes available.
-(Then again, there are a very few sites like Yahoo and CNN who don't
-mind getting the extra hits if you're going to create links to them.
-Even so, more often than every 20 minutes would still be excessive
-to the biggest web sites.)
-
-=head2 SETTING UP SERVER-SIDE INCLUDES
-
-See the manual for your web server to make sure you have server-side include
-(SSI) enabled for the files that need it.
-(It's wasteful to enable it for all your files so be careful.)
-
-When using Apache HTTPD,
-a line like this will include a WebFetch-generated file:
-
-<!--#include file="fetch/slashdot.html"-->
-
=head1 WebFetch FUNCTIONS
The following function definitions assume B<C<$obj>> is a blessed
@@ -147,10 +138,10 @@
description => "unable to save: no data or nowhere to save it",
},
- 'WebFetch::Exception::NoInputHandler' => {
+ 'WebFetch::Exception::NoHandler' => {
isa => 'WebFetch::Exception',
- alias => 'throw_no_input_handler',
- description => "no input handler was found",
+ alias => 'throw_no_handler',
+ description => "no handler was found",
},
'WebFetch::Exception::MustOverride' => {
@@ -182,10 +173,16 @@
description => "no module was found to run the request",
},
+ 'WebFetch::Exception::AutoRunFailure' => {
+ isa => 'WebFetch::TracedException',
+ alias => 'throw_autoload_fail',
+ description => "AUTORUN failed to handle function call",
+ },
+
);
# initialize class variables
-our $VERSION = '0.12';
+our $VERSION = '0.13-pre29';
our %default_modules = (
"input" => {
"rss" => "WebFetch::Input::RSS",
@@ -203,12 +200,14 @@
}
);
our %modules;
+our $AUTOLOAD;
my $debug;
-=item import( "param-name" => "value", ... )
+sub debug
+{
+ $debug and print STDERR "debug: ".join( " ", @_ )."\n";
+}
-=cut
-
=item WebFetch::module_register( $module, @capabilities );
This function allows a Perl module to register itself with the WebFetch API
@@ -223,9 +222,15 @@
The @capabilities array is any number of strings as needed to list the
capabilities which the module performs for the WebFetch API.
The currently-recognized capabilities are "cmdline", "input" and "output".
-"config" and "storage" are reserved for future use. The function will save
-all the capability names that the module provides.
+"config", "filter", "save" and "storage" are reserved for future use. The
+function will save all the capability names that the module provides, without
+checking whether any code will use them.
+For example, the WebFetch::Output::TT module registers itself like this:
+ C<< __PACKAGE__->module_register( "cmdline", "output:tt" ); >>
+meaning that it defines additional command-line options, and it provides an
+output format handler for the "tt" format, the Perl Template Toolkit.
+
=cut
sub module_register
@@ -258,10 +263,6 @@
}
}
-# satisfy POD coverage test - but don't put this function in the user manual
-=pod
-=cut
-
# module selection - choose WebFetch module based on selected file format
# for WebFetch internal use only
sub module_select
@@ -269,8 +270,7 @@
my $capability = shift;
my $is_optional = shift;
- $debug and print STDERR "debug: "
- ."module_select($capability,$is_optional)\n";
+ debug "module_select($capability,$is_optional)";
# parse the capability string
my ( $group, $topic );
if ( $capability =~ /([^:]*):(.*)/ ) {
@@ -320,13 +320,12 @@
}
}
- # check if any handlers were found for this input format
+ # check if any handlers were found for this format
if ( ! @handlers and ! $is_optional ) {
- throw_no_input_handler( "handler not found for $capability" );
+ throw_no_handler( "handler not found for $capability" );
}
- $debug and print STDERR "debug: module_select: "
- .join( " ", @handlers )."\n";
+ debug "module_select: ".join( " ", @handlers );
return @handlers;
}
@@ -341,7 +340,7 @@
{
my $group = shift;
- $debug and print STDERR "debug: singular_handler($group)\n";
+ debug "singular_handler($group)";
my $count = 0;
my ( $entry, $last );
foreach $entry ( keys %{$modules{$group}} ) {
@@ -358,8 +357,7 @@
}
# if there's only one registered, that's the one to use
- $debug and print STDERR "debug: singular_handler: "
- ."count=$count last=$last\n";
+ debug "singular_handler: count=$count last=$last";
return $count == 1 ? $last : undef;
}
@@ -432,11 +430,13 @@
and ( ref $modules{cmdline} eq "ARRAY" ))
{
foreach $cli_mod ( @{$modules{cmdline}}) {
- if ( defined @cli_mod::Options ) {
- push @mod_options, @cli_mod::Options;
+ if ( eval "defined \@{".$cli_mod."::Options}" ) {
+ eval "push \@mod_options,"
+ ."\@{".$cli_mod."::Options}";
}
- if ( defined @cli_mod::Usage ) {
- push @mod_options, @cli_mod::Usage;
+ if ( eval "defined \@{".$cli_mod."::Usage}" ) {
+ eval "push \@mod_options, \@{"
+ .$cli_mod."::Usage}";
}
}
}
@@ -470,7 +470,7 @@
if (( exists $options{debug}) and $options{debug}) {
$debug = 1;
}
- $debug and print STDERR "debug: fetch_main\n";
+ debug "fetch_main";
# if either source/input or dest/output formats were not provided,
@@ -511,7 +511,7 @@
# check if any handlers were found for this input format
if ( ! @handlers ) {
- throw_no_input_handler( "input handler not found for "
+ throw_no_handler( "input handler not found for "
.$options{source_format});
}
@@ -519,7 +519,7 @@
my $pkgname;
my $run_count = 0;
foreach $pkgname ( @handlers ) {
- $debug and print STDERR "debug: running for $pkgname\n";
+ debug "running for $pkgname";
eval { &WebFetch::run( $pkgname, \%options )};
if ( $@ ) {
print STDERR "WebFetch: run eval error: $@\n";
@@ -661,7 +661,7 @@
my $options_ref = shift;
my $obj;
- $debug and print STDERR "debug: entered run for $run_pkg\n";
+ debug "entered run for $run_pkg";
# make sure we have the run package loaded
mod_load $run_pkg;
@@ -677,7 +677,7 @@
# create the new object
# this also calls the $obj->fetch() routine for the module which
# has inherited from WebFetch to do this
- $debug and print STDERR "debug: run before new\n";
+ debug "run before new";
$obj = eval $run_pkg."->new( \%\$options_ref )";
if ( $@ ) {
throw_mod_run_failure( "module run failure: ".$@ );
@@ -686,7 +686,7 @@
# if the object had data for the WebFetch-embedding API,
# then data processing is external to the fetch routine
# (This externalizes the data for other software to capture it.)
- $debug and print STDERR "run before output\n";
+ debug "run before output";
my $dest_format = $obj->{dest_format};
if ( !exists $obj->{actions}) {
$obj->{actions} = {};
@@ -705,22 +705,26 @@
throw_no_save( "save failed: no data or nowhere to save it" );
}
- $debug and print STDERR "run before save\n";
+ debug "run before save";
my $result = $obj->save();
- # Old WebFetch pre-0.9 API code, should not be needed any more
- #if ( ! $result ) {
- # my $savable;
- # foreach $savable ( @{$obj->{savable}}) {
- # (ref $savable eq "HASH") or next;
- # if ( exists $savable->{error}) {
- # throw_save_error( "error saving in "
- # .$obj->{dir}
- # ."file: ".$savable->{file}
- # ."error: " .$savable->{error} );
- # }
- # }
- #}
+ # check for errors, throw exception to report errors per savable item
+ if ( ! $result ) {
+ my $savable;
+ my @errors;
+ foreach $savable ( @{$obj->{savable}}) {
+ (ref $savable eq "HASH") or next;
+ if ( exists $savable->{error}) {
+ push @errors, "file: ".$savable->{file}
+ ."error: " .$savable->{error};
+ }
+ }
+ if ( @errors ) {
+ throw_save_error( "error saving results in "
+ .$obj->{dir}
+ ."\n".join( "\n", @errors )."\n" );
+ }
+ }
return $result ? 0 : 1;
}
@@ -765,6 +769,10 @@
URL or file path (as appropriate) to the news source
+=item id
+
+unique identifier string for the entry
+
=item date
a date stamp,
@@ -942,6 +950,7 @@
sub do_actions
{
my ( $self ) = @_;
+ debug "in WebFetch::do_actions";
# we *really* need the data and actions to be set!
# otherwise assume we're in WebFetch 0.09 compatibility mode and
@@ -962,7 +971,6 @@
if ( exists $modules{output}{$action_spec}) {
my $class;
foreach $class ( @{$modules{output}{$action_spec}}) {
- print STDERR "can test on $class\n";
if ( $class->can( $action_handler )) {
$handler_ref = \&{$class."::".$action_handler};
last;
@@ -1258,6 +1266,24 @@
});
}
+=item $obj->no_savables_ok
+
+This can be used by an output function which handles its own intricate output
+operation (such as WebFetch::Output::TWiki). An empty savables array would
+otherwise cause an error. Calling this function adds a marker entry to the
+array which says that having nothing to save is OK.
+
+=cut
+
+sub no_savables_ok
+{
+ my $self = shift;
+
+ push ( @{$self->{savable}}, {
+ 'ok_empty' => 1,
+ });
+}
+
=item $obj->save
This WebFetch utility function goes through all the entries in the
@@ -1330,6 +1356,11 @@
print STDERR "saving ".$savable->{file}."\n";
}
+ # an output module may have handled a more intricate operation
+ if ( exists $savable->{ok_empty}) {
+ last;
+ }
+
# verify contents of savable record
if ( !exists $savable->{file}) {
$savable->{error} = "missing file name - skipped";
@@ -1571,6 +1602,10 @@
return 1;
}
+=item $obj->wk2fname( $wk )
+
+=cut
+
# convert well-known name to field name
sub wk2fname
{
@@ -1604,6 +1639,10 @@
return undef;
}
+=item $obj->fname2fnum( $fname )
+
+=cut
+
# convert a field name to a field number
sub fname2fnum
{
@@ -1614,6 +1653,10 @@
? $self->{fname2fnum}{$fname} : undef;
}
+=item $obj->wk2fnum( $wk )
+
+=cut
+
# convert well-known name to field number
sub wk2fnum
{
@@ -1624,6 +1667,54 @@
? $self->{wk2fnum}{$wk} : undef;
}
+=item AUTOLOAD
+
+=cut
+
+# autoloader catches calls to unknown functions
+# first try: redirect to the class which made the call, if the function exists
+# second try: act as a read-only accessor for object data
+# (want a read/write accessor? define the function explicitly)
+sub AUTOLOAD
+{
+ my $self = shift;
+ my $type = ref($self) or throw_autoload_fail "self is not an object";
+
+ my $name = $AUTOLOAD;
+ $name =~ s/.*://; # strip fully-qualified portion
+
+ # skip all-caps special Perl functions
+ if ( $name =~ /^[A-Z]+$/ ) {
+ return;
+ }
+
+ # check for function in caller package
+ # (WebFetch may hand an input module's object to an output module)
+ my ( $package, $filename, $line ) = caller;
+ if ( $package->can( $name )) {
+ my $retval = eval $package."::".$name."( \$self, \@_ )";
+ if ( $@ ) {
+ my $e = Exception::Class->caught();
+ ref $e ? $e->rethrow
+ : throw_autoload_fail "failure in "
+ ."autoloaded function: ".$e;
+ }
+ return $retval;
+ }
+
+ # act as a read-only accessor
+ # add write accessors when API can specify what's OK to write
+ if ( exists $self->{$name}) {
+ # define the sub for better efficiency next time
+ eval "sub WebFetch::$name { return \$_[0]->{$name}; }";
+ return $self->{$name};
+ }
+
+ # if we got here, we failed
+ throw_autoload_fail "function $name not found - "
+ ."called by $package ($filename line $line)";
+}
+
1;
__END__
# remainder of POD docs follow
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|