[PerlWikiBot] SF.net SVN: perlwikibot:[90] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
From: <am...@us...> - 2010-03-26 18:42:19
|
Revision: 90
          http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=90&view=rev
Author:   amire80
Date:     2010-03-26 18:42:13 +0000 (Fri, 26 Mar 2010)

Log Message:
-----------
Ported to MediaWiki::DumpFile, the new version of Parse::MediaWikiDump.

Modified Paths:
--------------
    trunk/no-interwiki/MediaWiki/Toolkit.pm
    trunk/no-interwiki/prepare_noiw_list.pl

Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90)
@@ -15,6 +15,7 @@
     FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )],
     GET_STRING => [qw( get_strings get_string )],
     FILE_UTIL => [qw( file_error append_to_file read_file )],
+    NAMESPACE => [qw( init_namespaces namespace )],
 );

 our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
@@ -111,5 +112,27 @@

     return $text;
 }

+my @NAMESPACES;
+
+sub init_namespaces {
+    @NAMESPACES = @_;
+    return;
+}
+
+my $NAMESPACE_RE = qr/\A (?<namespace>[^:]+) : .* /xms;
+
+sub namespace {
+    my ($title) = @_;
+    my $namespace = q();
+
+    if ($title =~ $NAMESPACE_RE) {
+        if ($+{namespace} ~~ @NAMESPACES) {
+            $namespace = $+{namespace};
+        }
+    }
+
+    return $namespace || get_string('article space');
+}
+
 1;

Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90)
@@ -54,12 +54,15 @@
 use Regexp::Common;
 use Lingua::Translit;
 use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+use MediaWiki::DumpFile::Pages;
+

 use MediaWiki::Toolkit (
     qw(
         :FIELD_SEP
-        :GET_STRING :FILE_UTIL)
+        :GET_STRING :FILE_UTIL
+        :NAMESPACE
+        )
 );

 our $VERSION = '0.2.1';
@@ -94,7 +97,7 @@
 Log::Log4perl->easy_init(
     {
         level => $INFO, # print everything
-        file => ":utf8>$LOG_FN", # utf is important
+        file => ":utf8>$LOG_FN", # utf8 is important
         layout => '%m%n', # No need to print the date
     }
 );
@@ -360,8 +363,11 @@

 my $the_dump = load_dump();

-my $namespaces_alt = join $ALT_SEP,
-    grep { length > 0 } @{ $the_dump->namespaces_names() };
+my %NAMESPACES = $the_dump->namespaces();
+my @NAMESPACES = values %NAMESPACES;
+init_namespaces(@NAMESPACES);
+my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @NAMESPACES;
+
 my $PURE_TITLE_RE = qr{
     \A
     (?:
@@ -414,16 +420,11 @@
 sub load_dump {
     my $dump;

-    $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+    $dump = MediaWiki::DumpFile::Pages->new($dump_fn);

     return $dump;
 }

-sub namespace {
-    my ($page) = @_;
-    return $page->namespace() || get_string('article space');
-}
-
 sub next_page {
     my ($dump) = @_;
     my $page_ref = eval { $dump->next(); };
@@ -447,9 +448,9 @@
         last PAGE;
     }

-    my $page_text_ref = $page_ref->text();
+    my $page_text = $page_ref->revision()->text();

-    if (${$page_text_ref} !~ /\S/xmsi
+    if ($page_text !~ /\S/xmsi
         and not is_in_namespace($page_ref, 'User', 'User talk'))
     {
         special_cases_file('empty_page', $page_ref);
@@ -457,7 +458,7 @@

     if (
         $page_counter < $option{start_from}
-        or not defined ${$page_text_ref} # must be tested before redirect
+        or not defined $page_text # must be tested before redirect
         or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
         or is_redirect($page_ref)
     )
@@ -465,11 +466,11 @@
         next PAGE;
     }

-    my $page_namespace = namespace($page_ref);
+    my $page_title = $page_ref->title();
+
+    my $page_namespace = namespace($page_title);
     $namespace_count{$page_namespace}++;

-    my $page_title = $page_ref->title();
-
     INFO("\n* processing $page_counter - ", $page_title);

     # TODO: Be more precise here.
@@ -482,8 +483,7 @@
     }

     # A simple sanity check: is the no_iw template anywhere around here?
-    my $has_template_no_iw =
-        (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+    my $has_template_no_iw = ($page_text =~ $SIMPLE_NO_IW_CHECK_RE);

     # Does the page have interwiki links?
     # BIG XXX Actually checks only for English
@@ -493,7 +493,7 @@
         INFO("has link to $has_iw");
         if ($has_template_no_iw) {
             INFO('has template no_iw. trying to remove ...');
-            remove_template_no_iw($page_text_ref);
+            remove_template_no_iw($page_text);
             $statistics{'has both valid interwiki and template'}++;
             special_cases_file('outdated_template', $page_ref);
         }
@@ -513,20 +513,21 @@
         $has_iw # scalar bool
     ) = @_;

+    my $page_title = $page_ref->title();
     INFO(q(does not have iw link.));

-    $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+    $statistics{'has no interwiki link'}->{ namespace($page_title) }++;

     # Now we need to search for no_iw templates
     # and parse their parameters - date and type
     my @found_templates = ();

-    my $page_text_ref = $page_ref->text();
+    my $page_text = $page_ref->revision()->text();

     # Optimized - does not start searching,
     # if we already know that it is not there
     if ($has_template_no_iw) {
-        find_templates($page_text_ref, \@found_templates,
+        find_templates(\$page_text, \@found_templates,
             [ get_string('no_iw') ]);
     }

@@ -566,7 +567,7 @@
         }
         elsif (cooling_date_passed($date_ref)) {
             INFO('cooling date passed, updating to today ...');
-            update_cooling_date($page_text_ref);
+            update_cooling_date($page_text);
             $statistics{'cooling date passed'}++;
         }
         else {
@@ -583,7 +584,7 @@
         my @all_types =
             get_all_types($template->{params}->{type}, $page_ref);
         foreach my $type (@all_types) {
-            INFO('adding ' . $page_ref->title() . " to the list as type $type");
+            INFO('adding ' . $page_title . " to the list as type $type");
             add_to_no_iw_list($page_ref, $type);
             $type_count{$type}++;
         }
@@ -792,9 +793,9 @@
     my ($page) = @_;

     my $page_title = $page->title();
-    my $page_text = ${ $page->text() }; # XXX
+    my $page_text = $page->revision()->text();

-    study $page_text; # XXX
+    study $page_text; # XXX

     my %iw_links;
     my %special_cases;
@@ -1135,9 +1136,9 @@
         return 'English';
     }

-    my $page_text_ref = $page->text();
+    my $page_text = $page->revision()->text();

-    if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+    if ($page_text =~ $LOCAL_REDIRECT_RE) {
         return 'local';
     }

@@ -1147,7 +1148,8 @@
 sub is_in_namespace {
     my ($page, @namespaces) = @_;

-    return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+    return namespace($page->title()) ~~
+        [ map { get_string($_) } @namespaces ];
 }

 sub is_category {
@@ -1158,8 +1160,8 @@
 sub is_disambig {
     my ($page) = @_;

-    my $found_templates =
-        find_templates($page->text(), [], [ get_string('disambig') ]);
+    my $found_templates = find_templates(\$page->revision()->text(),
+        [], [ get_string('disambig') ]);

     return scalar @{$found_templates};
 }
@@ -1497,7 +1499,7 @@
 =head2 unable to handle any case setting besides 'first-letter'

 Something is weird with the dump. See the documentation of
-L<Parse::MediaWikiDump> and MediaWiki.
+L<MediaWiki::DumpFile> and MediaWiki.

 =head2 A page has no pure title

@@ -1551,7 +1553,7 @@

 =over

-=item * C<Parse::MediaWikiDump>
+=item * C<MediaWiki::DumpFile>

 This module is used for reading pages from the XML dump.

@@ -1671,7 +1673,7 @@
 =item * Statistics and multi links are just slapped to the log.

 =item * At least some of the code can be rewritten as classes that inherit
-from L<Parse::MediaWikiDump>.
+from L<MediaWiki::DumpFile>.

 =back


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|