[PerlWikiBot] SF.net SVN: perlwikibot:[90] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
|
From: <am...@us...> - 2010-03-26 18:42:19
|
Revision: 90
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=90&view=rev
Author: amire80
Date: 2010-03-26 18:42:13 +0000 (Fri, 26 Mar 2010)
Log Message:
-----------
Ported to MediaWiki::DumpFile, the new version of Parse::MediaWikiDump.
Modified Paths:
--------------
trunk/no-interwiki/MediaWiki/Toolkit.pm
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90)
@@ -15,6 +15,7 @@
FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )],
GET_STRING => [qw( get_strings get_string )],
FILE_UTIL => [qw( file_error append_to_file read_file )],
+ NAMESPACE => [qw( init_namespaces namespace )],
);
our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
@@ -111,5 +112,27 @@
return $text;
}
+my @NAMESPACES;
+
+sub init_namespaces {
+ @NAMESPACES = @_;
+ return;
+}
+
+my $NAMESPACE_RE = qr/\A (?<namespace>[^:]+) : .* /xms;
+
+sub namespace {
+ my ($title) = @_;
+ my $namespace = q();
+
+ if ($title =~ $NAMESPACE_RE) {
+ if ($+{namespace} ~~ @NAMESPACES) {
+ $namespace = $+{namespace};
+ }
+ }
+
+ return $namespace || get_string('article space');
+}
+
1;
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90)
@@ -54,12 +54,15 @@
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+use MediaWiki::DumpFile::Pages;
+
use MediaWiki::Toolkit (
qw(
:FIELD_SEP
- :GET_STRING :FILE_UTIL)
+ :GET_STRING :FILE_UTIL
+ :NAMESPACE
+ )
);
our $VERSION = '0.2.1';
@@ -94,7 +97,7 @@
Log::Log4perl->easy_init(
{
level => $INFO, # print everything
- file => ":utf8>$LOG_FN", # utf is important
+ file => ":utf8>$LOG_FN", # utf8 is important
layout => '%m%n', # No need to print the date
}
);
@@ -360,8 +363,11 @@
my $the_dump = load_dump();
-my $namespaces_alt = join $ALT_SEP,
- grep { length > 0 } @{ $the_dump->namespaces_names() };
+my %NAMESPACES = $the_dump->namespaces();
+my @NAMESPACES = values %NAMESPACES;
+init_namespaces(@NAMESPACES);
+my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @NAMESPACES;
+
my $PURE_TITLE_RE = qr{
\A
(?:
@@ -414,16 +420,11 @@
sub load_dump {
my $dump;
- $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+ $dump = MediaWiki::DumpFile::Pages->new($dump_fn);
return $dump;
}
-sub namespace {
- my ($page) = @_;
- return $page->namespace() || get_string('article space');
-}
-
sub next_page {
my ($dump) = @_;
my $page_ref = eval { $dump->next(); };
@@ -447,9 +448,9 @@
last PAGE;
}
- my $page_text_ref = $page_ref->text();
+ my $page_text = $page_ref->revision()->text();
- if (${$page_text_ref} !~ /\S/xmsi
+ if ($page_text !~ /\S/xmsi
and not is_in_namespace($page_ref, 'User', 'User talk'))
{
special_cases_file('empty_page', $page_ref);
@@ -457,7 +458,7 @@
if (
$page_counter < $option{start_from}
- or not defined ${$page_text_ref} # must be tested before redirect
+ or not defined $page_text # must be tested before redirect
or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
or is_redirect($page_ref)
)
@@ -465,11 +466,11 @@
next PAGE;
}
- my $page_namespace = namespace($page_ref);
+ my $page_title = $page_ref->title();
+
+ my $page_namespace = namespace($page_title);
$namespace_count{$page_namespace}++;
- my $page_title = $page_ref->title();
-
INFO("\n* processing $page_counter - ", $page_title);
# TODO: Be more precise here.
@@ -482,8 +483,7 @@
}
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_template_no_iw =
- (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+ my $has_template_no_iw = ($page_text =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
@@ -493,7 +493,7 @@
INFO("has link to $has_iw");
if ($has_template_no_iw) {
INFO('has template no_iw. trying to remove ...');
- remove_template_no_iw($page_text_ref);
+ remove_template_no_iw($page_text);
$statistics{'has both valid interwiki and template'}++;
special_cases_file('outdated_template', $page_ref);
}
@@ -513,20 +513,21 @@
$has_iw # scalar bool
) = @_;
+ my $page_title = $page_ref->title();
INFO(q(does not have iw link.));
- $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+ $statistics{'has no interwiki link'}->{ namespace($page_title) }++;
# Now we need to search for no_iw templates
# and parse their parameters - date and type
my @found_templates = ();
- my $page_text_ref = $page_ref->text();
+ my $page_text = $page_ref->revision()->text();
# Optimized - does not start searching,
# if we already know that it is not there
if ($has_template_no_iw) {
- find_templates($page_text_ref, \@found_templates,
+ find_templates(\$page_text, \@found_templates,
[ get_string('no_iw') ]);
}
@@ -566,7 +567,7 @@
}
elsif (cooling_date_passed($date_ref)) {
INFO('cooling date passed, updating to today ...');
- update_cooling_date($page_text_ref);
+ update_cooling_date($page_text);
$statistics{'cooling date passed'}++;
}
else {
@@ -583,7 +584,7 @@
my @all_types = get_all_types($template->{params}->{type}, $page_ref);
foreach my $type (@all_types) {
- INFO('adding ' . $page_ref->title() . " to the list as type $type");
+ INFO('adding ' . $page_title . " to the list as type $type");
add_to_no_iw_list($page_ref, $type);
$type_count{$type}++;
}
@@ -792,9 +793,9 @@
my ($page) = @_;
my $page_title = $page->title();
- my $page_text = ${ $page->text() }; # XXX
+ my $page_text = $page->revision()->text();
- study $page_text; # XXX
+ study $page_text; # XXX
my %iw_links;
my %special_cases;
@@ -1135,9 +1136,9 @@
return 'English';
}
- my $page_text_ref = $page->text();
+ my $page_text = $page->revision()->text();
- if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+ if ($page_text =~ $LOCAL_REDIRECT_RE) {
return 'local';
}
@@ -1147,7 +1148,8 @@
sub is_in_namespace {
my ($page, @namespaces) = @_;
- return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+ return namespace($page->title()) ~~
+ [ map { get_string($_) } @namespaces ];
}
sub is_category {
@@ -1158,8 +1160,8 @@
sub is_disambig {
my ($page) = @_;
- my $found_templates =
- find_templates($page->text(), [], [ get_string('disambig') ]);
+ my $found_templates = find_templates(\$page->revision()->text(),
+ [], [ get_string('disambig') ]);
return scalar @{$found_templates};
}
@@ -1497,7 +1499,7 @@
=head2 unable to handle any case setting besides 'first-letter'
Something is weird with the dump. See the documentation of
-L<Parse::MediaWikiDump> and MediaWiki.
+L<MediaWiki::DumpFile> and MediaWiki.
=head2 A page has no pure title
@@ -1551,7 +1553,7 @@
=over
-=item * C<Parse::MediaWikiDump>
+=item * C<MediaWiki::DumpFile>
This module is used for reading pages from the XML dump.
@@ -1671,7 +1673,7 @@
=item * Statistics and multi links are just slapped to the log.
=item * At least some of the code can be rewritten as classes that inherit
-from L<Parse::MediaWikiDump>.
+from L<MediaWiki::DumpFile>.
=back
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|