perlwikibot-svn Mailing List for Perl MediaWiki Robot
Status: Pre-Alpha
Brought to you by:
rotemliss
You can subscribe to this list here.
| 2006 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(19) |
Oct
|
Nov
|
Dec
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2007 |
Jan
|
Feb
|
Mar
(4) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2008 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(20) |
Aug
(12) |
Sep
|
Oct
|
Nov
(3) |
Dec
|
| 2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(11) |
Oct
(1) |
Nov
|
Dec
|
| 2010 |
Jan
|
Feb
|
Mar
(6) |
Apr
|
May
(1) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
(2) |
Dec
|
|
From: <am...@us...> - 2010-11-06 15:20:27
|
Revision: 93
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=93&view=rev
Author: amire80
Date: 2010-11-06 15:20:21 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Import carp.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:08:56 UTC (rev 92)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:20:21 UTC (rev 93)
@@ -38,7 +38,7 @@
# Standard library
# These modules should come installed with Perl
use English qw(-no_match_vars);
-use Carp qw(croak cluck);
+use Carp qw(carp croak cluck);
use Time::Local;
use Getopt::Long;
use Data::Dumper;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-11-06 15:09:02
|
Revision: 92
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=92&view=rev
Author: amire80
Date: 2010-11-06 15:08:56 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Not forcing SVN version number.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-05-26 15:36:59 UTC (rev 91)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:08:56 UTC (rev 92)
@@ -74,10 +74,13 @@
Date => '$Date$',
);
#>>>
-our $FULL_VERSION =
- ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms)
- ? "$VERSION.$+{revision_num}"
- : croak(q(Something is wrong with SVN revision number));
+our $FULL_VERSION = $VERSION;
+if ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) {
+ $FULL_VERSION .= ".$+{revision_num}";
+}
+else {
+ carp(q(Something is wrong with SVN revision number, but never mind));
+}
my $WIKITEXT_EXT = 'wiki.txt';
my $OUT_DIR = 'out';
@@ -1195,10 +1198,10 @@
carp("$lang_code is $lang_reftype, not hashref!");
next LANG_CODE;
}
- my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
- say 'sort keys found_links lang_code'; # XXX
+ my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+ say 'sort keys found_links lang_code'; # XXX
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
- say 'format_link_table lang_code scalar foreign_articles'; # XXX
+ say 'format_link_table lang_code scalar foreign_articles'; # XXX
say format_link_table($lang_code, scalar @foreign_articles);
FOREIGN_ARTICLE:
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-05-26 15:37:06
|
Revision: 91
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=91&view=rev
Author: amire80
Date: 2010-05-26 15:36:59 +0000 (Wed, 26 May 2010)
Log Message:
-----------
Strings for Masry (arz).
Modified Paths:
--------------
trunk/no-interwiki/MediaWiki/Toolkit.pm
trunk/no-interwiki/prepare_noiw_list.pl
trunk/no-interwiki/upload_iw.pl
Added Paths:
-----------
trunk/no-interwiki/arz.language_codes.txt
trunk/no-interwiki/arz.strings.txt
Property Changed:
----------------
trunk/no-interwiki/MediaWiki/Toolkit.pm
trunk/no-interwiki/upload_iw.pl
Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-05-26 15:36:59 UTC (rev 91)
@@ -16,15 +16,16 @@
GET_STRING => [qw( get_strings get_string )],
FILE_UTIL => [qw( file_error append_to_file read_file )],
NAMESPACE => [qw( init_namespaces namespace )],
+ DUMP => [qw( load_dump )],
);
our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
our $VERSION = '0.01';
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
- Revision => '$Revision: 0 $',
- HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $',
- Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $',
+ Revision => '$Revision$',
+ HeadURL => '$HeadURL$',
+ Date => '$Date$',
);
#>>>
@@ -134,5 +135,13 @@
return $namespace || get_string('article space');
}
+sub load_dump {
+ my ($dump_fn) = @_;
+
+ my $dump = MediaWiki::DumpFile::Pages->new($dump_fn);
+
+ return $dump;
+}
+
1;
Property changes on: trunk/no-interwiki/MediaWiki/Toolkit.pm
___________________________________________________________________
Added: svn:keywords
+ Revision HeadURL Date
Added: trunk/no-interwiki/arz.language_codes.txt
===================================================================
--- trunk/no-interwiki/arz.language_codes.txt (rev 0)
+++ trunk/no-interwiki/arz.language_codes.txt 2010-05-26 15:36:59 UTC (rev 91)
@@ -0,0 +1 @@
+link language_codes.txt
\ No newline at end of file
Property changes on: trunk/no-interwiki/arz.language_codes.txt
___________________________________________________________________
Added: svn:special
+ *
Added: trunk/no-interwiki/arz.strings.txt
===================================================================
--- trunk/no-interwiki/arz.strings.txt (rev 0)
+++ trunk/no-interwiki/arz.strings.txt 2010-05-26 15:36:59 UTC (rev 91)
@@ -0,0 +1,37 @@
+# months
+January يناير
+February فبراير
+March مارس
+April ابريل
+May مايو
+June يونيه
+July يوليه
+August اغسطس
+September سبتمبر
+October اكتوبر
+November نوفمبر
+December ديسمبر
+
+no_iw no_iw
+disambig توضيح
+template قالب
+
+date تاريخ
+type نوع
+
+# MW specials
+REDIRECT تحويل
+
+# Namespaces
+User مستخدم
+User talk نقاش المستخدم
+Image ملف
+Portal بوابة
+Category تصنيف
+article space (رئيسى)
+
+# Other
+other متفرقات
+rlm ‏
+exclude_lowercase ß
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-05-26 15:36:59 UTC (rev 91)
@@ -62,6 +62,7 @@
:FIELD_SEP
:GET_STRING :FILE_UTIL
:NAMESPACE
+ :DUMP
)
);
@@ -361,7 +362,7 @@
}
}
-my $the_dump = load_dump();
+my $the_dump = load_dump($dump_fn);
my %NAMESPACES = $the_dump->namespaces();
my @NAMESPACES = values %NAMESPACES;
@@ -417,14 +418,6 @@
exit;
-sub load_dump {
- my $dump;
-
- $dump = MediaWiki::DumpFile::Pages->new($dump_fn);
-
- return $dump;
-}
-
sub next_page {
my ($dump) = @_;
my $page_ref = eval { $dump->next(); };
@@ -1203,11 +1196,14 @@
next LANG_CODE;
}
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+ say 'sort keys found_links lang_code'; # XXX
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say 'format_link_table lang_code scalar foreign_articles'; # XXX
say format_link_table($lang_code, scalar @foreign_articles);
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
+ say 'local_articles = keys found_links lang_code';
my @local_articles =
keys %{ $found_links{$lang_code}->{$foreign_article} };
@@ -1219,6 +1215,7 @@
}
if (scalar @local_articles > 1) {
+ say 'links = join sort map make_link';
my $links = join q{ | }, sort map { make_link($_) }
keys %{ $found_links{$lang_code}->{$foreign_article} };
@@ -1226,6 +1223,7 @@
make_link($lang_code
. $MW_SYNTAX{namespace_sep}
. $foreign_article);
+ say 'append_to_file filename';
append_to_file($filename, "* '''$foreign_title''' - $links");
}
}
Modified: trunk/no-interwiki/upload_iw.pl
===================================================================
--- trunk/no-interwiki/upload_iw.pl 2010-03-26 18:42:13 UTC (rev 90)
+++ trunk/no-interwiki/upload_iw.pl 2010-05-26 15:36:59 UTC (rev 91)
@@ -26,9 +26,9 @@
our $VERSION = '0.01';
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
- Revision => '$Revision: 0 $',
- HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $',
- Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $',
+ Revision => '$Revision$',
+ HeadURL => '$HeadURL$',
+ Date => '$Date$',
);
#>>>
Property changes on: trunk/no-interwiki/upload_iw.pl
___________________________________________________________________
Added: svn:keywords
+ Revision HeadURL Date
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-26 18:42:19
|
Revision: 90
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=90&view=rev
Author: amire80
Date: 2010-03-26 18:42:13 +0000 (Fri, 26 Mar 2010)
Log Message:
-----------
Ported to MediaWiki::DumpFile, the new version of Parse::MediaWikiDump.
Modified Paths:
--------------
trunk/no-interwiki/MediaWiki/Toolkit.pm
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90)
@@ -15,6 +15,7 @@
FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )],
GET_STRING => [qw( get_strings get_string )],
FILE_UTIL => [qw( file_error append_to_file read_file )],
+ NAMESPACE => [qw( init_namespaces namespace )],
);
our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
@@ -111,5 +112,27 @@
return $text;
}
+my @NAMESPACES;
+
+sub init_namespaces {
+ @NAMESPACES = @_;
+ return;
+}
+
+my $NAMESPACE_RE = qr/\A (?<namespace>[^:]+) : .* /xms;
+
+sub namespace {
+ my ($title) = @_;
+ my $namespace = q();
+
+ if ($title =~ $NAMESPACE_RE) {
+ if ($+{namespace} ~~ @NAMESPACES) {
+ $namespace = $+{namespace};
+ }
+ }
+
+ return $namespace || get_string('article space');
+}
+
1;
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 15:31:20 UTC (rev 89)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90)
@@ -54,12 +54,15 @@
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+use MediaWiki::DumpFile::Pages;
+
use MediaWiki::Toolkit (
qw(
:FIELD_SEP
- :GET_STRING :FILE_UTIL)
+ :GET_STRING :FILE_UTIL
+ :NAMESPACE
+ )
);
our $VERSION = '0.2.1';
@@ -94,7 +97,7 @@
Log::Log4perl->easy_init(
{
level => $INFO, # print everything
- file => ":utf8>$LOG_FN", # utf is important
+ file => ":utf8>$LOG_FN", # utf8 is important
layout => '%m%n', # No need to print the date
}
);
@@ -360,8 +363,11 @@
my $the_dump = load_dump();
-my $namespaces_alt = join $ALT_SEP,
- grep { length > 0 } @{ $the_dump->namespaces_names() };
+my %NAMESPACES = $the_dump->namespaces();
+my @NAMESPACES = values %NAMESPACES;
+init_namespaces(@NAMESPACES);
+my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @NAMESPACES;
+
my $PURE_TITLE_RE = qr{
\A
(?:
@@ -414,16 +420,11 @@
sub load_dump {
my $dump;
- $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+ $dump = MediaWiki::DumpFile::Pages->new($dump_fn);
return $dump;
}
-sub namespace {
- my ($page) = @_;
- return $page->namespace() || get_string('article space');
-}
-
sub next_page {
my ($dump) = @_;
my $page_ref = eval { $dump->next(); };
@@ -447,9 +448,9 @@
last PAGE;
}
- my $page_text_ref = $page_ref->text();
+ my $page_text = $page_ref->revision()->text();
- if (${$page_text_ref} !~ /\S/xmsi
+ if ($page_text !~ /\S/xmsi
and not is_in_namespace($page_ref, 'User', 'User talk'))
{
special_cases_file('empty_page', $page_ref);
@@ -457,7 +458,7 @@
if (
$page_counter < $option{start_from}
- or not defined ${$page_text_ref} # must be tested before redirect
+ or not defined $page_text # must be tested before redirect
or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
or is_redirect($page_ref)
)
@@ -465,11 +466,11 @@
next PAGE;
}
- my $page_namespace = namespace($page_ref);
+ my $page_title = $page_ref->title();
+
+ my $page_namespace = namespace($page_title);
$namespace_count{$page_namespace}++;
- my $page_title = $page_ref->title();
-
INFO("\n* processing $page_counter - ", $page_title);
# TODO: Be more precise here.
@@ -482,8 +483,7 @@
}
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_template_no_iw =
- (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+ my $has_template_no_iw = ($page_text =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
@@ -493,7 +493,7 @@
INFO("has link to $has_iw");
if ($has_template_no_iw) {
INFO('has template no_iw. trying to remove ...');
- remove_template_no_iw($page_text_ref);
+ remove_template_no_iw($page_text);
$statistics{'has both valid interwiki and template'}++;
special_cases_file('outdated_template', $page_ref);
}
@@ -513,20 +513,21 @@
$has_iw # scalar bool
) = @_;
+ my $page_title = $page_ref->title();
INFO(q(does not have iw link.));
- $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+ $statistics{'has no interwiki link'}->{ namespace($page_title) }++;
# Now we need to search for no_iw templates
# and parse their parameters - date and type
my @found_templates = ();
- my $page_text_ref = $page_ref->text();
+ my $page_text = $page_ref->revision()->text();
# Optimized - does not start searching,
# if we already know that it is not there
if ($has_template_no_iw) {
- find_templates($page_text_ref, \@found_templates,
+ find_templates(\$page_text, \@found_templates,
[ get_string('no_iw') ]);
}
@@ -566,7 +567,7 @@
}
elsif (cooling_date_passed($date_ref)) {
INFO('cooling date passed, updating to today ...');
- update_cooling_date($page_text_ref);
+ update_cooling_date($page_text);
$statistics{'cooling date passed'}++;
}
else {
@@ -583,7 +584,7 @@
my @all_types = get_all_types($template->{params}->{type}, $page_ref);
foreach my $type (@all_types) {
- INFO('adding ' . $page_ref->title() . " to the list as type $type");
+ INFO('adding ' . $page_title . " to the list as type $type");
add_to_no_iw_list($page_ref, $type);
$type_count{$type}++;
}
@@ -792,9 +793,9 @@
my ($page) = @_;
my $page_title = $page->title();
- my $page_text = ${ $page->text() }; # XXX
+ my $page_text = $page->revision()->text();
- study $page_text; # XXX
+ study $page_text; # XXX
my %iw_links;
my %special_cases;
@@ -1135,9 +1136,9 @@
return 'English';
}
- my $page_text_ref = $page->text();
+ my $page_text = $page->revision()->text();
- if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+ if ($page_text =~ $LOCAL_REDIRECT_RE) {
return 'local';
}
@@ -1147,7 +1148,8 @@
sub is_in_namespace {
my ($page, @namespaces) = @_;
- return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+ return namespace($page->title()) ~~
+ [ map { get_string($_) } @namespaces ];
}
sub is_category {
@@ -1158,8 +1160,8 @@
sub is_disambig {
my ($page) = @_;
- my $found_templates =
- find_templates($page->text(), [], [ get_string('disambig') ]);
+ my $found_templates = find_templates(\$page->revision()->text(),
+ [], [ get_string('disambig') ]);
return scalar @{$found_templates};
}
@@ -1497,7 +1499,7 @@
=head2 unable to handle any case setting besides 'first-letter'
Something is weird with the dump. See the documentation of
-L<Parse::MediaWikiDump> and MediaWiki.
+L<MediaWiki::DumpFile> and MediaWiki.
=head2 A page has no pure title
@@ -1551,7 +1553,7 @@
=over
-=item * C<Parse::MediaWikiDump>
+=item * C<MediaWiki::DumpFile>
This module is used for reading pages from the XML dump.
@@ -1671,7 +1673,7 @@
=item * Statistics and multi links are just slapped to the log.
=item * At least some of the code can be rewritten as classes that inherit
-from L<Parse::MediaWikiDump>.
+from L<MediaWiki::DumpFile>.
=back
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-26 15:31:29
|
Revision: 89
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=89&view=rev
Author: amire80
Date: 2010-03-26 15:31:20 +0000 (Fri, 26 Mar 2010)
Log Message:
-----------
Thanks to almut and ikegami on PerlMonks.
Modified Paths:
--------------
trunk/no-interwiki/upload_iw.pl
Modified: trunk/no-interwiki/upload_iw.pl
===================================================================
--- trunk/no-interwiki/upload_iw.pl 2010-03-26 15:22:02 UTC (rev 88)
+++ trunk/no-interwiki/upload_iw.pl 2010-03-26 15:31:20 UTC (rev 89)
@@ -80,7 +80,12 @@
say Dumper(\@filenames);
foreach my $filename (@filenames) {
- my $pagename = Encode::decode('UTF-8', "$filename");
+ # This was tested on Ubuntu. Other systems may have other encodings
+ # for file names. Test in a sandbox before using it on a live Wikipedia,
+ # otherwise pages with gibberish titles may be created.
+ # Thanks to almut and ikegami on PerlMonks:
+ # http://www.perlmonks.org/?node_id=830948
+ my $pagename = Encode::decode('UTF-8', $filename);
for ($pagename) {
s/\.$INPUT_EXTENSION\z//xms;
s/\A$dirname//xms;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-26 15:22:09
|
Revision: 88
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=88&view=rev
Author: amire80
Date: 2010-03-26 15:22:02 +0000 (Fri, 26 Mar 2010)
Log Message:
-----------
Adding upload_iw.pl.
Modified Paths:
--------------
trunk/no-interwiki/MediaWiki/Toolkit.pm
trunk/no-interwiki/ru.strings.txt
Added Paths:
-----------
trunk/no-interwiki/upload_iw.pl
Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:22:02 UTC (rev 88)
@@ -14,7 +14,7 @@
our %EXPORT_TAGS = (
FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )],
GET_STRING => [qw( get_strings get_string )],
- FILE_UTIL => [qw( file_error append_to_file )],
+ FILE_UTIL => [qw( file_error append_to_file read_file )],
);
our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
@@ -81,24 +81,35 @@
}
sub file_error {
- my ($operation, $fn, $access_type) = @_;
- my $string = "error $operation $fn, $access_type: $OS_ERROR";
+ my ($operation, $filename, $access_type) = @_;
+ my $string = "error $operation $filename, $access_type: $OS_ERROR";
return $string;
}
sub append_to_file {
- my ($fn, $line) = @_;
+ my ($filename, $line) = @_;
- open my $file, '>>', $fn
- or croak(file_error('opening', $fn, 'appending'));
-
+ open my $file, '>>', $filename
+ or croak(file_error('opening', $filename, 'appending'));
say {$file} ($line // q{}); # /
-
close $file
- or croak(file_error('closing', $fn, 'appeding'));
+ or croak(file_error('closing', $filename, 'appeding'));
return;
}
+sub read_file {
+ my ($filename) = @_;
+
+ local $INPUT_RECORD_SEPARATOR = undef;
+ open my $file, '<', $filename
+ or croak(file_error('opening', $filename, 'reader'));
+ my $text = <$file>;
+ close $file
+ or croak(file_error('closing', $filename, 'reading'));
+
+ return $text;
+}
+
1;
Modified: trunk/no-interwiki/ru.strings.txt
===================================================================
--- trunk/no-interwiki/ru.strings.txt 2010-03-26 14:03:26 UTC (rev 87)
+++ trunk/no-interwiki/ru.strings.txt 2010-03-26 15:22:02 UTC (rev 88)
@@ -30,6 +30,12 @@
Category Категория
article space (статьи)
+# pages
+project prefix Википедия:Проект:Интервики/Виды/Категории
+
+# summaries
+updating list of pages without interwiki обновление списка страниц без интервики
+
# Other
other другое
rlm {{כ}}
Added: trunk/no-interwiki/upload_iw.pl
===================================================================
--- trunk/no-interwiki/upload_iw.pl (rev 0)
+++ trunk/no-interwiki/upload_iw.pl 2010-03-26 15:22:02 UTC (rev 88)
@@ -0,0 +1,114 @@
+#!/usr/bin/perl
+
+use 5.010;
+
+use strict;
+use warnings;
+use open ':encoding(utf8)';
+use utf8;
+
+use English qw(-no_match_vars);
+use Carp qw(croak cluck);
+use Getopt::Long;
+use Data::Dumper;
+use Encode;
+
+use Readonly;
+
+use MediaWiki::API;
+
+use MediaWiki::Toolkit (
+ qw(
+ :GET_STRING
+ :FILE_UTIL)
+);
+
+our $VERSION = '0.01';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+ Revision => '$Revision: 0 $',
+ HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $',
+ Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $',
+);
+#>>>
+
+Readonly my $INPUT_EXTENSION => 'wiki.txt';
+Readonly my $INTERVAL_BETWEEN_EDITS => 5;
+
+my %option = (
+ help => 0,
+ usage => 0,
+ version => 0,
+);
+
+my $valid_options = GetOptions(
+ 'langcode=s' => \$option{langcode},
+ 'username=s' => \$option{username},
+ 'password=s' => \$option{password},
+ 'help' => \$option{help},
+ 'usage' => \$option{usage},
+ 'version' => \$option{version},
+);
+
+if (not $valid_options) {
+ croak('Invalid command line options.');
+}
+
+for my $required_option (qw(langcode username password)) {
+ if (not $option{$required_option}) {
+ croak "option $required_option is required";
+ }
+}
+
+get_strings($option{langcode});
+
+my $mw = MediaWiki::API->new();
+$mw->{config}->{api_url} = "http://$option{langcode}.wikipedia.org/w/api.php";
+
+$mw->login(
+ {
+ lgname => $option{username},
+ lgpassword => $option{password},
+ }
+) or croak $mw->{error}->{code} . ': ' . $mw->{error}->{details};
+
+my $page_prefix = get_string('project prefix');
+
+my $dirname = "./out.$option{langcode}/";
+my @filenames = glob $dirname . get_string('Category') . "*.$INPUT_EXTENSION";
+
+say Dumper(\@filenames);
+
+foreach my $filename (@filenames) {
+ my $pagename = Encode::decode('UTF-8', "$filename");
+ for ($pagename) {
+ s/\.$INPUT_EXTENSION\z//xms;
+ s/\A$dirname//xms;
+ }
+ $pagename = "$page_prefix/$pagename";
+
+ my $page = $mw->get_page({ title => $pagename });
+ if ($page->{missing}) {
+ say "page $pagename is missing, trying to create";
+ }
+
+ say "uploading to $pagename";
+
+ $mw->edit(
+ {
+ action => 'edit',
+ title => $pagename,
+ summary => get_string('updating list of pages without interwiki'),
+ basetimestamp => $page->{timestamp},
+ text => read_file($filename),
+ },
+ { skip_encoding => 1, }
+ ) or croak $mw->{error}->{codie} . ': ' . $mw->{error}->{details};
+
+ sleep $INTERVAL_BETWEEN_EDITS;
+}
+
+exit;
+
+__END__
+
Property changes on: trunk/no-interwiki/upload_iw.pl
___________________________________________________________________
Added: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-26 14:03:33
|
Revision: 87
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=87&view=rev
Author: amire80
Date: 2010-03-26 14:03:26 +0000 (Fri, 26 Mar 2010)
Log Message:
-----------
Starting very naive refactoring.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Added Paths:
-----------
trunk/no-interwiki/MediaWiki/
trunk/no-interwiki/MediaWiki/Toolkit.pm
Added: trunk/no-interwiki/MediaWiki/Toolkit.pm
===================================================================
--- trunk/no-interwiki/MediaWiki/Toolkit.pm (rev 0)
+++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87)
@@ -0,0 +1,104 @@
+package MediaWiki::Toolkit;
+
+use 5.010;
+
+use strict;
+use warnings;
+use integer;
+use utf8;
+use open ':encoding(utf8)';
+
+use English qw(-no_match_vars);
+
+use base 'Exporter';
+our %EXPORT_TAGS = (
+ FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )],
+ GET_STRING => [qw( get_strings get_string )],
+ FILE_UTIL => [qw( file_error append_to_file )],
+);
+our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
+
+our $VERSION = '0.01';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+ Revision => '$Revision: 0 $',
+ HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $',
+ Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $',
+);
+#>>>
+
+our $FIELD_SEP = qq{\t};
+our $FIELD_SEP_RE = qr{\Q$FIELD_SEP\E}xms;
+
+#
+# This is poor man's gettext.
+# TODO: Replace it with Locale::Maketext
+#
+{
+ my $STRING_SKIP_RE = qr{
+ \A # Begin string
+ \s* # Zero or more spaces
+ (?:\#.*)? # Comment lines
+ \z # End string
+}xms;
+
+ my %string;
+
+ sub get_strings {
+ my ($lang) = @_;
+
+ my $strings_fn = "$lang.strings.txt";
+
+ open my $strings_file, '<', $strings_fn
+ or croak(file_error('opening', $strings_fn, 'reading'));
+ my @strings_file_lines = <$strings_file>;
+ close $strings_file
+ or croak(file_error('closing', $strings_fn, 'reading'));
+
+ STRING_LINE:
+ foreach my $next_string_line (@strings_file_lines) {
+
+ # Skip blanks and comments
+ next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
+
+ chomp $next_string_line;
+ my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
+
+ # Fallback to English if no target language string was supplied
+ $string{$english} = $target // $english; # /
+ }
+
+ return;
+ }
+
+ sub get_string {
+ my ($english, $if_defined) = @_;
+
+ return $if_defined
+ ? ($string{$english})
+ : ($string{$english} //= $english); # /
+ }
+}
+
+sub file_error {
+ my ($operation, $fn, $access_type) = @_;
+ my $string = "error $operation $fn, $access_type: $OS_ERROR";
+ return $string;
+}
+
+sub append_to_file {
+ my ($fn, $line) = @_;
+
+ open my $file, '>>', $fn
+ or croak(file_error('opening', $fn, 'appending'));
+
+ say {$file} ($line // q{}); # /
+
+ close $file
+ or croak(file_error('closing', $fn, 'appeding'));
+
+ return;
+}
+
+1;
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-25 15:02:28 UTC (rev 86)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 14:03:26 UTC (rev 87)
@@ -31,7 +31,7 @@
use strict;
use warnings;
use integer; # Improves performance
-use open ':utf8';
+use open ':encoding(utf8)';
use utf8;
use charnames ':full';
@@ -56,6 +56,12 @@
use Log::Log4perl qw(:easy);
use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+use MediaWiki::Toolkit (
+ qw(
+ :FIELD_SEP
+ :GET_STRING :FILE_UTIL)
+);
+
our $VERSION = '0.2.1';
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
@@ -74,7 +80,6 @@
my $UNSORTED_DIR = "$OUT_DIR/unsorted";
my $MULTI_DIR = "$OUT_DIR/multilinks";
my $ALT_SEP = q{|};
-my $FIELD_SEP = qq{\t};
my $LINK_SEP = q{|};
my $TYPE_SEP = qr/\s*;\s*/xms;
my $HTML_RLM = '‏';
@@ -158,28 +163,19 @@
wiki # Is supposed to be after the lang code
}xms;
-my $FIELD_SEP_RE = qr{\t}xms;
-
-my $STRING_SKIP_RE = qr{
- \A # Begin string
- \s* # Zero or more spaces
- (?:\#.*)? # Comment lines
- \z # End string
-}xms;
-
# pages-meta-current
my $dump_fn = $ARGV[0]
or croak('Dump filename must be supplied as an argument.');
-my %STRING;
my $WIKI_LANG;
if ((basename $dump_fn) =~ $DUMP_FN_RE) {
$WIKI_LANG = $+{wiki_lang};
- get_strings($WIKI_LANG);
}
else {
croak("$dump_fn is a weird dump file name.");
}
+get_strings($WIKI_LANG);
+
# XXX - bad i18n
# ISO 9 is mostly good for Russian and it is still not perfect ASCII
my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
@@ -204,14 +200,12 @@
# Constants for date processing
#
-my @MONTHS = @STRING{
- qw(
- January February March
- April May June
- July August September
- October November December
- )
- };
+my @MONTHS = map { get_string($_) } qw(
+ January February March
+ April May June
+ July August September
+ October November December
+);
my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
my %REV_MONTH;
Readonly my $LAST_MONTH => 11;
@@ -1163,52 +1157,16 @@
sub is_disambig {
my ($page) = @_;
+
my $found_templates =
find_templates($page->text(), [], [ get_string('disambig') ]);
+
return scalar @{$found_templates};
}
-#
-# This is poor man's gettext.
-# TODO: Replace it with Locale::Maketext
-#
-sub get_strings {
- my ($lang) = @_;
-
- my $STRINGS_FN = "$lang.strings.txt";
-
- open my $STRINGS_FILE, '<:utf8', $STRINGS_FN
- or croak(file_error('opening', $STRINGS_FN, 'reading'));
- my @strings_file_lines = <$STRINGS_FILE>;
- close $STRINGS_FILE
- or croak(file_error('closing', $STRINGS_FN, 'reading'));
-
- STRING_LINE:
- foreach my $next_string_line (@strings_file_lines) {
-
- # Skip blanks and comments
- next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
-
- chomp $next_string_line;
- my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
-
- # Fallback to English if no target language string was supplied
- $STRING{$english} = $target // $english; # /
- }
-
- return;
-}
-
-sub get_string {
- my ($english, $if_defined) = @_;
- return $if_defined
- ? ($STRING{$english})
- : ($STRING{$english} //= $english); # /
-}
-
sub make_type_fn {
my ($type, $unsorted) = @_;
- $unsorted //= 0; # /
+ $unsorted //= 0; # /
#my $transliterated_type = $TRANSLITERATOR->translit($type);
my $transliterated_type = $type;
@@ -1222,20 +1180,6 @@
return $type_fn;
}
-sub append_to_file {
- my ($fn, $line) = @_;
-
- open my $file, '>>:utf8', $fn
- or croak(file_error('opening', $fn, 'appending'));
-
- say {$file} ($line // q{}); # /
-
- close $file
- or croak(file_error('closing', $fn, 'appeding'));
-
- return;
-}
-
# It appears simple, but in non-alphabetic languages such as Chinese
# it may be different, so it will sit here ready for better i18n.
sub get_sort_letter {
@@ -1243,12 +1187,6 @@
return substr $string, 0, 1;
}
-sub file_error {
- my ($operation, $fn, $access_type) = @_;
- my $string = "error $operation $fn for $access_type: $OS_ERROR";
- return $string;
-}
-
sub format_link_table { ## no critic (RequireArgUnpacking)
return sprintf '%-15s %8d', @_;
}
@@ -1259,7 +1197,7 @@
say "processing $lang_code";
my $lang_reftype = ref $found_links{$lang_code};
if ($lang_reftype ne 'HASH') {
- carp('$lang_code is $lang_reftype, not hashref!');
+ carp("$lang_code is $lang_reftype, not hashref!");
next LANG_CODE;
}
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-25 15:02:34
|
Revision: 86
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=86&view=rev
Author: amire80
Date: 2010-03-25 15:02:28 +0000 (Thu, 25 Mar 2010)
Log Message:
-----------
Update of empty pages handling.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-09 12:44:41 UTC (rev 85)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-25 15:02:28 UTC (rev 86)
@@ -280,9 +280,7 @@
my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
-## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
-## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
# get_string() cannot be used in re
my $string_exclude_lowercase = get_string('exclude_lowercase');
@@ -420,23 +418,9 @@
exit;
sub load_dump {
- Readonly my $WORKING_PMWD_VER => 0.91;
- Readonly my $FUTURE_PMWD_VER => 0.94;
-
my $dump;
- if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
- $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
- }
- else {
- if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
- carp( 'You are running Parse::MediaWikiDump version '
- . $Parse::MediaWikiDump::VERSION
- . ".\n Redirect handling may be broken\n");
- }
- my $pmwd = Parse::MediaWikiDump->new();
- $dump = $pmwd->revisions($dump_fn);
- }
+ $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
return $dump;
}
@@ -471,6 +455,12 @@
my $page_text_ref = $page_ref->text();
+ if (${$page_text_ref} !~ /\S/xmsi
+ and not is_in_namespace($page_ref, 'User', 'User talk'))
+ {
+ special_cases_file('empty_page', $page_ref);
+ }
+
if (
$page_counter < $option{start_from}
or not defined ${$page_text_ref} # must be tested before redirect
@@ -1266,6 +1256,12 @@
sub print_multi_links_by_foreign {
LANG_CODE:
foreach my $lang_code (sort keys %found_links) {
+ say "processing $lang_code";
+ my $lang_reftype = ref $found_links{$lang_code};
+ if ($lang_reftype ne 'HASH') {
+ carp('$lang_code is $lang_reftype, not hashref!');
+ next LANG_CODE;
+ }
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
say format_link_table($lang_code, scalar @foreign_articles);
@@ -1274,6 +1270,14 @@
foreach my $foreign_article (@foreign_articles) {
my @local_articles =
keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+ my $article_reftype =
+ ref $found_links{$lang_code}->{$foreign_article};
+ if ($article_reftype ne 'HASH') {
+ carp("$foreign_article is $article_reftype, not hashref!");
+ next FOREIGN_ARTICLE;
+ }
+
if (scalar @local_articles > 1) {
my $links = join q{ | }, sort map { make_link($_) }
keys %{ $found_links{$lang_code}->{$foreign_article} };
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2010-03-09 12:44:47
|
Revision: 85
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=85&view=rev
Author: amire80
Date: 2010-03-09 12:44:41 +0000 (Tue, 09 Mar 2010)
Log Message:
-----------
Adding config files for ru.wikipedia.
Added Paths:
-----------
trunk/no-interwiki/ru.language_codes.txt
trunk/no-interwiki/ru.strings.txt
Added: trunk/no-interwiki/ru.language_codes.txt
===================================================================
--- trunk/no-interwiki/ru.language_codes.txt (rev 0)
+++ trunk/no-interwiki/ru.language_codes.txt 2010-03-09 12:44:41 UTC (rev 85)
@@ -0,0 +1 @@
+link language_codes.txt
\ No newline at end of file
Property changes on: trunk/no-interwiki/ru.language_codes.txt
___________________________________________________________________
Added: svn:special
+ *
Added: trunk/no-interwiki/ru.strings.txt
===================================================================
--- trunk/no-interwiki/ru.strings.txt (rev 0)
+++ trunk/no-interwiki/ru.strings.txt 2010-03-09 12:44:41 UTC (rev 85)
@@ -0,0 +1,37 @@
+# months
+January января
+February февраля
+March марта
+April апреля
+May мая
+June июня
+July июля
+August августа
+September сентября
+October октября
+November ноября
+December декабря
+
+no_iw Нет интервики
+disambig неоднозначность
+template шаблон
+
+date дата
+type topic
+
+# MW specials
+REDIRECT перенаправление
+
+# Namespaces
+User Участник
+User talk Обсуждение участника
+Image Файл
+Portal Портал
+Category Категория
+article space (статьи)
+
+# Other
+other другое
+rlm {{כ}}
+exclude_lowercase ß
+
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-10-01 11:39:18
|
Revision: 84
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=84&view=rev
Author: amire80
Date: 2009-10-01 11:39:00 +0000 (Thu, 01 Oct 2009)
Log Message:
-----------
Starting the big refactoring.
Added Paths:
-----------
trunk/no-interwiki/Maintenance.pm
Added: trunk/no-interwiki/Maintenance.pm
===================================================================
--- trunk/no-interwiki/Maintenance.pm (rev 0)
+++ trunk/no-interwiki/Maintenance.pm 2009-10-01 11:39:00 UTC (rev 84)
@@ -0,0 +1,1755 @@
+package Maintenance;
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
+use 5.010;
+
+# pragmata
+use strict;
+use warnings;
+use integer; # Improves performance
+use open ':utf8';
+use utf8;
+use charnames ':full';
+
+# Standard library
+# These modules should come installed with Perl
+use English qw(-no_match_vars);
+use Carp qw(croak cluck);
+# use Time::Local;
+# use Data::Dumper;
+# use File::Basename;
+
+# CPAN
+# You must install these modules from CPAN
+
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
+use Readonly;
+# use Pod::Usage;
+# use Regexp::Common;
+# use Lingua::Translit;
+# use Log::Log4perl qw(:easy);
+use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+
+our $VERSION = '0.0.1';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+ Revision => '$Revision: 83 $',
+ HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/prepare_noiw_list.pl $',
+ Date => '$Date: 2009-09-29 15:25:11 +0200 (Tue, 29 Sep 2009) $',
+);
+#>>>
+
+my $LANG_CODE;
+my %STRINGS;
+my %LANG_CODE;
+my $WIKI_LANG;
+
+sub init {
+ my (%args) = @_;
+
+ if ((basename $args{dump_fn}) =~ $DUMP_FN_RE) {
+ $WIKI_LANG = $+{wiki_lang};
+ load_strings($WIKI_LANG);
+ }
+ else {
+ croak("$dump_fn is a weird dump file name.");
+ }
+
+ my $LANG_CODE_FN = "$wiki_lang.language_codes.txt";
+ open my $lang_code_file, '<', $LANG_CODE_FN
+ or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
+ while (my $line = <$lang_code_file>) {
+ chomp $line;
+ my ($code, $name) = split /\t/xms, $line;
+ if (defined $code) {
+ $LANG_CODE{$code} = $name // $code; # /
+ }
+ }
+ close $lang_code_file
+ or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
+
+ init_logger('outtest.log');
+
+ # TODO: Make smarter, configurable, whatever
+ # $OUT_DIR must be first, because it's the parent
+ foreach my $out_dir ($OUT_DIR, $UNSORTED_DIR, $MULTI_DIR) {
+ if (-d $out_dir) {
+ unlink glob "$out_dir/*$WIKITEXT_EXT";
+ }
+ else {
+ mkdir $out_dir;
+ }
+ }
+
+ return;
+}
+
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $ALT_SEP = q{|};
+my $FIELD_SEP = qq{\t};
+my $LINK_SEP = q{|};
+my $TYPE_SEP = qr/\s*;\s*/xms;
+my $HTML_RLM = '‏';
+
+Readonly our $DEFAULT_PAGE_FREQ => 1000;
+Readonly our $DEFAULT_MAX_LINKS_PER_SECTION => 100;
+Readonly our $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
+
+sub init_logger {
+ my ($log_fn) = @_;
+ Log::Log4perl->easy_init(
+ {
+ level => $INFO, # print everything
+ file => ":utf8>$log_fn", # utf is important
+ layout => '%m%n', # No need to print the date
+ }
+ );
+ return;
+}
+
+# XXX Too coupled to Wikipedia, won't work for other projects.
+our $DUMP_FN_RE = qr{
+ \A # Begin string
+ (?<wiki_lang>\w+) # Lang code
+ wiki # Is supposed to be after the lang code
+}xms;
+
+our $FIELD_SEP_RE = qr{\t}xms;
+
+our $STRING_SKIP_RE = qr{
+ \A # Begin string
+ \s* # Zero or more spaces
+ (?:\#.*)? # Comment lines
+ \z # End string
+}xms;
+
+my %STRING;
+
+# XXX - bad i18n
+# ISO 9 is mostly good for Russian and it is still not perfect ASCII
+# my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
+
+our %MW_SYNTAX = (
+ 'start_template' => '{{',
+ 'end_template' => '}}',
+ 'start_link' => '[[',
+ 'end_link' => ']]',
+ 'param_sep' => q{|},
+ 'paragraph' => qq{\n\n},
+ 'start_wikitable' => '{|',
+ 'namespace_sep' => q{:},
+);
+
+# This monstrosity basically says: | and optional spaces
+our $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+
+#
+# Constants for date processing
+#
+
+our @MONTHS = @STRING{
+ qw(
+ January February March
+ April May June
+ July August September
+ October November December
+ )
+ };
+my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
+my %REV_MONTH;
+Readonly my $LAST_MONTH => 11;
+@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
+
+# XXX Internationalize
+my $string_in = get_string('in');
+our $HEB_DATE_RE = qr{
+ \A # begin
+ (?<hour>\d{1,2}) # hour
+ : # :
+ (?<min>\d{2}),\s # minute
+ (?<mday>\d{1,2})\s # day of month
+ $string_in? # This preposition appears sometimes
+ (?<mon>$ALT_MONTHS)\s # A name of a month
+ (?<year>\d+?)\s # Year
+ \([A-Z]{3}\) # Three letters in brackets - timezone
+ \z # end
+}xms;
+
+our $PARAM_RE = qr{
+ \A # Beginning of a string
+ \s* # Zero or more space
+ (?: # No capture
+ (?<param_name>\w+) # Word chars - param name
+ \s* # Zero or more space
+ = # =
+ )? # Param name is optional
+ (?<value>.*) # value
+}xms;
+
+# A simplistic template just for testing.
+# Quite possibly it is not needed anymore.
+# Until i get a better regex for matching balancing {{}} ...
+our $TEMPLATE_RE = qr{
+ \A # beginning of string
+ \Q$MW_SYNTAX{start_template}\E # {{
+ .+ # some chars
+ \Q$MW_SYNTAX{end_template}\E # }}
+ \z # end of string
+}xms;
+
+our $WIKITABLE_RE = qr{
+ \A
+ \Q$MW_SYNTAX{start_wikitable}\E
+}xms;
+
+# Redirect in local language.
+# Regular expression mostly copied from
+# Parse::MediaWikiDump::page::redirect
+# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
+my $LOCAL_REDIRECT_RE = qr{
+ \A # Beginning of string (page)
+ \# # a # character
+ $local_redirect # Redirect keyword in local language
+ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
+}xmsi;
+
+my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation
+my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
+
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
+
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+my $EXCLUDE_LOWERCASE_RE = qr{
+ \A # Beginning of foreign article name
+ [$string_exclude_lowercase] # Character class of possibly lowercase chars
+}xms;
+my $NUMBERED_FILE_RE = qr{
+ (?: _ \d*)?
+ \.$WIKITEXT_EXT
+}xms;
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # "
+
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
+ (?:
+ [%.] # There are both %C4%B0 and .AA.E0
+ [[:xdigit:]]{2} # 2 hex digits
+ )
+}xms;
+
+my $HTML_CHARNUMBER_RE = qr{
+ (?:
+ &\#\d+; # stuff like Š
+ )
+}xms;
+
+# TODO: Check whether it is Neapolitan with its ''
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+ $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
+}xms;
+
+sub load_lang_codes {
+ my ($wiki_lang) = @_;
+
+}
+
+my $string_no_iw = get_string('no_iw');
+my $SIMPLE_NO_IW_CHECK_RE = qr{
+ \Q$string_no_iw\E # The string may have spaces
+}xmsi;
+
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
+
+my $INTERWIKI_LINK_RE = qr{
+ \Q$MW_SYNTAX{start_link}\E
+ (?<lang_code>
+ $ALT_LANGS
+ )
+ :
+ (?<foreign_article>
+ .+?
+ )
+ \Q$MW_SYNTAX{end_link}\E
+}xms;
+
+# Lojban allows lowercase articles
+my @LOWERCASE_LANGS = qw(jbo);
+
+our $the_dump = load_dump();
+
+my $namespaces_alt = join $ALT_SEP,
+ grep { length > 0 } @{ $the_dump->namespaces_names() };
+my $PURE_TITLE_RE = qr{
+ \A
+ (?:
+ (?:$namespaces_alt)
+ $MW_SYNTAX{namespace_sep}
+ )? # Namespace name is optional
+ (?<pure_title>.+)
+ \z
+}xms;
+
+our %statistics = (count_iw => []);
+
+my $begin_time = time;
+process_dump();
+my $total_time = time - $begin_time;
+
+print_stats();
+
+create_no_iw_pages();
+
+INFO(q{});
+
+# my @found_lang_codes = sort keys %found_links;
+# INFO("found lang_codes: @found_lang_codes");
+
+my $begin_multi_links_time = time;
+
+say "\nlisting multi links by language";
+print_multi_links_by_foreign();
+
+say "\nlisting multi links by local articles";
+print_multi_links_by_local();
+
+my $total_multi_links_time = time - $begin_multi_links_time;
+INFO("total multi links time: $total_multi_links_time");
+
+exit;
+
+sub load_dump {
+ Readonly my $WORKING_PMWD_VER => 0.91;
+ Readonly my $FUTURE_PMWD_VER => 0.94;
+
+ my $dump;
+
+ if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
+ $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+ }
+ else {
+ if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
+ carp( 'You are running Parse::MediaWikiDump version '
+ . $Parse::MediaWikiDump::VERSION
+ . ".\n Redirect handling may be broken\n");
+ }
+ my $pmwd = Parse::MediaWikiDump->new();
+ $dump = $pmwd->revisions($dump_fn);
+ }
+
+ # This is the only currently known value
+ # but there could be more in the future
+ if ($dump->case() ne 'first-letter') {
+ croak(q{unable to handle any case setting besides 'first-letter'});
+ }
+
+ return $dump;
+}
+
+sub namespace {
+ my ($page) = @_;
+ return $page->namespace() || get_string('article space');
+}
+
+sub next_page {
+ my ($dump) = @_;
+ my $page_ref = eval { $dump->next(); };
+ if ($EVAL_ERROR) {
+ confess("Failed reading a page: $EVAL_ERROR");
+ }
+ return $page_ref;
+}
+
+sub process_dump {
+ PAGE:
+ while (my $page_ref = next_page($the_dump)) {
+ $page_counter++;
+ if ($page_counter % $option{page_freq} == 0) {
+ say $page_counter;
+ }
+
+ if ( $option{stop_after}
+ and $page_counter > $option{stop_after})
+ {
+ last PAGE;
+ }
+
+ my $page_text_ref = $page_ref->text();
+
+ if (
+ $page_counter < $option{start_from}
+ or not defined ${$page_text_ref} # must be tested before redirect
+ or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+ or is_redirect($page_ref)
+ )
+ {
+ next PAGE;
+ }
+
+ my $page_namespace = namespace($page_ref);
+ $namespace_count{$page_namespace}++;
+
+ my $page_title = $page_ref->title();
+
+ INFO("\n* processing $page_counter - ", $page_title);
+
+ # TODO: Be more precise here.
+ # Portal pages which have a '/' in their name are probably
+ # internal and do not need interwiki links.
+ if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+ INFO('internal portal, skipping');
+ $statistics{'internal portal'}++;
+ next PAGE;
+ }
+
+ # A simple sanity check: is the no_iw template anywhere around here?
+ my $has_template_no_iw =
+ (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+
+ # Does the page have interwiki links?
+ # BIG XXX Actually checks only for English
+ my $has_iw = has_interwiki($page_ref);
+
+ if ($has_iw) {
+ INFO("has link to $has_iw");
+ if ($has_template_no_iw) {
+ INFO('has template no_iw. trying to remove ...');
+ remove_template_no_iw($page_text_ref);
+ $statistics{'has both valid interwiki and template'}++;
+ special_cases_file('outdated_template', $page_ref);
+ }
+ }
+ else { # does not have iw
+ process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
+ }
+ }
+
+ return;
+}
+
+sub process_iwless_page {
+ my (
+ $page_ref, # object ref
+ $has_template_no_iw, # scalar bool
+ $has_iw # scalar bool
+ ) = @_;
+
+ INFO(q(does not have iw link.));
+ $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+
+ # Now we need to search for no_iw templates
+ # and parse their parameters - date and type
+
+ my @found_templates = ();
+
+ my $page_text_ref = $page_ref->text();
+
+ # Optimized - does not start searching,
+ # if we already know that it is not there
+ if ($has_template_no_iw) {
+ find_templates($page_text_ref, \@found_templates,
+ [ get_string('no_iw') ]);
+ }
+
+ my $found_templates_count = scalar @found_templates;
+ INFO("found templates: $found_templates_count");
+
+ my $template;
+ if ($found_templates_count) {
+
+ # Assume the first one is the right one
+ $template = $found_templates[0];
+ if ($found_templates_count > 1) {
+ WARN('many templates were found');
+ $statistics{'many templates'}++;
+ special_cases_file('many_templates', $page_ref);
+ }
+ else {
+ INFO('good, found one template');
+ $statistics{'one template'}++;
+ }
+ }
+ else {
+ INFO('no templates found');
+ $statistics{'no templates found'}++;
+ }
+
+ if (defined $template) {
+ INFO('has template no_iw');
+ my $date_str = $template->{params}->{date};
+ if (defined $date_str) {
+ INFO('checking cooling date');
+ my $date_ref = parse_date($date_str);
+ if (not defined $date_ref) {
+ INFO("invalid date: '$date_str'");
+ $statistics{'invalid date'}++;
+ special_cases_file('invalid_date', $page_ref);
+ }
+ elsif (cooling_date_passed($date_ref)) {
+ INFO('cooling date passed, updating to today ...');
+ update_cooling_date($page_text_ref);
+ $statistics{'cooling date passed'}++;
+ }
+ else {
+ INFO(q(cooling date did not pass.));
+ $statistics{q(cooling date did not pass)}++;
+ }
+ }
+ else {
+ INFO('date not defined');
+ }
+
+ }
+
+ my @all_types = get_all_types($template->{params}->{type}, $page_ref);
+
+ foreach my $type (@all_types) {
+ INFO('adding ' . $page_ref->title() . " to the list as type $type");
+ add_to_no_iw_list($page_ref, $type);
+ $type_count{$type}++;
+ }
+
+ return 1;
+}
+
+sub get_all_types {
+ my ($type_param, $page) = @_;
+
+ $type_param //= q{}; # /
+ strip_whitespace($type_param);
+
+ my @all_types = split $TYPE_SEP, $type_param;
+
+ my $page_title = $page->title();
+ if (is_category($page)) {
+ INFO("$page_title is a category");
+ push @all_types, get_string('Category');
+ $statistics{'categories'}++;
+ }
+
+ if (is_in_namespace($page, 'Portal')) {
+ INFO("$page_title is a portal");
+ push @all_types, get_string('Portal');
+ $statistics{'portal'}++;
+ }
+
+ if (is_disambig($page)) {
+ INFO("$page_title is a disambiguation");
+ push @all_types, get_string('disambig');
+ $statistics{'disambig'}++;
+ }
+
+ # Still nothing?
+ if (not scalar @all_types) {
+ my $other_type = get_string('other');
+ INFO("$page_title does not have any type, adding to $other_type");
+ @all_types = ($other_type);
+ $statistics{'automatically added to type other'}++;
+ }
+
+ return @all_types;
+}
+
+sub find_templates {
+ my (
+ $text_ref, # string ref
+ $found_templates_ref, # array ref
+ $filter # string array ref
+ ) = @_;
+
+ # A reference to an array with one empty string.
+ # Matching against an empty string will always succeed.
+ $filter //= [q{}]; # /
+
+ # Get all highest-level matches
+ my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
+
+ MATCH:
+ foreach my $next_match (@matches) {
+ if ($next_match !~ $TEMPLATE_RE) {
+ INFO(q(i thought that it is a template, but it was:));
+ if ($next_match =~ $WIKITABLE_RE) {
+ INFO('a wikitable');
+ }
+ else {
+ INFO("something else: $next_match");
+ }
+ INFO(q{});
+ next MATCH;
+ }
+
+ foreach my $next_filter (@{$filter}) {
+
+ # N.B. - case-insensitive. Wrong, but kinda useful.
+ if ($next_match =~
+ /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
+ {
+
+ # N.B.: parse_template calls find_templates() recursively
+ my $parsed_template =
+ parse_template(\$next_match, [qw(date type)], $filter);
+ push @{$found_templates_ref}, $parsed_template;
+ }
+ }
+ }
+
+ return $found_templates_ref;
+}
+
+sub parse_template {
+ my (
+ $template, # string ref
+ $default_param_names, # string array ref
+ $subtemplate_filter, # string array ref
+ ) = @_;
+
+ # %parsed_template:
+ # {text} - string ref; might be big
+ # {name} - string
+ # {params} - hash ref
+ # {subtemplates} - array ref
+ my (%parsed_template, %parsed_params, @clauses);
+
+ $parsed_template{text} = strip_template_curlies(${$template});
+
+ # First string of the split is the template name,
+ # the rest is the params
+ ($parsed_template{name}, @clauses) =
+ (split $PARAM_SEP_RE, ${ $parsed_template{text} });
+
+ my $param_counter = 0;
+ foreach my $clause (@clauses) {
+ if ($clause =~ $PARAM_RE) {
+ #<<< no perltidy
+ my ($name, $value) = @+{ qw(param_name value) };
+ #>>>
+ if (defined $name) {
+ if (defined $param_name{$name}) {
+ $name = $param_name{$name};
+ }
+ }
+ else {
+
+ # Get next default param name.
+ # If a default name is not defined, just use the number
+ $name = $default_param_names->[$param_counter]
+ || $param_counter;
+ }
+ $parsed_params{$name} = $value;
+ }
+ else {
+ my $error_msg = "Weird - $clause does not look like a param";
+ INFO($error_msg);
+ cluck($error_msg);
+ $statistics{'weird param'}++;
+ }
+ $param_counter++;
+ }
+ $parsed_template{params} = \%parsed_params;
+
+ # Possible recursion
+ find_templates($parsed_template{text}, $parsed_template{subtemplates},
+ $subtemplate_filter);
+
+ return \%parsed_template;
+}
+
+sub parse_date {
+ my ($date_str) = @_;
+
+ return if (not defined $date_str);
+
+ if ($date_str =~ $HEB_DATE_RE) {
+ INFO("found a valid date: $date_str");
+ my %parsed_date = (
+ 'sec' => 0, # useful for timelocal
+ );
+ foreach my $date_part (qw(hour min mday mon year)) {
+ $parsed_date{$date_part} = $+{$date_part};
+ }
+ $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} };
+
+ # strip leading zeros
+ foreach (values %parsed_date) {
+ s{
+ \A
+ 0+
+ (?<number>\d+)
+ }
+ {$+{number}}xms;
+ }
+ return \%parsed_date;
+ }
+
+ # Returns undef for an invalid date
+ return;
+}
+
+sub strip_template_curlies {
+ my ($template) = @_;
+ for ($template) {
+ s{
+ \A
+ \Q$MW_SYNTAX{start_template}\E
+ }{}xms;
+ s{
+ \Q$MW_SYNTAX{end_template}\E
+ \z
+ }{}xms;
+ }
+ return \$template;
+}
+
+# no arg unpacking for simplicity and performance
+sub strip_whitespace { ## no critic (RequireArgUnpacking)
+ for (@_) {
+ s/\A\s*//xms;
+ s/\s*\z//xms;
+ }
+ return;
+}
+
+sub has_interwiki {
+ my ($page) = @_;
+
+ my $page_title = $page->title();
+ my $page_text = ${ $page->text() }; # XXX
+
+ study $page_text; # XXX
+
+ my %iw_links;
+ my %special_cases;
+
+ while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX
+ my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)};
+ if (defined $iw_links{$lang_code}) {
+ $special_cases{double_links}->{$lang_code} = q{};
+ }
+ else {
+ $iw_links{$lang_code} = $foreign_article;
+ }
+
+ # A # sign not after an &.
+ # After an & it is probably a character number.
+ if ($foreign_article =~ $SECTION_LINK_RE) {
+ $special_cases{section_links}->{$lang_code} = q{};
+ }
+
+ # Char codes are common in section links, so there is no
+ # need to show them again
+ elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+ $special_cases{charnumber_links}->{$lang_code} = q{};
+ }
+
+ # Lowercase links
+ if ( (not $lang_code ~~ @LOWERCASE_LANGS)
+ and ($foreign_article =~ $LOWERCASE_LINK_RE))
+ {
+ my $include_lowercase_link = 1;
+
+ if (defined get_string('exclude_lowercase', 'if defined')
+ and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
+ {
+ $include_lowercase_link = 0;
+ }
+
+ if ($include_lowercase_link) {
+ $special_cases{lowercase_links}->{$lang_code} = q{};
+ }
+ }
+
+ $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{};
+ }
+
+ my @all_langs = keys %iw_links;
+ my $count_iw = scalar @all_langs;
+ if ($count_iw) {
+ if (not defined $statistics{count_iw}->[$count_iw]) {
+ $statistics{count_iw}->[$count_iw] = [];
+ }
+ push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
+ }
+ INFO("iw link count for $page_title is: $count_iw");
+
+ for my $special_case_name (keys %special_cases) {
+ if (scalar %{ $special_cases{$special_case_name} }) {
+ special_cases_file($special_case_name, $page,
+ $special_cases{$special_case_name});
+ }
+ }
+
+ # BIG XXX Still very stupid, but getting better
+ if (defined $iw_links{en}) {
+ return 'en';
+ }
+
+ return q{};
+}
+
+sub special_cases_file {
+ my ($special_case_name, $page, $special_cases_ref) = @_;
+ $special_cases_ref //= {}; # /
+
+ my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
+ if ($special_case_langs) {
+ $special_case_langs = " ($special_case_langs)";
+ }
+
+ my $special_case_fn = make_type_fn($special_case_name, 1);
+
+ if (not -e $special_case_fn) {
+ append_to_file($special_case_fn, $special_case_name);
+ }
+
+ my $page_title = $page->title();
+ my $link = make_link($page_title);
+ my $line =
+ $link
+ . $special_case_langs
+ . $FIELD_SEP
+ . get_sort_title($page_title);
+
+ append_to_file($special_case_fn, $line);
+
+ return;
+}
+
+sub remove_template_no_iw {
+ my ($params) = @_;
+ INFO( "Supposed to remove the no_iw template now, but ...\n"
+ . 'This sub is a stub. You can help Wikipedia by expanding it!');
+
+ return 0;
+}
+
+sub cooling_date_passed {
+ my ($date_ref) = @_;
+
+ my @page_times = @{$date_ref}{qw(sec min hour mday mon year)};
+ INFO("page times: @page_times");
+
+ my $pageseconds = timelocal(@page_times);
+ return $pageseconds < $LATEST_COOLING;
+}
+
+sub update_cooling_date {
+ my ($params) = @_;
+
+ INFO( "Supposed to update cooling date now, but ...\n"
+ . "This sub is a stub. You can help Wikipedia by expanding it!\n"
+ );
+
+ return 0;
+}
+
+# Just a debugging thingie
+sub print_parsed_template {
+ my ($template_ref) = @_; # hash ref
+
+ INFO('text:');
+ INFO(${ $template_ref->{text} });
+ INFO('name:');
+ INFO($template_ref->{name});
+ INFO('params:');
+ foreach my $next_param (sort keys %{ $template_ref->{params} }) {
+ INFO("$next_param: $template_ref->{params}->{$next_param}");
+ }
+
+ if (defined $template_ref->{subtemplates}) {
+ INFO("subtemplates: $template_ref->{subtemplates}");
+ }
+
+ return;
+}
+
+sub get_sort_title {
+ my ($page_title) = @_;
+
+ my $sort_title;
+ if ($page_title =~ $PURE_TITLE_RE) {
+ $sort_title = $+{pure_title};
+ }
+ else {
+ INFO( 'Something badly weird happened - '
+ . "$page_title has no pure title");
+ croak('A page has no pure title');
+ }
+
+ return $sort_title;
+}
+
+sub add_to_no_iw_list {
+ my (
+ $page, # ref
+ $page_type, # string
+ ) = @_;
+
+ my $page_title = $page->title();
+ INFO("Adding $page_title to no_iw_list");
+
+ my $sort_title = get_sort_title($page_title);
+ INFO("full title: $page_title");
+ INFO("sort title: $sort_title");
+
+ my $unsorted_type_fn = make_type_fn($page_type, 1);
+ if (not -e $unsorted_type_fn) {
+ append_to_file($unsorted_type_fn, $page_type);
+ }
+ my $link = make_link($page_title);
+ INFO("link to page: $link");
+ my $line = $link . $FIELD_SEP . $sort_title;
+
+ append_to_file($unsorted_type_fn, $line);
+
+ return;
+}
+
+sub make_link {
+ my ($page_title) = @_;
+
+ my $link_to_page =
+ $MW_SYNTAX{start_link}
+ . $MW_SYNTAX{namespace_sep}
+ . $page_title
+ . $MW_SYNTAX{end_link};
+
+ if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) {
+ $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM;
+ }
+
+ return $link_to_page;
+}
+
+sub create_no_iw_pages {
+ my ($params) = @_;
+
+ INFO("\ncreating no_iw pages");
+
+ # Run over page types
+ UNSORTED_TYPE_FN:
+ foreach my $unsorted_type_fn (glob "$UNSORTED_DIR/*") {
+ my %all_pages_in_type = ();
+ open my $unsorted_type_file, '<', $unsorted_type_fn
+ or croak(file_error('opening', $unsorted_type_fn, 'reading'));
+ my @lines = <$unsorted_type_file>;
+ close $unsorted_type_file
+ or croak(file_error('closing', $unsorted_type_fn, 'reading'));
+
+ my $type_name = shift @lines;
+ chomp $type_name;
+ foreach my $line (@lines) {
+ chomp $line;
+ my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line;
+ my $sort_letter = get_sort_letter($sort_title);
+ $all_pages_in_type{$sort_letter} //= []; # /
+ push @{ $all_pages_in_type{$sort_letter} }, $page_title;
+ }
+ write_sorted_pages($type_name, \%all_pages_in_type);
+ }
+ return;
+}
+
+sub write_sorted_pages {
+ my ($type_name, $type_tree_ref) = @_;
+
+ my $type_fn = make_type_fn($type_name);
+
+ my $section_counter = 0;
+ my $page = q{};
+ my $file_number = 1;
+
+ LETTER:
+ foreach my $next_letter (sort keys %{$type_tree_ref}) {
+ my @all_links_in_letter =
+ sort @{ $type_tree_ref->{$next_letter} };
+
+ my $links_count = scalar @all_links_in_letter;
+ my $sections =
+ ($links_count - 1) / $option{max_links_per_section} + 1;
+ my $links_per_section = $links_count / $sections;
+
+ SECTION:
+ foreach my $section_num (1 .. $sections) {
+ if ($section_counter == $option{max_sections_per_page}) {
+ write_page(\$page, \$type_fn, $file_number++);
+ $section_counter = 0;
+
+ $page = q{};
+ }
+ elsif ($section_counter) {
+ $page .= $MW_SYNTAX{paragraph};
+ }
+ $section_counter++;
+ my $title = $next_letter;
+ if ($sections > 1) {
+ $title .= " $section_num";
+ }
+ $page .= mw_heading($title);
+
+ my $first_link = ($section_num - 1) * $links_per_section;
+ my $last_link =
+ ($section_num == $sections)
+ ? $links_count - 1
+ : $first_link + $links_per_section - 1;
+ my $links = join_links(
+ [ @all_links_in_letter[ $first_link .. $last_link ] ]);
+ $page .= $links;
+ }
+ }
+
+ # The page may be empty at this point
+ if ($page) {
+ write_page(\$page, \$type_fn, $file_number++);
+ }
+
+ return;
+}
+
+sub write_page {
+ my ($page_ref, $type_fn_ref, $file_number) = @_;
+
+ my $pretty_file_number = sprintf '%03d', $file_number;
+ ${$type_fn_ref} =~ s{
+ $NUMBERED_FILE_RE
+ }
+ {_$pretty_file_number.$WIKITEXT_EXT}xmso;
+ INFO("creating file ${$type_fn_ref}");
+ append_to_file(${$type_fn_ref}, ${$page_ref});
+
+ return;
+}
+
+sub mw_heading {
+ my (
+ $text, # string
+ $level # number
+ ) = @_;
+
+ $level //= 2; # /
+ my $level_marker = q{=} x $level;
+
+ # Line ending is mandatory
+ return "$level_marker $text $level_marker\n";
+}
+
+sub mw_bold {
+ my ($text) = @_;
+ return "'''$text'''";
+}
+
+# Custom Unicode character property, which is like \w, but for Hebrew.
+# The custom is to give custom Unicode character classes CamelCase names.
+# P::C policy ProhibitMixedCaseSubs is deprecated.
+sub IsRightToLeft { ## no critic (Capitalization)
+ return <<'END';
++utf8::InHebrew
++utf8::IsSpace
++utf8::IsPunct
+END
+}
+
+sub is_redirect {
+ my ($page) = @_;
+
+ if ($page->redirect()) {
+ return 'English';
+ }
+
+ my $page_text_ref = $page->text();
+
+ if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+ return 'local';
+ }
+
+ return q();
+}
+
+sub is_in_namespace {
+ my ($page, @namespaces) = @_;
+
+ return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+}
+
+sub is_category {
+ my ($page) = @_;
+ return is_in_namespace($page, 'Category');
+}
+
+sub is_disambig {
+ my ($page) = @_;
+ my $found_templates =
+ find_templates($page->text(), [], [ get_string('disambig') ]);
+ return scalar @{$found_templates};
+}
+
+#
+# This is poor man's gettext.
+# TODO: Replace it with Locale::Maketext
+#
+sub get_strings {
+ my ($lang) = @_;
+
+ my $STRINGS_FN = "$lang.strings.txt";
+
+ open my $STRINGS_FILE, '<:utf8', $STRINGS_FN
+ or croak(file_error('opening', $STRINGS_FN, 'reading'));
+ my @strings_file_lines = <$STRINGS_FILE>;
+ close $STRINGS_FILE
+ or croak(file_error('closing', $STRINGS_FN, 'reading'));
+
+ STRING_LINE:
+ foreach my $next_string_line (@strings_file_lines) {
+
+ # Skip blanks and comments
+ next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
+
+ chomp $next_string_line;
+ my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
+
+ # Fallback to English if no target language string was supplied
+ $STRING{$english} = $target // $english; # /
+ }
+
+ return;
+}
+
+sub get_string {
+ my ($english, $if_defined) = @_;
+ return $if_defined
+ ? ($STRING{$english})
+ : ($STRING{$english} //= $english); # /
+}
+
+sub make_type_fn {
+ my ($type, $unsorted) = @_;
+ $unsorted //= 0; # /
+
+ #my $transliterated_type = $TRANSLITERATOR->translit($type);
+ my $transliterated_type = $type;
+
+ my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
+
+ $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
+ my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
+ $type_fn = "$dir/$type_fn";
+
+ return $type_fn;
+}
+
+sub append_to_file {
+ my ($fn, $line) = @_;
+
+ open my $file, '>>:utf8', $fn
+ or croak(file_error('opening', $fn, 'appending'));
+
+ say {$file} ($line // q{}); # /
+
+ close $file
+        or croak(file_error('closing', $fn, 'appending'));
+
+ return;
+}
+
+# It appears simple, but in non-alphabetic languages such as Chinese
+# it may be different, so it will sit here ready for better i18n.
+sub get_sort_letter {
+ my ($string) = @_;
+ return substr $string, 0, 1;
+}
+
+sub file_error {
+ my ($operation, $fn, $access_type) = @_;
+ my $string = "error $operation $fn for $access_type: $OS_ERROR";
+ return $string;
+}
+
+sub format_link_table { ## no critic (RequireArgUnpacking)
+ return sprintf '%-15s %8d', @_;
+}
+
+sub print_multi_links_by_foreign {
+ LANG_CODE:
+ foreach my $lang_code (sort keys %found_links) {
+ my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+ my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say format_link_table($lang_code, scalar @foreign_articles);
+
+ FOREIGN_ARTICLE:
+ foreach my $foreign_article (@foreign_articles) {
+ my @local_articles =
+ keys %{ $found_links{$lang_code}->{$foreign_article} };
+ if (scalar @local_articles > 1) {
+ my $links = join q{ | }, sort map { make_link($_) }
+ keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+ my $foreign_title =
+ make_link($lang_code
+ . $MW_SYNTAX{namespace_sep}
+ . $foreign_article);
+ append_to_file($filename, "* '''$foreign_title''' - $links");
+ }
+ }
+ }
+
+ return;
+}
+
+sub print_multi_links_by_local {
+ my %local_multi_links;
+ LANG_CODE:
+ foreach my $lang_code (sort keys %found_links) {
+ my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say format_link_table($lang_code, scalar @foreign_articles);
+
+ FOREIGN_ARTICLE:
+ foreach my $foreign_article (@foreign_articles) {
+ my @local_articles =
+ keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+ if (scalar @local_articles > 1) {
+ add_local_multi(
+ \%local_multi_links,
+ make_link(
+ $lang_code
+ . $MW_SYNTAX{namespace_sep}
+ . $foreign_article
+ ),
+ @local_articles
+ );
+ }
+ }
+ }
+
+ say 'writing local multilinks file';
+ my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
+ foreach my $local_multi_article (sort keys %local_multi_links) {
+ append_to_file($filename,
+ '* ' . mw_bold(make_link($local_multi_article)));
+
+ foreach my $other_local_article (
+ sort keys %{ $local_multi_links{$local_multi_article} })
+ {
+ append_to_file($filename,
+ '** ' . make_link($other_local_article));
+ my $foreign_articles = join_links(
+ $local_multi_links{$local_multi_article}
+ ->{$other_local_article},
+ 0
+ );
+ append_to_file($filename, "*** $foreign_articles");
+ }
+ append_to_file($filename);
+ }
+
+ return;
+}
+
+sub add_local_multi {
+ my (
+ $local_multi_links_ref, $foreign_link,
+ $first_local_article, @other_local_articles
+ ) = @_;
+
+ $local_multi_links_ref->{$first_local_article} //= {}; # /
+
+ foreach my $other_local_article (@other_local_articles) {
+ $local_multi_links_ref->{$first_local_article}
+ ->{$other_local_article} //= []; # /
+ push @{ $local_multi_links_ref->{$first_local_article}
+ ->{$other_local_article} }, $foreign_link;
+ }
+
+ return;
+}
+
+sub join_links {
+ my ($links_ref, $line_end) = @_;
+ $line_end //= 1; # /
+
+ my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ });
+ return join $link_sep, @{$links_ref};
+}
+
+sub print_stats {
+ INFO("\nSUMMARY");
+ INFO("total time: $total_time");
+ foreach my $stat_type (sort keys %statistics) {
+ if (not ref $statistics{$stat_type}) {
+ INFO("$stat_type: $statistics{$stat_type}");
+ }
+ }
+
+ my $max_iw_index = $#{ $statistics{count_iw} };
+ INFO("max_iw_index: $max_iw_index");
+ MAX_IW:
+ for my $max_iw_place (0 .. $option{max_iw_places}) {
+ my @links =
+ map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+ or last MAX_IW;
+ INFO("# $max_iw_index: " . join_links(\@links, 0));
+
+ # Do nothing, just count down to the next index with a defined list.
+ # $max_iw_index needs to be checked for nonzero-ness for the rare case
+ # of a very low page count.
+ while ($max_iw_index
+ and not defined $statistics{count_iw}->[ --$max_iw_index ])
+ {
+ }
+ }
+
+ INFO('pages without interwiki links per namespace');
+ foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
+ my $iwless_in_namespace =
+ $statistics{'has no interwiki link'}->{$namespace};
+
+ no integer;
+ ## no critic (ProhibitMagicNumbers)
+ my $percentage = sprintf '%.2f',
+ 100 * $iwless_in_namespace / $namespace_count{$namespace};
+ ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
+ use integer;
+
+ INFO("$namespace: $iwless_in_namespace, $percentage%");
+ }
+
+ INFO("\nNAMESPACES");
+ foreach my $namespace (sort keys %namespace_count) {
+ INFO("$namespace: $namespace_count{$namespace}");
+ }
+
+ INFO("\nTYPES");
+ foreach my $type (sort keys %type_count) {
+ INFO("$type: $type_count{$type}");
+ }
+
+ my $dump_size = -s $dump_fn;
+ my $log_size = -s $LOG_FN;
+    INFO("dump size: $dump_size");
+    INFO("log size: $log_size");
+ if ($log_size > 0) {
+ my $dump_log_ratio = $dump_size / $log_size;
+ INFO("dump/log ratio: $dump_log_ratio");
+ }
+ else {
+ WARN("weird: log file size $LOG_FN is 0");
+ }
+
+ return;
+}
+
+__END__
+
+=head1 NAME
+
+C<prepare_noiw_list.pl> - Searches a MediaWiki dump for pages without
+interlanguage links and prepares categorized lists of those pages.
+
+=head1 VERSION
+
+This documentation refers to
+
+prepare_noiw_list.pl
+
+version 0.2.1 - Noa.
+
+=head1 USAGE
+
+=over
+
+=item * C<prepare_noiw_list.pl ./big-files/ruwiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --rtl ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --stop_after=20000 ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=back
+
+=head1 REQUIRED ARGUMENTS
+
+=over
+
+=item * MediaWiki dump file name is required
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item * --rtl makes special fixes for right-to-left languages.
+
+=item * --stop_after=NUMBER Stops processing after page with the given
+NUMBER.
+
+=item * --start_from=NUMBER Begins processing after page with the given
+NUMBER.
+
+=item * --page_freq=NUMBER Print the page counter every NUMBER of
+pages. 1000 by default.
+
+=item * --max_links_per_section Maximum number of links per section in
+the output page. Default is 100.
+
+=item * --max_sections_per_page Maximum number of sections per output
+page. Default is 20.
+
+=item * --max_iw_places Number of places to print in the statistics of
+pages with the most interlanguage links.
+
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
+=back
+
+=head1 DESCRIPTION
+
+The main goal of this program is to find pages which do not have
+interwiki (interlanguage) links to certain languages.
+
+This program scans a MediaWiki XML dump file. It searches every page for
+interwiki links and for a special hidden template, which defines to which
+types of interwiki-less pages it belongs.
+
+=over
+
+=item * If the page contains links to the defined languages and does not
+contain the "no interwiki" template, its processing stops.
+
+=item * If the page contains links to the defined languages and contains this
+template, it is logged, so the template can be removed. (It is planned that
+it will be removed automatically in the future.)
+
+=item * If the page contains no links to the defined languages and does not
+contain the template, it is automatically added to type "other".
+
+=item * If the page contains no links to the defined languages and
+a template with types, it is added to the defined types.
+
+=back
+
+Pages without links are added to nicely formatted lists according
+to their type.
+
+This program also collects some information on the way about problematic
+cases - more than one "no interwiki" template, invalid templates, invalid
+or redundant interwiki links etc.
+
+You can get a dump for using this program at L<http://download.wikimedia.org/>.
+
+=head1 DIAGNOSTICS
+
+=head2 Invalid command line options.
+
+See L</"REQUIRED ARGUMENTS"> and L</"OPTIONS">.
+
+=head2 FILENAME is a weird dump file name.
+
+The dump file does not appear to have a standard name that appears
+at L<http://download.wikimedia.org/>.
+
+=head2 error opening FILENAME ...
+
+=head2 error closing FILENAME ...
+
+Check the corresponding error.
+
+=head2 unable to handle any case setting besides 'first-letter'
+
+Something is weird with the dump. See the documentation of
+L<Parse::MediaWikiDump> and MediaWiki.
+
+=head2 A page has no pure title
+
+Something is particularly weird with the name of a page. The program cannot
+separate its name from its namespace. It can also be a bug in this program.
+
+=head2 Some weirdness happened - STRING does not look a param
+
+STRING is supposed to be a parameter in a template, but it does not look like
+one. It could be an error in the template, and also a bug in this program
+(the parser that this program employs is rather stupid).
+
+=head2 Unicode character 0xNUMBER is illegal
+
+This is a standard Perl warning. It may appear if a page or its title have
+funky Unicode characters which should not be there according to the Unicode
+standard (to be more precise, according to the implementation of this
+standard in your version of perl). Most probably these characters are not
+supposed to be in the page and should be fixed, but otherwise this issue
+is not supposed to affect the functionality of this program significantly.
+
+This was reported as a MediaWiki bug:
+L<https://bugzilla.wikimedia.org/show_bug.cgi?id=14600>
+
+=head1 EXIT STATUS
+
+Nothing interesting, read the log :)
+
+=head1 CONFIGURATION
+
+=head2 Language codes file
+
+The file name has the form C<LANGUAGE_CODE.language_codes.txt>, where
+LANGUAGE_CODE is the code of the wiki which is being processed. It includes
+a list of language codes which are searched, the full name of the language
+and the group to which this language belongs.
+
+=head2 Localized strings file
+
+The file name has the form C<LANGUAGE_CODE.strings.txt>, where LANGUAGE_CODE
+is the code of the wiki which is being processed. This file includes pairs
+of strings - the English name of a feature and its localized counterpart.
+
+=head2 Log
+
+The log is called outtest.log.
+
+=head1 DEPENDENCIES
+
+This module requires these CPAN modules:
+
+=over
+
+=item * C<Parse::MediaWikiDump>
+
+This module is used for reading pages from the XML dump.
+
+=item * C<Regexp::Common>
+
+This module is used for searching templates which use balanced parentheses,
+but pretty badly. :)
+
+=item * C<Log::Log4perl>
+
+This module provides easy global logging.
+
+=item * C<Lingua::Translit>
+
+This module is used for transliterating filenames to ASCII.
+
+=item * C<Readonly>
+
+To make Perl::Critic happy :)
+
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but I love style guides.
+
+=back
+
+=head1 HACKING
+
+=head2 Perl 5.10
+
+This program requires Perl 5.10. It has new clean and useful syntax, which
+makes the programs easier to hack, maintain and debug. It is useless to try
+and run it on an older version, unless you want to waste your time
+backporting. Please upgrade your Perl installation if you still have 5.8 or
+(horrors!) something older.
+
+=head2 Perl Best Practices, Perl::Critic and perltidy
+
+Great effort has been put into making this source code pass as cleanly as
+possible the Perl::Critic tests in the 'brutal' mode. It also uses perltidy
+for automatic code formatting. If you modify it, do yourself a favor, install
+Perl::Critic and regularly test it using this command:
+
+./tidy.sh prepare_noiw_list.pl
+
+It checks the syntax, runs perltidy on the code and runs Perl::Critic.
+
+All the places where P::C has been disabled using "no critic" are explained.
+
+The time invested in making the code P::C-friendly will be returned as time
+saved on debugging. Also consider reading the book "Perl Best Practices" by
+Damian Conway if you have not already.
+
+=head1 INCOMPATIBILITIES
+
+=head2 Unicode issues
+
+This program works best on GNU/Linux, where Perl and the filesystem are
+Unicode-friendly.
+
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
+filenames and clipboard text to become jumbled. You have been warned.
+
+=head1 BUGS AND LIMITATIONS
+
+Please report all bugs, features requests and other comments to
+Amir E. Aharoni (ami...@ma...).
+
+=head2 There is no equality between languages
+
+Currently this program actually only lists pages which do not have
+an interwiki link to the English Wikipedia. This is obviously not useful on
+the English Wikipedia and is conceptually problematic on other Wikipedias,
+too. This is being fixed, but it is not simple to do it Right.
+
+Goal: 0.4 Reut
+
+=head2 Internationalization is far from perfect
+
+Date handling and strings localization is very primitive. There are plans
+to upgrade it to smarter modules such as Locale::Maketext.
+
+Goal: 0.6 Itay
+
+=head2 MediaWiki parsing is ad hoc
+
+This program only does very rudimentary and ad hoc MediaWiki syntax parsing.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Templates are removed semi-automatically
+
+Templates on pages which already have needed links are not removed
+automatically. A list of them is created and a bot can run on it and remove
+the outdated templates. This can be done automatically.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Cooling date
+
+The implementation of the cooling date is very rudimentary.
+
+Goal version: v0.4 Reut
+
+=head2 Major refactoring is needed
+
+=over
+
+=item * The main code is on the brink of passing the threshold for complexity that
+P::C accepts.
+
+=item * There is no separation of searching and formatting. There are two main
+functions here: C<process_dump()> and C<create_no_iw_pages()>. They are doing
+separate things and should run from different programs.
+
+=item * Statistics and multi links are just slapped to the log.
+
+=item * At least some of the code can be rewritten as classes that inherit
+from L<Parse::MediaWikiDump>.
+
+=back
+
+Goal: v0.8 Moshe
+
+=head2 There is no test suite
+
+That can be done after proper modularization. Also, a local test MediaWiki
+server would be needed.
+
+Goal: v1.0 Drora
+
+=head1 HISTORY
+
+=over
+
+=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty
+categories sorting. Memory usage optimization - accumulating information in
+files. More generic, but far-from-perfect handling of links to languages
+other than English. Transliteration with Lingua::Translit. Logging with
+Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not finished:
+complete statistics, removal of templates from pages which already have links.
+
+=item * B<0.1 - First and unnamed Amir E. Aharoni's version>: Types
+introduced. Conceptual l10n, but only tested on he.wiki. Still en links
+only. Informative, though lo-tec logging.
+
+=item * B<0.0 - Felagund's version>: no types, he.wiki only, en links only,
+but it uploaded the lists automatically...
+
+=back
+
+=head1 AUTHOR
+
+=over
+
+=item * Creator is Nadav Perez (Felagund)
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Felagund>
+
+=back
+
+=item * It is (roughly) based on another bot by Guy Shaked (Costello).
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Costello>
+
+=back
+
+=item * Then Amir E. Aharoni (Amire80) came and mutilated it beyond
+recognition.
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Amire80>
+
+=item * English: L<http://en.wikipedia.org/wiki/User:Amire80>
+
+=item * Russian: L<http://ru.wikipedia.org/wiki/User:Amire80>
+
+=back
+
+=back
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
+I<Visca la llibertat!>
+
+=cut
+
+1;
+
Property changes on: trunk/no-interwiki/Maintenance.pm
___________________________________________________________________
Added: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-29 13:25:18
|
Revision: 83
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=83&view=rev
Author: amire80
Date: 2009-09-29 13:25:11 +0000 (Tue, 29 Sep 2009)
Log Message:
-----------
Refactoring empty pages handling.
Modified Paths:
--------------
trunk/no-interwiki/cv.strings.txt
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/cv.strings.txt
===================================================================
--- trunk/no-interwiki/cv.strings.txt 2009-09-27 12:41:34 UTC (rev 82)
+++ trunk/no-interwiki/cv.strings.txt 2009-09-29 13:25:11 UTC (rev 83)
@@ -23,7 +23,8 @@
type тĕс
# MW specials
-REDIRECT куçару
+#REDIRECT куçару
+REDIRECT перенаправление
# Namespaces
User Хутшăнакан
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 12:41:34 UTC (rev 82)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-29 13:25:11 UTC (rev 83)
@@ -463,55 +463,40 @@
say $page_counter;
}
- last PAGE
- if ($option{stop_after}
- and $page_counter > $option{stop_after});
-
- next PAGE
- if ($page_counter < $option{start_from});
-
- my $page_namespace = namespace($page_ref);
- my $page_title = $page_ref->title();
- my $page_text_ref = $page_ref->text();
-
- if (not defined $page_text_ref) {
- WARN('ref to page text undefined, skipping');
- $statistics{'ref to page text undefined'}++;
- next PAGE;
+ if ( $option{stop_after}
+ and $page_counter > $option{stop_after})
+ {
+ last PAGE;
}
- if (not defined ${$page_text_ref}) {
- WARN('page text undefined, skipping');
- $statistics{'page text undefined'}++;
- next PAGE;
- }
+ my $page_text_ref = $page_ref->text();
- if (${$page_text_ref} eq q()) {
- WARN('page text empty, skipping');
- $statistics{'page text empty'}++;
+ if (
+ $page_counter < $option{start_from}
+ or not defined ${$page_text_ref} # must be tested before redirect
+ or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+ or is_redirect($page_ref)
+ )
+ {
next PAGE;
}
- if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) {
- next PAGE;
- }
+ my $page_namespace = namespace($page_ref);
+ $namespace_count{$page_namespace}++;
- if (my $redirect_type = is_redirect($page_ref)) {
- $statistics{"redirect - $redirect_type"}++;
- DEBUG("\n$page_title - $redirect_type redirect");
- next PAGE;
- }
+ my $page_title = $page_ref->title();
+ INFO("\n* processing $page_counter - ", $page_title);
+
# TODO: Be more precise here.
# Portal pages which have a '/' in their name are probably
# internal and do not need interwiki links.
if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+ INFO('internal portal, skipping');
+ $statistics{'internal portal'}++;
next PAGE;
}
- $namespace_count{$page_namespace}++;
- INFO("\n* processing $page_counter - ", $page_title);
-
# A simple sanity check: is the no_iw template anywhere around here?
my $has_template_no_iw =
(${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-27 12:41:46
|
Revision: 82
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=82&view=rev
Author: amire80
Date: 2009-09-27 12:41:34 +0000 (Sun, 27 Sep 2009)
Log Message:
-----------
Refactoring, improving performance. Gmar khatima tova.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 00:44:08 UTC (rev 81)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 12:41:34 UTC (rev 82)
@@ -30,7 +30,7 @@
# pragmata
use strict;
use warnings;
-use integer;
+use integer; # Improves performance
use open ':utf8';
use utf8;
use charnames ':full';
@@ -54,7 +54,7 @@
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.92; # Earlier versions have a different API
+use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
our $VERSION = '0.2.1';
#<<< no perltidy
@@ -366,11 +366,10 @@
}
}
-my $pmwd = Parse::MediaWikiDump->new();
-my $dump = $pmwd->revisions($dump_fn);
+my $the_dump = load_dump();
my $namespaces_alt = join $ALT_SEP,
- grep { length > 0 } @{ $dump->namespaces_names() };
+ grep { length > 0 } @{ $the_dump->namespaces_names() };
my $PURE_TITLE_RE = qr{
\A
(?:
@@ -383,7 +382,7 @@
# This is the only currently known value
# but there could be more in the future
-if ($dump->case() ne 'first-letter') {
+if ($the_dump->case() ne 'first-letter') {
croak(q{unable to handle any case setting besides 'first-letter'});
}
@@ -420,12 +419,35 @@
exit;
+sub load_dump {
+ Readonly my $WORKING_PMWD_VER => 0.91;
+ Readonly my $FUTURE_PMWD_VER => 0.94;
+
+ my $dump;
+
+ if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
+ $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+ }
+ else {
+ if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
+ carp( 'You are running Parse::MediaWikiDump version '
+ . $Parse::MediaWikiDump::VERSION
+ . ".\n Redirect handling may be broken\n");
+ }
+ my $pmwd = Parse::MediaWikiDump->new();
+ $dump = $pmwd->revisions($dump_fn);
+ }
+
+ return $dump;
+}
+
sub namespace {
my ($page) = @_;
return $page->namespace() || get_string('article space');
}
sub next_page {
+ my ($dump) = @_;
my $page_ref = eval { $dump->next(); };
if ($EVAL_ERROR) {
confess("Failed reading a page: $EVAL_ERROR");
@@ -435,7 +457,7 @@
sub find_iwless {
PAGE:
- while (my $page_ref = next_page()) {
+ while (my $page_ref = next_page($the_dump)) {
$page_counter++;
if ($page_counter % $option{page_freq} == 0) {
say $page_counter;
@@ -1390,12 +1412,14 @@
foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
my $iwless_in_namespace =
$statistics{'has no interwiki link'}->{$namespace};
+
no integer;
## no critic (ProhibitMagicNumbers)
my $percentage = sprintf '%.2f',
100 * $iwless_in_namespace / $namespace_count{$namespace};
## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
use integer;
+
INFO("$namespace: $iwless_in_namespace, $percentage%");
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-27 00:44:16
|
Revision: 81
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=81&view=rev
Author: amire80
Date: 2009-09-27 00:44:08 +0000 (Sun, 27 Sep 2009)
Log Message:
-----------
Multilinks cleanup.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 00:44:08 UTC (rev 81)
@@ -409,10 +409,10 @@
my $begin_multi_links_time = time;
-say 'listing multi links by language';
+say "\nlisting multi links by language";
print_multi_links_by_foreign();
-say 'listing multi links by local articles';
+say "\nlisting multi links by local articles";
print_multi_links_by_local();
my $total_multi_links_time = time - $begin_multi_links_time;
@@ -1252,12 +1252,16 @@
return $string;
}
+sub format_link_table { ## no critic (RequireArgUnpacking)
+ return sprintf '%-15s %8d', @_;
+}
+
sub print_multi_links_by_foreign {
LANG_CODE:
foreach my $lang_code (sort keys %found_links) {
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
- say "$lang_code ", scalar @foreign_articles;
+ say format_link_table($lang_code, scalar @foreign_articles);
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
@@ -1284,7 +1288,7 @@
LANG_CODE:
foreach my $lang_code (sort keys %found_links) {
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
- say "$lang_code ", scalar @foreign_articles;
+ say format_link_table($lang_code, scalar @foreign_articles);
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-26 23:43:19
|
Revision: 80
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=80&view=rev
Author: amire80
Date: 2009-09-26 23:43:11 +0000 (Sat, 26 Sep 2009)
Log Message:
-----------
Copyright, MediaWikiDump 0.92 requirement, version number cleanup, RTL cleanup, logging cleanup, POD cleanup, get_string cleanup, simple dump ratio, P::C cleanup, adding pod spelling.
Modified Paths:
--------------
trunk/no-interwiki/.perlcriticrc
trunk/no-interwiki/prepare_noiw_list.pl
Added Paths:
-----------
trunk/no-interwiki/stopwords.txt
Modified: trunk/no-interwiki/.perlcriticrc
===================================================================
--- trunk/no-interwiki/.perlcriticrc 2009-09-26 10:51:33 UTC (rev 79)
+++ trunk/no-interwiki/.perlcriticrc 2009-09-26 23:43:11 UTC (rev 80)
@@ -2,6 +2,10 @@
[Variables::ProhibitPunctuationVars]
allow = %+ $+ @+
-# This is probably useful, but appears buggy, so it should remain manual
-[-Documentation::PodSpelling]
+[Documentation::PodSpelling]
+stop_words_file = stopwords.txt
+# say is for writing to the terminal, no need to check it
+[InputOutput::RequireCheckedSyscalls]
+exclude_functions = say
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:51:33 UTC (rev 79)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80)
@@ -1,7 +1,7 @@
#!/usr/bin/perl
# prepare_noiw_list.pl
-# version 0.2 Noa - development
+# version 0.2.1 Noa - development
# See the POD documentation at the end of the file or run
# perldoc prepare_noiw_list.pl
@@ -9,9 +9,21 @@
# "Had he been a French child,
# he would have heard an infinite number of sentences" - Otto Jespersen
-# This program is Free Software; you can redistribute it and/or modify it
-# under the same terms as Perl itself.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
use 5.010;
@@ -38,11 +50,13 @@
# I only use Readonly where P::C complains about it. Using it with regular
# expressions which have non-ASCII chars produces heisenbugs.
use Readonly;
+use Pod::Usage;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
+use Parse::MediaWikiDump 0.92; # Earlier versions have a different API
+our $VERSION = '0.2.1';
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
@@ -50,8 +64,9 @@
Date => '$Date$',
);
#>>>
-our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms)
- ? "0.1.9.$+{revision_num}"
+our $FULL_VERSION =
+ ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms)
+ ? "$VERSION.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
my $WIKITEXT_EXT = 'wiki.txt';
@@ -62,6 +77,7 @@
my $FIELD_SEP = qq{\t};
my $LINK_SEP = q{|};
my $TYPE_SEP = qr/\s*;\s*/xms;
+my $HTML_RLM = '‏';
Readonly my $DEFAULT_MAX_IW_PLACES => 20;
Readonly my $DEFAULT_PAGE_FREQ => 1000;
@@ -69,11 +85,12 @@
Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
# Initialize logger
+my $LOG_FN = 'outtest.log';
Log::Log4perl->easy_init(
{
- level => $INFO, # print everything
- file => ':utf8>outtest.log', # utf is important
- layout => '%m%n', # No need to print the date
+ level => $INFO, # print everything
+ file => ":utf8>$LOG_FN", # utf is important
+ layout => '%m%n', # No need to print the date
}
);
@@ -85,6 +102,9 @@
max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION,
max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE,
max_iw_places => $DEFAULT_MAX_IW_PLACES,
+ help => 0,
+ usage => 0,
+ version => 0,
);
my $valid_options = GetOptions(
@@ -95,12 +115,42 @@
'max_links_per_section' => \$option{max_links_per_section},
'max_sections_per_page' => \$option{max_sections_per_page},
'max_iw_places' => \$option{max_iw_places},
+ 'help' => \$option{help},
+ 'usage' => \$option{usage},
+ 'version' => \$option{version},
);
if (not $valid_options) {
croak('Invalid command line options.');
}
+if ($option{'version'}) {
+ my $program_name = __FILE__;
+ say <<"END_VERSION";
+$program_name version $FULL_VERSION
+This program searches for pages without interlanguage links (interwiki).
+
+Copyright 2005-2009 Guy Shaked, Nadav Perez and Amir E. Aharoni.
+
+This program is Free Software; you can redistribute it and/or modify it
+under the terms of Artistic License version 2.0 or the
+GNU General Public license version 3.0.
+
+Run `perldoc $program_name' for full terms.
+END_VERSION
+ exit;
+}
+
+if ($option{'help'} or $option{'usage'}) {
+ my $verbose_level = $option{'usage'} ? 1 : 2;
+ pod2usage(
+ {
+ -exitval => 0,
+ -verbose => $verbose_level,
+ }
+ );
+}
+
# XXX Too coupled to Wikipedia, won't work for other projects.
my $DUMP_FN_RE = qr{
\A # Begin string
@@ -168,14 +218,14 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
+my $string_in = get_string('in');
my $HEB_DATE_RE = qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
(?<min>\d{2}),\s # minute
(?<mday>\d{1,2})\s # day of month
- $STRING{in}? # This preposition appears sometimes
- # It should have been get_string()
+ $string_in? # This preposition appears sometimes
(?<mon>$ALT_MONTHS)\s # A name of a month
(?<year>\d+?)\s # Year
\([A-Z]{3}\) # Three letters in brackets - timezone
@@ -193,9 +243,9 @@
(?<value>.*) # value
}xms;
-# XXX It should use get_string()
+my $string_no_iw = get_string('no_iw');
my $SIMPLE_NO_IW_CHECK_RE = qr{
- \Q$STRING{no_iw}\E # The string may have spaces
+ \Q$string_no_iw\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
@@ -218,21 +268,21 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
my $LOCAL_REDIRECT_RE = qr{
\A # Beginning of string (page)
\# # a # character
- $STRING{REDIRECT} # Redirect keyword in local language
- # XXX It should use get_string()
+ $local_redirect # Redirect keyword in local language
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms;
+my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation
my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
-## no critic (ProhibitEscapedMetacharacters)
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
-## use critic (ProhibitEscapedMetacharacters)
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
# get_string() cannot be used in re
my $string_exclude_lowercase = get_string('exclude_lowercase');
@@ -357,15 +407,16 @@
# my @found_lang_codes = sort keys %found_links;
# INFO("found lang_codes: @found_lang_codes");
-say 'looking for multi links';
my $begin_multi_links_time = time;
+say 'listing multi links by language';
print_multi_links_by_foreign();
+say 'listing multi links by local articles';
print_multi_links_by_local();
my $total_multi_links_time = time - $begin_multi_links_time;
-say "total multi links time: $total_multi_links_time";
+INFO("total multi links time: $total_multi_links_time");
exit;
@@ -784,8 +835,7 @@
{
my $include_lowercase_link = 1;
- # XXX get_string() cannot be used here
- if (defined $STRING{exclude_lowercase}
+ if (defined get_string('exclude_lowercase', 'if defined')
and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
{
$include_lowercase_link = 0;
@@ -807,15 +857,7 @@
}
push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
}
- INFO(
- "iw link count for $page_title"
- . (
- $option{'rlm'}
- ? "\N{LEFT-TO-RIGHT MARK}"
- : q()
- )
- . " is: $count_iw"
- );
+ INFO("iw link count for $page_title is: $count_iw");
for my $special_case_name (keys %special_cases) {
if (scalar %{ $special_cases{$special_case_name} }) {
@@ -835,14 +877,19 @@
sub special_cases_file {
my ($special_case_name, $page, $special_cases_ref) = @_;
$special_cases_ref //= {}; # /
- my $special_case_langs = join q{, }, sort keys %{$special_cases_ref};
+
+ my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
if ($special_case_langs) {
$special_case_langs = " ($special_case_langs)";
}
+
my $special_case_fn = make_type_fn($special_case_name, 1);
+
if (not -e $special_case_fn) {
append_to_file($special_case_fn, $special_case_name);
}
+
my $page_title = $page->title();
my $link = make_link($page_title);
my $line =
@@ -852,6 +899,7 @@
. get_sort_title($page_title);
append_to_file($special_case_fn, $line);
+
return;
}
@@ -954,12 +1002,8 @@
. $page_title
. $MW_SYNTAX{end_link};
- if ($option{rtl}) {
- if ($page_title =~ $LTR_CHAR_RE) {
-
- # XXX get_string() cannot be used here
- $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm};
- }
+ if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) {
+ $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM;
}
return $link_to_page;
@@ -968,7 +1012,7 @@
sub create_no_iw_pages {
my ($params) = @_;
- INFO('creating no_iw pages');
+ INFO("\ncreating no_iw pages");
# Run over page types
UNSORTED_TYPE_FN:
@@ -1082,9 +1126,10 @@
return "'''$text'''";
}
-# Custom Unicode character property for finding characters.
-# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs)
+# Custom Unicode character property, which is like \w, but for Hebrew.
+# The custom is to give custom Unicode character classes CamelCase names.
+# P::C policy ProhibitMixedCaseSubs is deprecated.
+sub IsRightToLeft { ## no critic (Capitalization)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1158,8 +1203,10 @@
}
sub get_string {
- my ($english) = @_;
- return $STRING{$english} //= $english; # /
+ my ($english, $if_defined) = @_;
+ return $if_defined
+ ? ($STRING{$english})
+ : ($STRING{$english} //= $english); # /
}
sub make_type_fn {
@@ -1210,6 +1257,8 @@
foreach my $lang_code (sort keys %found_links) {
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say "$lang_code ", scalar @foreign_articles;
+
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
my @local_articles =
@@ -1235,6 +1284,8 @@
LANG_CODE:
foreach my $lang_code (sort keys %found_links) {
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say "$lang_code ", scalar @foreign_articles;
+
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
my @local_articles =
@@ -1254,10 +1305,12 @@
}
}
+ say 'writing local multilinks file';
my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
foreach my $local_multi_article (sort keys %local_multi_links) {
append_to_file($filename,
'* ' . mw_bold(make_link($local_multi_article)));
+
foreach my $other_local_article (
sort keys %{ $local_multi_links{$local_multi_article} })
{
@@ -1304,7 +1357,7 @@
sub print_stats {
INFO("\nSUMMARY");
- say "total time: $total_time";
+ INFO("total time: $total_time");
foreach my $stat_type (sort keys %statistics) {
if (not ref $statistics{$stat_type}) {
INFO("$stat_type: $statistics{$stat_type}");
@@ -1346,11 +1399,24 @@
foreach my $namespace (sort keys %namespace_count) {
INFO("$namespace: $namespace_count{$namespace}");
}
+
INFO("\nTYPES");
foreach my $type (sort keys %type_count) {
INFO("$type: $type_count{$type}");
}
+ my $dump_size = -s $dump_fn;
+ my $log_size = -s $LOG_FN;
+ INFO("dump size: $dump_fn");
+ INFO("log size: $LOG_FN");
+ if ($log_size > 0) {
+ my $dump_log_ratio = $dump_size / $log_size;
+ INFO("dump/log ratio: $dump_log_ratio");
+ }
+ else {
+ WARN("weird: log file size $LOG_FN is 0");
+ }
+
return;
}
@@ -1413,6 +1479,12 @@
=item * --max_iw_places Number of places to print in the statistics of
pages with the most interlanguage links.
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
=back
=head1 DESCRIPTION
@@ -1545,6 +1617,10 @@
To make Perl::Critic happy :)
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but I love style guides.
+
=back
=head1 HACKING
@@ -1564,7 +1640,7 @@
for automatic code formatting. If you modify it, do yourself a favor, install
Perl::Critic and regularly test it using this command:
-./tidy.sh
+./tidy.sh prepare_noiw_list.pl
It checks the syntax, runs perltidy on the code and runs Perl::Critic.
@@ -1710,9 +1786,29 @@
=head1 LICENSE AND COPYRIGHT
-This program is Free Software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
I<Visca la llibertat!>
=cut
Added: trunk/no-interwiki/stopwords.txt
===================================================================
--- trunk/no-interwiki/stopwords.txt (rev 0)
+++ trunk/no-interwiki/stopwords.txt 2009-09-26 23:43:11 UTC (rev 80)
@@ -0,0 +1,40 @@
+ActivePerl
+Aharoni
+Aharoni's
+Amir
+Amire
+backporting
+CPAN
+Cygwin
+Drora
+Felagund
+Felagund's
+FILENAME
+filenames
+filesystem
+hoc
+interlanguage
+interwiki
+Itay
+llibertat
+MediaWiki
+multi
+Nadav
+namespace
+Noa
+outtest
+param
+perl
+perltidy
+Readonly
+refactoring
+Reut
+rtl
+Shaked
+tec
+Visca
+wiki
+Wikipedia
+Wikipedias
+xNUMBER
+XP
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-26 10:51:46
|
Revision: 79
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=79&view=rev
Author: amire80
Date: 2009-09-26 10:51:33 +0000 (Sat, 26 Sep 2009)
Log Message:
-----------
Better diff.
Modified Paths:
--------------
trunk/no-interwiki/tidy.sh
Modified: trunk/no-interwiki/tidy.sh
===================================================================
--- trunk/no-interwiki/tidy.sh 2009-09-26 10:42:51 UTC (rev 78)
+++ trunk/no-interwiki/tidy.sh 2009-09-26 10:51:33 UTC (rev 79)
@@ -1,26 +1,33 @@
#!/bin/bash
-FN=$1
+FILENAME=$1
echo checking syntax
-perl -c $FN
+perl -c $FILENAME
if [ $? -ne 0 ]; then
exit 1
fi
echo tidying
-perltidy $FN
+perltidy $FILENAME
if [ $? -ne 0 ]; then
exit 1
fi
-diff $FN.bak ${FN}
-if [ $? -eq 2 ]; then
+diff $FILENAME.bak $FILENAME
+diff_exit_code=$?
+
+case $diff_exit_code in
+0)
+ rm $FILENAME.bak
+;;
+2)
+ echo trouble in diff after perltidy, exiting.
exit 1
-fi
+;;
+esac
echo criticizing
-perlcritic -brutal $FN
+perlcritic -brutal $FILENAME
exit $?
-
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-26 10:43:01
|
Revision: 78
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=78&view=rev
Author: amire80
Date: 2009-09-26 10:42:51 +0000 (Sat, 26 Sep 2009)
Log Message:
-----------
Less logging; redirects are supposed to be stable.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:41:09 UTC (rev 77)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:42:51 UTC (rev 78)
@@ -71,7 +71,7 @@
# Initialize logger
Log::Log4perl->easy_init(
{
- level => $DEBUG, # print everything
+ level => $INFO, # print everything
file => ':utf8>outtest.log', # utf is important
layout => '%m%n', # No need to print the date
}
@@ -425,7 +425,7 @@
if (my $redirect_type = is_redirect($page_ref)) {
$statistics{"redirect - $redirect_type"}++;
- INFO("\n$page_title - $redirect_type redirect");
+ DEBUG("\n$page_title - $redirect_type redirect");
next PAGE;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-26 10:41:19
|
Revision: 77
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=77&view=rev
Author: amire80
Date: 2009-09-26 10:41:09 +0000 (Sat, 26 Sep 2009)
Log Message:
-----------
Better diff handling.
Modified Paths:
--------------
trunk/no-interwiki/.perltidyrc
Modified: trunk/no-interwiki/.perltidyrc
===================================================================
--- trunk/no-interwiki/.perltidyrc 2009-09-25 10:43:03 UTC (rev 76)
+++ trunk/no-interwiki/.perltidyrc 2009-09-26 10:41:09 UTC (rev 77)
@@ -1,3 +1,4 @@
+--backup-and-modify-in-place
--maximum-line-length=78
--continuation-indentation=4
--nooutdent-long-lines
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-25 10:43:13
|
Revision: 76
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=76&view=rev
Author: amire80
Date: 2009-09-25 10:43:03 +0000 (Fri, 25 Sep 2009)
Log Message:
-----------
Adding el, fixing tidy.
Modified Paths:
--------------
trunk/no-interwiki/.perltidyrc
Added Paths:
-----------
trunk/no-interwiki/el.language_codes.txt
trunk/no-interwiki/el.strings.txt
Modified: trunk/no-interwiki/.perltidyrc
===================================================================
--- trunk/no-interwiki/.perltidyrc 2009-09-24 22:40:56 UTC (rev 75)
+++ trunk/no-interwiki/.perltidyrc 2009-09-25 10:43:03 UTC (rev 76)
@@ -1,4 +1,3 @@
---backup-and-modify-in-place
--maximum-line-length=78
--continuation-indentation=4
--nooutdent-long-lines
Added: trunk/no-interwiki/el.language_codes.txt
===================================================================
--- trunk/no-interwiki/el.language_codes.txt (rev 0)
+++ trunk/no-interwiki/el.language_codes.txt 2009-09-25 10:43:03 UTC (rev 76)
@@ -0,0 +1 @@
+link language_codes.txt
\ No newline at end of file
Property changes on: trunk/no-interwiki/el.language_codes.txt
___________________________________________________________________
Added: svn:special
+ *
Added: trunk/no-interwiki/el.strings.txt
===================================================================
--- trunk/no-interwiki/el.strings.txt (rev 0)
+++ trunk/no-interwiki/el.strings.txt 2009-09-25 10:43:03 UTC (rev 76)
@@ -0,0 +1,36 @@
+# months
+January Januaro
+February Februaro
+March Marto
+April Aprilo
+May Majo
+June Junio
+July Julio
+August Auxgusto
+September Septembro
+October Oktobro
+November Novembro
+December Decembro
+
+in
+
+no_iw iw
+disambig Αποσαφήνιση
+template Πρότυπο
+
+date dato
+type tipo
+
+# MW specials
+REDIRECT REDIRECT
+
+# Namespaces
+User Χρήστης
+User talk Συζήτηση χρήστη
+Image Αρχείο
+Portal Πύλη
+Category Κατηγορία
+article space (Κύριος χώρος)
+
+# Other
+other other
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-24 22:41:06
|
Revision: 75
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=75&view=rev
Author: amire80
Date: 2009-09-24 22:40:56 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
Removed a little too much.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:40:56 UTC (rev 75)
@@ -357,6 +357,7 @@
# my @found_lang_codes = sort keys %found_links;
# INFO("found lang_codes: @found_lang_codes");
+say 'looking for multi links';
my $begin_multi_links_time = time;
print_multi_links_by_foreign();
@@ -1540,6 +1541,10 @@
This module is used for transliterating filenames to ASCII.
+=item * C<Readonly>
+
+To make Perl::Critic happy :)
+
=back
=head1 HACKING
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-24 22:34:08
|
Revision: 74
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=74&view=rev
Author: amire80
Date: 2009-09-24 22:33:58 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
Fixing a lot of bitrot.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
trunk/no-interwiki/tidy.sh
Added Paths:
-----------
trunk/no-interwiki/cv.language_codes.txt
trunk/no-interwiki/cv.strings.txt
Added: trunk/no-interwiki/cv.language_codes.txt
===================================================================
--- trunk/no-interwiki/cv.language_codes.txt (rev 0)
+++ trunk/no-interwiki/cv.language_codes.txt 2009-09-24 22:33:58 UTC (rev 74)
@@ -0,0 +1 @@
+link language_codes.txt
\ No newline at end of file
Property changes on: trunk/no-interwiki/cv.language_codes.txt
___________________________________________________________________
Added: svn:special
+ *
Added: trunk/no-interwiki/cv.strings.txt
===================================================================
--- trunk/no-interwiki/cv.strings.txt (rev 0)
+++ trunk/no-interwiki/cv.strings.txt 2009-09-24 22:33:58 UTC (rev 74)
@@ -0,0 +1,40 @@
+# months
+January кăрлач
+February нарăс
+March пуш
+April ака
+May çу
+June çĕртме
+July утă
+August çурла
+September авăн
+October юпа
+November чӳк
+December раштав
+
+in
+
+no_iw Интервики лартман
+category Категори
+disambig тĕрлĕ пĕлтерĕш
+template шаблон
+
+date дата
+type тĕс
+
+# MW specials
+REDIRECT куçару
+
+# Namespaces
+User Хутшăнакан
+User talk Хутшăнаканăн канашлу страници
+Image Ӳкерчĕк
+Portal Portal
+Category Категори
+article space article space
+
+# Other
+other Ытти
+rlm ‏
+exclude_lowercase ß
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74)
@@ -21,6 +21,7 @@
use integer;
use open ':utf8';
use utf8;
+use charnames ':full';
# Standard library
# These modules should come installed with Perl
@@ -34,17 +35,14 @@
# CPAN
# You must install these modules from CPAN
-# Versions before 0.51 have a bug with parsing the namespace of a page
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
use Readonly;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
-# XXX
-use Devel::Leak;
-use Devel::Size qw(size total_size);
-
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
@@ -56,14 +54,14 @@
? "0.1.9.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
-Readonly my $WIKITEXT_EXT => 'wiki.txt';
-Readonly my $OUT_DIR => 'out';
-Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted";
-Readonly my $MULTI_DIR => "$OUT_DIR/multilinks";
-Readonly my $ALT_SEP => q{|};
-Readonly my $FIELD_SEP => qq{\t};
-Readonly my $LINK_SEP => q{|};
-Readonly my $TYPE_SEP => qr/\s*;\s*/xms;
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $MULTI_DIR = "$OUT_DIR/multilinks";
+my $ALT_SEP = q{|};
+my $FIELD_SEP = qq{\t};
+my $LINK_SEP = q{|};
+my $TYPE_SEP = qr/\s*;\s*/xms;
Readonly my $DEFAULT_MAX_IW_PLACES => 20;
Readonly my $DEFAULT_PAGE_FREQ => 1000;
@@ -104,15 +102,15 @@
}
# XXX Too coupled to Wikipedia, won't work for other projects.
-Readonly my $DUMP_FN_RE => qr{
+my $DUMP_FN_RE = qr{
\A # Begin string
(?<wiki_lang>\w+) # Lang code
wiki # Is supposed to be after the lang code
}xms;
-Readonly my $FIELD_SEP_RE => qr{\t}xms;
+my $FIELD_SEP_RE = qr{\t}xms;
-Readonly my $STRING_SKIP_RE => qr{
+my $STRING_SKIP_RE = qr{
\A # Begin string
\s* # Zero or more spaces
(?:\#.*)? # Comment lines
@@ -136,7 +134,7 @@
# ISO 9 is mostly good for Russian and it is still not perfect ASCII
my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
-Readonly my %MW_SYNTAX => (
+my %MW_SYNTAX = (
'start_template' => '{{',
'end_template' => '}}',
'start_link' => '[[',
@@ -148,15 +146,15 @@
);
# This monstrosity basically says: | and optional spaces
-Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+my $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
-Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal');
+my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal');
#
# Constants for date processing
#
-Readonly my @MONTHS => @STRING{
+my @MONTHS = @STRING{
qw(
January February March
April May June
@@ -170,7 +168,7 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
-Readonly my $HEB_DATE_RE => qr{
+my $HEB_DATE_RE = qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
@@ -184,7 +182,7 @@
\z # end
}xms;
-Readonly my $PARAM_RE => qr{
+my $PARAM_RE = qr{
\A # Beginning of a string
\s* # Zero or more space
(?: # No capture
@@ -196,14 +194,14 @@
}xms;
# XXX It should use get_string()
-Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{
+my $SIMPLE_NO_IW_CHECK_RE = qr{
\Q$STRING{no_iw}\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
# Quite possibly it is not needed anymore.
# Until i get a better regex for matching balancing {{}} ...
-Readonly my $TEMPLATE_RE => qr{
+my $TEMPLATE_RE = qr{
\A # beginning of string
\Q$MW_SYNTAX{start_template}\E # {{
.+ # some chars
@@ -211,7 +209,7 @@
\z # end of string
}xms;
-Readonly my $WIKITABLE_RE => qr{
+my $WIKITABLE_RE = qr{
\A
\Q$MW_SYNTAX{start_wikitable}\E
}xms;
@@ -220,7 +218,7 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
-Readonly my $LOCAL_REDIRECT_RE => qr{
+my $LOCAL_REDIRECT_RE = qr{
\A # Beginning of string (page)
\# # a # character
$STRING{REDIRECT} # Redirect keyword in local language
@@ -228,60 +226,62 @@
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms;
-Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms;
-Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms;
+my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms;
+my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
## no critic (ProhibitEscapedMetacharacters)
-Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
## use critic (ProhibitEscapedMetacharacters)
# get_string() cannot be used in re
my $string_exclude_lowercase = get_string('exclude_lowercase');
-Readonly my $EXCLUDE_LOWERCASE_RE => qr{
+my $EXCLUDE_LOWERCASE_RE = qr{
\A # Beginning of foreign article name
[$string_exclude_lowercase] # Character class of possibly lowercase chars
}xms;
-Readonly my $NUMBERED_FILE_RE => qr{
+my $NUMBERED_FILE_RE = qr{
(?: _ \d*)?
\.$WIKITEXT_EXT
}xms;
-Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # "
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # "
-my $two_digit_charnumber_re = qr{
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
(?:
[%.] # There are both %C4%B0 and .AA.E0
[[:xdigit:]]{2} # 2 hex digits
)
}xms;
-my $html_charnumber_re = qr{
+my $HTML_CHARNUMBER_RE = qr{
(?:
&\#\d+; # stuff like Š
)
}xms;
# TODO: Check whether it is Neapolitan with its ''
-Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{
- $two_digit_charnumber_re | $html_charnumber_re
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+ $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
}xms;
my %LANG_CODE;
-Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt";
-open my $lang_code_file, '<', $LANG_CODE_FN
+my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt";
+open my $lang_code_file, '<', $LANG_CODE_FN ## no critic (RequireBriefOpen)
or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
while (my $line = <$lang_code_file>) {
chomp $line;
my ($code, $name) = split /\t/xms, $line;
- $LANG_CODE{$code} = $name // $code; # /
+ if (defined $code) {
+ $LANG_CODE{$code} = $name // $code; # /
+ }
}
close $lang_code_file
or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
-Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE;
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
-Readonly my $INTERWIKI_LINK_RE => qr{
+my $INTERWIKI_LINK_RE = qr{
\Q$MW_SYNTAX{start_link}\E
(?<lang_code>
$ALT_LANGS
@@ -294,7 +294,7 @@
}xms;
# Lojban allows lowercase articles
-Readonly my @LOWERCASE_LANGS => qw(jbo);
+my @LOWERCASE_LANGS = qw(jbo);
Readonly my $COOLING_DAYS => 120;
Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60;
@@ -321,7 +321,7 @@
my $namespaces_alt = join $ALT_SEP,
grep { length > 0 } @{ $dump->namespaces_names() };
-Readonly my $PURE_TITLE_RE => qr{
+my $PURE_TITLE_RE = qr{
\A
(?:
(?:$namespaces_alt)
@@ -357,7 +357,6 @@
# my @found_lang_codes = sort keys %found_links;
# INFO("found lang_codes: @found_lang_codes");
-say 'looking for multi links';
my $begin_multi_links_time = time;
print_multi_links_by_foreign();
@@ -383,18 +382,11 @@
}
sub find_iwless {
- my $leak_handle;
- my $leak_count;
-
PAGE:
while (my $page_ref = next_page()) {
$page_counter++;
if ($page_counter % $option{page_freq} == 0) {
say $page_counter;
- # my $lead_count = Devel::Leak::NoteSV($leak_handle);
- # say "leak count: $lead_count";
- # say 'Devel::Size size of buck dump: ', total_size($dump);
- # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref);
}
last PAGE
@@ -406,27 +398,49 @@
my $page_namespace = namespace($page_ref);
my $page_title = $page_ref->title();
+ my $page_text_ref = $page_ref->text();
- # Skipping cases:
- next PAGE
- if (
- not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
- or is_redirect($page_ref)
+ if (not defined $page_text_ref) {
+ WARN('ref to page text undefined, skipping');
+ $statistics{'ref to page text undefined'}++;
+ next PAGE;
+ }
- # TODO: Be more precise here.
- # Portal pages which have a '/' in their name are probably
- # internal and do not need interwiki links.
- or (is_in_namespace($page_ref, 'Portal')
- and $page_title =~ m{/}xms)
- );
+ if (not defined ${$page_text_ref}) {
+ WARN('page text undefined, skipping');
+ $statistics{'page text undefined'}++;
+ next PAGE;
+ }
+ if (${$page_text_ref} eq q()) {
+ WARN('page text empty, skipping');
+ $statistics{'page text empty'}++;
+ next PAGE;
+ }
+
+ if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) {
+ next PAGE;
+ }
+
+ if (my $redirect_type = is_redirect($page_ref)) {
+ $statistics{"redirect - $redirect_type"}++;
+ INFO("\n$page_title - $redirect_type redirect");
+ next PAGE;
+ }
+
+ # TODO: Be more precise here.
+ # Portal pages which have a '/' in their name are probably
+ # internal and do not need interwiki links.
+ if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+ next PAGE;
+ }
+
$namespace_count{$page_namespace}++;
INFO("\n* processing $page_counter - ", $page_title);
- my $page_text_ref = $page_ref->text();
-
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE);
+ my $has_template_no_iw =
+ (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
@@ -792,7 +806,15 @@
}
push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
}
- INFO("iw link count for $page_title: $count_iw");
+ INFO(
+ "iw link count for $page_title"
+ . (
+ $option{'rlm'}
+ ? "\N{LEFT-TO-RIGHT MARK}"
+ : q()
+ )
+ . " is: $count_iw"
+ );
for my $special_case_name (keys %special_cases) {
if (scalar %{ $special_cases{$special_case_name} }) {
@@ -1061,7 +1083,7 @@
# Custom Unicode character property for finding characters.
# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic (Capitalization)
+sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1070,19 +1092,19 @@
}
sub is_redirect {
- my ($page) = @_;
- my $page_title = $page->title();
- my $page_text_ref = $page->text();
+ my ($page) = @_;
if ($page->redirect()) {
- INFO("\nEnglish redirect: $page_title");
- return 1;
+ return 'English';
}
+
+ my $page_text_ref = $page->text();
+
if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
- INFO("\nLocal redirect: $page_title");
- return 1;
+ return 'local';
}
- return 0;
+
+ return q();
}
sub is_in_namespace {
@@ -1290,13 +1312,20 @@
my $max_iw_index = $#{ $statistics{count_iw} };
INFO("max_iw_index: $max_iw_index");
+ MAX_IW:
for my $max_iw_place (0 .. $option{max_iw_places}) {
my @links =
- map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] };
+ map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+ or last MAX_IW;
INFO("# $max_iw_index: " . join_links(\@links, 0));
- # Do nothing, just count down to the next index with a defined list
- while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { }
+ # Do nothing, just count down to the next index with a defined list.
+ # $max_iw_index needs to be checked for nonzero-ness for the rare case
+ # of a very low page count.
+ while ($max_iw_index
+ and not defined $statistics{count_iw}->[ --$max_iw_index ])
+ {
+ }
}
INFO('pages without interwiki links per namespace');
@@ -1337,7 +1366,7 @@
prepare_noiw_list.pl
-version 0.2 - Noa.
+version 0.2.1 - Noa.
=head1 USAGE
@@ -1511,10 +1540,6 @@
This module is used for transliterating filenames to ASCII.
-=item * C<Readonly>
-
-To make Perl::Critic happy :)
-
=back
=head1 HACKING
Modified: trunk/no-interwiki/tidy.sh
===================================================================
--- trunk/no-interwiki/tidy.sh 2009-09-08 19:20:10 UTC (rev 73)
+++ trunk/no-interwiki/tidy.sh 2009-09-24 22:33:58 UTC (rev 74)
@@ -1,6 +1,6 @@
#!/bin/bash
-FN=prepare_noiw_list.pl
+FN=$1
echo checking syntax
perl -c $FN
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2009-09-08 19:20:25
|
Revision: 73
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=73&view=rev
Author: amire80
Date: 2009-09-08 19:20:10 +0000 (Tue, 08 Sep 2009)
Log Message:
-----------
cleanup before upgrading to Parse::MediaWikiDump 0.92
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 23:07:37 UTC (rev 72)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73)
@@ -12,7 +12,7 @@
# This program is Free Software; you can redistribute it and/or modify it
# under the same terms as Perl itself.
-# Upgrade! This script actually uses new Perl 5.10 constructs, so you need it
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
use 5.010;
# pragmata
@@ -27,7 +27,6 @@
use English qw(-no_match_vars);
use Carp qw(croak cluck);
use Time::Local;
-use List::Util qw(first);
use Getopt::Long;
use Data::Dumper;
use File::Basename;
@@ -36,14 +35,18 @@
# You must install these modules from CPAN
# Versions before 0.51 have a bug with parsing the namespace of a page
-use Parse::MediaWikiDump 0.51;
+use Readonly;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Readonly;
+use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
+# XXX
+use Devel::Leak;
+use Devel::Size qw(size total_size);
+
#<<< no perltidy
-my %SVN_PROPS = ( ## no critic ValuesAndExpressions::RequireInterpolationOfMetachars
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
HeadURL => '$HeadURL$',
Date => '$Date$',
@@ -53,7 +56,6 @@
? "0.1.9.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
-my %PATTERN;
Readonly my $WIKITEXT_EXT => 'wiki.txt';
Readonly my $OUT_DIR => 'out';
Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted";
@@ -102,15 +104,15 @@
}
# XXX Too coupled to Wikipedia, won't work for other projects.
-$PATTERN{dump_fn} = qr{
+Readonly my $DUMP_FN_RE => qr{
\A # Begin string
(?<wiki_lang>\w+) # Lang code
wiki # Is supposed to be after the lang code
}xms;
-$PATTERN{field_sep} = qr{\t}xms;
+Readonly my $FIELD_SEP_RE => qr{\t}xms;
-$PATTERN{string_skip} = qr{
+Readonly my $STRING_SKIP_RE => qr{
\A # Begin string
\s* # Zero or more spaces
(?:\#.*)? # Comment lines
@@ -122,9 +124,9 @@
or croak('Dump filename must be supplied as an argument.');
my %STRING;
my $WIKI_LANG;
-if ((basename $dump_fn) =~ $PATTERN{dump_fn}) {
+if ((basename $dump_fn) =~ $DUMP_FN_RE) {
$WIKI_LANG = $+{wiki_lang};
- %STRING = get_strings($WIKI_LANG);
+ get_strings($WIKI_LANG);
}
else {
croak("$dump_fn is a weird dump file name.");
@@ -135,8 +137,8 @@
my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
Readonly my %MW_SYNTAX => (
- 'start_tmpl' => '{{',
- 'end_tmpl' => '}}',
+ 'start_template' => '{{',
+ 'end_template' => '}}',
'start_link' => '[[',
'end_link' => ']]',
'param_sep' => q{|},
@@ -146,7 +148,7 @@
);
# This monstrosity basically says: | and optional spaces
-$PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal');
@@ -168,7 +170,7 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
-$PATTERN{heb_date} = qr{
+Readonly my $HEB_DATE_RE => qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
@@ -182,7 +184,7 @@
\z # end
}xms;
-$PATTERN{param} = qr{
+Readonly my $PARAM_RE => qr{
\A # Beginning of a string
\s* # Zero or more space
(?: # No capture
@@ -194,22 +196,22 @@
}xms;
# XXX It should use get_string()
-$PATTERN{simple_no_iw_check} = qr{
+Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{
\Q$STRING{no_iw}\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
# Quite possibly it is not needed anymore.
# Until i get a better regex for matching balancing {{}} ...
-$PATTERN{template} = qr{
+Readonly my $TEMPLATE_RE => qr{
\A # beginning of string
- \Q$MW_SYNTAX{start_tmpl}\E # {{
+ \Q$MW_SYNTAX{start_template}\E # {{
.+ # some chars
- \Q$MW_SYNTAX{end_tmpl}\E # }}
+ \Q$MW_SYNTAX{end_template}\E # }}
\z # end of string
}xms;
-$PATTERN{wikitable} = qr{
+Readonly my $WIKITABLE_RE => qr{
\A
\Q$MW_SYNTAX{start_wikitable}\E
}xms;
@@ -218,7 +220,7 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
-$PATTERN{local_redirect} = qr{
+Readonly my $LOCAL_REDIRECT_RE => qr{
\A # Beginning of string (page)
\# # a # character
$STRING{REDIRECT} # Redirect keyword in local language
@@ -226,33 +228,44 @@
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-$PATTERN{ltr_char} = qr/\P{IsLeftToRight}/xms;
-$PATTERN{true_template} = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
-$PATTERN{section_link} = qr{(?<!&)\#}xms;
-$PATTERN{lowercase_link} = qr{\A[[:lower:]]}xms;
+Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms;
+Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms;
+Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms;
-# XXX get_string() cannot be used here
-if ($STRING{exclude_lowercase}) {
- $PATTERN{exclude_lowercase} = qr{\A[$STRING{exclude_lowercase}]}xms;
-}
-$PATTERN{numbered_file} = qr{
+## no critic (ProhibitEscapedMetacharacters)
+Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (ProhibitEscapedMetacharacters)
+
+# get_string() cannot be used in re
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+Readonly my $EXCLUDE_LOWERCASE_RE => qr{
+ \A # Beginning of foreign article name
+ [$string_exclude_lowercase] # Character class of possibly lowercase chars
+}xms;
+Readonly my $NUMBERED_FILE_RE => qr{
(?: _ \d*)?
\.$WIKITEXT_EXT
}xms;
-$PATTERN{invalid_filename_char} = qr{[\\\n/:*?"<>|]}xms; # "
+Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # "
-# TODO: Check whether it is Neapolitan with its ''
-$PATTERN{character_code_in_link} = qr{
+my $two_digit_charnumber_re = qr{
(?:
[%.] # There are both %C4%B0 and .AA.E0
[[:xdigit:]]{2} # 2 hex digits
)
- |
+}xms;
+
+my $html_charnumber_re = qr{
(?:
&\#\d+; # stuff like Š
)
}xms;
+# TODO: Check whether it is Neapolitan with its ''
+Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{
+ $two_digit_charnumber_re | $html_charnumber_re
+}xms;
+
my %LANG_CODE;
Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt";
open my $lang_code_file, '<', $LANG_CODE_FN
@@ -260,7 +273,7 @@
while (my $line = <$lang_code_file>) {
chomp $line;
my ($code, $name) = split /\t/xms, $line;
- $LANG_CODE{$code} = $name // $code;
+ $LANG_CODE{$code} = $name // $code; # /
}
close $lang_code_file
@@ -268,7 +281,7 @@
Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE;
-$PATTERN{interwiki_link} = qr{
+Readonly my $INTERWIKI_LINK_RE => qr{
\Q$MW_SYNTAX{start_link}\E
(?<lang_code>
$ALT_LANGS
@@ -282,6 +295,7 @@
# Lojban allows lowercase articles
Readonly my @LOWERCASE_LANGS => qw(jbo);
+
Readonly my $COOLING_DAYS => 120;
Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60;
Readonly my $LATEST_COOLING => time - $COOLING_SECONDS;
@@ -302,11 +316,12 @@
}
}
-my $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+my $pmwd = Parse::MediaWikiDump->new();
+my $dump = $pmwd->revisions($dump_fn);
my $namespaces_alt = join $ALT_SEP,
grep { length > 0 } @{ $dump->namespaces_names() };
-$PATTERN{pure_title} = qr{
+Readonly my $PURE_TITLE_RE => qr{
\A
(?:
(?:$namespaces_alt)
@@ -359,12 +374,27 @@
return $page->namespace() || get_string('article space');
}
+sub next_page {
+ my $page_ref = eval { $dump->next(); };
+ if ($EVAL_ERROR) {
+ confess("Failed reading a page: $EVAL_ERROR");
+ }
+ return $page_ref;
+}
+
sub find_iwless {
+ my $leak_handle;
+ my $leak_count;
+
PAGE:
- while (my $page = $dump->page()) {
+ while (my $page_ref = next_page()) {
$page_counter++;
if ($page_counter % $option{page_freq} == 0) {
say $page_counter;
+ # my $lead_count = Devel::Leak::NoteSV($leak_handle);
+ # say "leak count: $lead_count";
+ # say 'Devel::Size size of buck dump: ', total_size($dump);
+ # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref);
}
last PAGE
@@ -374,45 +404,45 @@
next PAGE
if ($page_counter < $option{start_from});
- my $namespace = namespace($page);
- my $page_title = $page->title();
+ my $page_namespace = namespace($page_ref);
+ my $page_title = $page_ref->title();
# Skipping cases:
next PAGE
if (
- not is_in_namespace($page, @INCLUDE_NAMESPACES)
- or is_redirect($page)
+ not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+ or is_redirect($page_ref)
# TODO: Be more precise here.
# Portal pages which have a '/' in their name are probably
# internal and do not need interwiki links.
- or (is_in_namespace($page, 'Portal') and $page_title =~ m{/}xms)
+ or (is_in_namespace($page_ref, 'Portal')
+ and $page_title =~ m{/}xms)
);
- $namespace_count{$namespace}++;
+ $namespace_count{$page_namespace}++;
INFO("\n* processing $page_counter - ", $page_title);
- my $page_text_ref = $page->text();
+ my $page_text_ref = $page_ref->text();
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_tmpl_no_iw =
- (${$page_text_ref} =~ $PATTERN{simple_no_iw_check});
+ my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
- my $has_iw = has_interwiki($page);
+ my $has_iw = has_interwiki($page_ref);
if ($has_iw) {
INFO("has link to $has_iw");
- if ($has_tmpl_no_iw) {
+ if ($has_template_no_iw) {
INFO('has template no_iw. trying to remove ...');
- remove_tmpl_no_iw($page_text_ref);
+ remove_template_no_iw($page_text_ref);
$statistics{'has both valid interwiki and template'}++;
- special_cases_file('outdated_template', $page);
+ special_cases_file('outdated_template', $page_ref);
}
}
else { # does not have iw
- process_iwless_page($page, $has_tmpl_no_iw, $has_iw);
+ process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
}
}
@@ -421,25 +451,24 @@
sub process_iwless_page {
my (
- $page, # object ref
- $has_tmpl_no_iw, # scalar bool
- $has_iw # scalar bool
+ $page_ref, # object ref
+ $has_template_no_iw, # scalar bool
+ $has_iw # scalar bool
) = @_;
INFO(q(does not have iw link.));
- $statistics{'has no interwiki link'}->{ namespace($page) }++;
+ $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
# Now we need to search for no_iw templates
# and parse their parameters - date and type
my @found_templates = ();
- my $page_text_ref = $page->text();
- my $page_title = $page->title();
+ my $page_text_ref = $page_ref->text();
# Optimized - does not start searching,
# if we already know that it is not there
- if ($has_tmpl_no_iw) {
+ if ($has_template_no_iw) {
find_templates($page_text_ref, \@found_templates,
[ get_string('no_iw') ]);
}
@@ -455,7 +484,7 @@
if ($found_templates_count > 1) {
WARN('many templates were found');
$statistics{'many templates'}++;
- special_cases_file('many_templates', $page);
+ special_cases_file('many_templates', $page_ref);
}
else {
INFO('good, found one template');
@@ -476,7 +505,7 @@
if (not defined $date_ref) {
INFO("invalid date: '$date_str'");
$statistics{'invalid date'}++;
- special_cases_file('invalid_date', $page);
+ special_cases_file('invalid_date', $page_ref);
}
elsif (cooling_date_passed($date_ref)) {
INFO('cooling date passed, updating to today ...');
@@ -494,11 +523,11 @@
}
- my @all_types = get_all_types($template->{params}->{type}, $page);
+ my @all_types = get_all_types($template->{params}->{type}, $page_ref);
foreach my $type (@all_types) {
- INFO("adding $page_title to the list as type $type");
- add_to_no_iw_list($page, $type);
+ INFO('adding ' . $page_ref->title() . " to the list as type $type");
+ add_to_no_iw_list($page_ref, $type);
$type_count{$type}++;
}
@@ -508,7 +537,7 @@
sub get_all_types {
my ($type_param, $page) = @_;
- $type_param //= q{};
+ $type_param //= q{}; # /
strip_whitespace($type_param);
my @all_types = split $TYPE_SEP, $type_param;
@@ -545,23 +574,23 @@
sub find_templates {
my (
- $page_text_ref, # string ref
+ $text_ref, # string ref
$found_templates_ref, # array ref
$filter # string array ref
) = @_;
# A reference to an array with one empty string.
# Matching against an empty string will always succeed.
- $filter //= [q{}];
+ $filter //= [q{}]; # /
# Get all highest-level matches
- my @matches = (${$page_text_ref} =~ /$PATTERN{true_template}/xmsgo);
+ my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
MATCH:
foreach my $next_match (@matches) {
- if ($next_match !~ $PATTERN{template}) {
+ if ($next_match !~ $TEMPLATE_RE) {
INFO(q(i thought that it is a template, but it was:));
- if ($next_match =~ $PATTERN{wikitable}) {
+ if ($next_match =~ $WIKITABLE_RE) {
INFO('a wikitable');
}
else {
@@ -574,7 +603,8 @@
foreach my $next_filter (@{$filter}) {
# N.B. - case-insensitive. Wrong, but kinda useful.
- if ($next_match =~ /\A\Q$MW_SYNTAX{'start_tmpl'}$next_filter/xmsi)
+ if ($next_match =~
+ /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
{
# N.B.: parse_template calls find_templates() recursively
@@ -592,7 +622,7 @@
my (
$template, # string ref
$default_param_names, # string array ref
- $subtmpl_filter, # string array ref
+ $subtemplate_filter, # string array ref
) = @_;
# %parsed_template:
@@ -602,16 +632,16 @@
# {subtemplates} - array ref
my (%parsed_template, %parsed_params, @clauses);
- $parsed_template{text} = strip_tmpl_curlies(${$template});
+ $parsed_template{text} = strip_template_curlies(${$template});
# First string of the split is the template name,
# the rest is the params
($parsed_template{name}, @clauses) =
- (split $PATTERN{param_sep}, ${ $parsed_template{text} });
+ (split $PARAM_SEP_RE, ${ $parsed_template{text} });
my $param_counter = 0;
foreach my $clause (@clauses) {
- if ($clause =~ $PATTERN{param}) {
+ if ($clause =~ $PARAM_RE) {
#<<< no perltidy
my ($name, $value) = @+{ qw(param_name value) };
#>>>
@@ -630,7 +660,7 @@
$parsed_params{$name} = $value;
}
else {
- my $error_msg = "Weird - $clause does not look a param";
+ my $error_msg = "Weird - $clause does not look like a param";
INFO($error_msg);
cluck($error_msg);
$statistics{'weird param'}++;
@@ -641,7 +671,7 @@
# Possible recursion
find_templates($parsed_template{text}, $parsed_template{subtemplates},
- $subtmpl_filter);
+ $subtemplate_filter);
return \%parsed_template;
}
@@ -651,7 +681,7 @@
return if (not defined $date_str);
- if ($date_str =~ $PATTERN{heb_date}) {
+ if ($date_str =~ $HEB_DATE_RE) {
INFO("found a valid date: $date_str");
my %parsed_date = (
'sec' => 0, # useful for timelocal
@@ -677,15 +707,15 @@
return;
}
-sub strip_tmpl_curlies {
+sub strip_template_curlies {
my ($template) = @_;
for ($template) {
s{
\A
- \Q$MW_SYNTAX{start_tmpl}\E
+ \Q$MW_SYNTAX{start_template}\E
}{}xms;
s{
- \Q$MW_SYNTAX{end_tmpl}\E
+ \Q$MW_SYNTAX{end_template}\E
\z
}{}xms;
}
@@ -693,7 +723,7 @@
}
# no arg unpacking for simplicity and performance
-sub strip_whitespace { ## no critic Subroutines::RequireArgUnpacking
+sub strip_whitespace { ## no critic (RequireArgUnpacking)
for (@_) {
s/\A\s*//xms;
s/\s*\z//xms;
@@ -705,13 +735,14 @@
my ($page) = @_;
my $page_title = $page->title();
- my $page_text = ${ $page->text() };
- study $page_text;
+ my $page_text = ${ $page->text() }; # XXX
+ study $page_text; # XXX
+
my %iw_links;
my %special_cases;
- while ($page_text =~ /$PATTERN{interwiki_link}/xmsgo) {
+ while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX
my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)};
if (defined $iw_links{$lang_code}) {
$special_cases{double_links}->{$lang_code} = q{};
@@ -722,31 +753,31 @@
# A # sign not after an &.
# After an & it is probably a character number.
- if ($foreign_article =~ $PATTERN{section_link}) {
+ if ($foreign_article =~ $SECTION_LINK_RE) {
$special_cases{section_links}->{$lang_code} = q{};
}
# Char codes are common in section links, so there is no
# need to show them again
- elsif ($foreign_article =~ $PATTERN{character_code_in_link}) {
- $special_cases{charnumber_links}{$lang_code} = q{};
+ elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+ $special_cases{charnumber_links}->{$lang_code} = q{};
}
# Lowercase links
if ( (not $lang_code ~~ @LOWERCASE_LANGS)
- and ($foreign_article =~ $PATTERN{lowercase_link}))
+ and ($foreign_article =~ $LOWERCASE_LINK_RE))
{
my $include_lowercase_link = 1;
# XXX get_string() cannot be used here
if (defined $STRING{exclude_lowercase}
- and $foreign_article =~ $PATTERN{exclude_lowercase})
+ and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
{
$include_lowercase_link = 0;
}
if ($include_lowercase_link) {
- $special_cases{lowercase_links}{$lang_code} = q{};
+ $special_cases{lowercase_links}->{$lang_code} = q{};
}
}
@@ -801,7 +832,7 @@
return;
}
-sub remove_tmpl_no_iw {
+sub remove_template_no_iw {
my ($params) = @_;
INFO( "Supposed to remove the no_iw template now, but ...\n"
. 'This sub is a stub. You can help Wikipedia by expanding it!');
@@ -853,7 +884,7 @@
my ($page_title) = @_;
my $sort_title;
- if ($page_title =~ $PATTERN{pure_title}) {
+ if ($page_title =~ $PURE_TITLE_RE) {
$sort_title = $+{pure_title};
}
else {
@@ -901,7 +932,7 @@
. $MW_SYNTAX{end_link};
if ($option{rtl}) {
- if ($page_title =~ $PATTERN{ltr_char}) {
+ if ($page_title =~ $LTR_CHAR_RE) {
# XXX get_string() cannot be used here
$link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm};
@@ -930,9 +961,9 @@
chomp $type_name;
foreach my $line (@lines) {
chomp $line;
- my ($page_title, $sort_title) = split $PATTERN{field_sep}, $line;
+ my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line;
my $sort_letter = get_sort_letter($sort_title);
- $all_pages_in_type{$sort_letter} //= [];
+ $all_pages_in_type{$sort_letter} //= []; # /
push @{ $all_pages_in_type{$sort_letter} }, $page_title;
}
write_sorted_pages($type_name, \%all_pages_in_type);
@@ -965,8 +996,6 @@
write_page(\$page, \$type_fn, $file_number++);
$section_counter = 0;
- # N.B. Trying to free memory, not guaranteed
- undef $page;
$page = q{};
}
elsif ($section_counter) {
@@ -988,16 +1017,12 @@
[ @all_links_in_letter[ $first_link .. $last_link ] ]);
$page .= $links;
}
-
- # N.B. Trying to free memory, not guaranteed
- undef @all_links_in_letter;
}
# The page may be empty at this point
if ($page) {
write_page(\$page, \$type_fn, $file_number++);
}
- undef $page;
return;
}
@@ -1007,7 +1032,7 @@
my $pretty_file_number = sprintf '%03d', $file_number;
${$type_fn_ref} =~ s{
- $PATTERN{numbered_file}
+ $NUMBERED_FILE_RE
}
{_$pretty_file_number.$WIKITEXT_EXT}xmso;
INFO("creating file ${$type_fn_ref}");
@@ -1036,7 +1061,7 @@
# Custom Unicode character property for finding characters.
# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic NamingConventions::ProhibitMixedCaseSubs
+sub IsLeftToRight { ## no critic (Capitalization)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1053,7 +1078,7 @@
INFO("\nEnglish redirect: $page_title");
return 1;
}
- if (${$page_text_ref} =~ $PATTERN{local_redirect}) {
+ if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
INFO("\nLocal redirect: $page_title");
return 1;
}
@@ -1093,22 +1118,20 @@
close $STRINGS_FILE
or croak(file_error('closing', $STRINGS_FN, 'reading'));
- my %STRING;
-
STRING_LINE:
foreach my $next_string_line (@strings_file_lines) {
# Skip blanks and comments
- next STRING_LINE if ($next_string_line =~ $PATTERN{string_skip});
+ next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
chomp $next_string_line;
- my ($english, $target) = split $PATTERN{field_sep}, $next_string_line;
+ my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
# Fallback to English if no target language string was supplied
$STRING{$english} = $target // $english; # /
}
- return %STRING;
+ return;
}
sub get_string {
@@ -1125,7 +1148,7 @@
my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
- $type_fn =~ s{$PATTERN{invalid_filename_char}}{-}xmsgo;
+ $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
$type_fn = "$dir/$type_fn";
@@ -1138,7 +1161,7 @@
open my $file, '>>:utf8', $fn
or croak(file_error('opening', $fn, 'appending'));
- say {$file} ($line // q{});
+ say {$file} ($line // q{}); # /
close $file
or croak(file_error('closing', $fn, 'appending'));
@@ -1236,11 +1259,11 @@
$first_local_article, @other_local_articles
) = @_;
- $local_multi_links_ref->{$first_local_article} //= {};
+ $local_multi_links_ref->{$first_local_article} //= {}; # /
foreach my $other_local_article (@other_local_articles) {
$local_multi_links_ref->{$first_local_article}
- ->{$other_local_article} //= [];
+ ->{$other_local_article} //= []; # /
push @{ $local_multi_links_ref->{$first_local_article}
->{$other_local_article} }, $foreign_link;
}
@@ -1250,7 +1273,7 @@
sub join_links {
my ($links_ref, $line_end) = @_;
- $line_end //= 1; # /
+ $line_end //= 1; # /
my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ });
return join $link_sep, @{$links_ref};
@@ -1280,10 +1303,11 @@
foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
my $iwless_in_namespace =
$statistics{'has no interwiki link'}->{$namespace};
- ## no critic ValuesAndExpressions::ProhibitMagicNumbers
no integer;
+ ## no critic (ProhibitMagicNumbers)
my $percentage = sprintf '%.2f',
100 * $iwless_in_namespace / $namespace_count{$namespace};
+ ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
use integer;
INFO("$namespace: $iwless_in_namespace, $percentage%");
}
@@ -1344,7 +1368,7 @@
=item * --stop_after=NUMBER Stops processing after page with the given
NUMBER.
-=item * --stop_after=NUMBER Begins processing after page with the given
+=item * --start_from=NUMBER Begins processing after page with the given
NUMBER.
=item * --page_freq=NUMBER Print the page counter every NUMBER of
@@ -1514,7 +1538,7 @@
It checks the syntax, runs perltidy on the code and runs Perl::Critic.
-All the places where P::C has been disabled using "# no critic" are explained.
+All the places where P::C has been disabled using "no critic" are explained.
The time invested in making the code P::C-friendly will be returned as time
saved on debugging. Also consider reading the book "Perl Best Practices" by
@@ -1527,14 +1551,14 @@
This program works best on GNU/Linux, where Perl and the filesystem are
Unicode-friendly.
-This program was also tested on Windows XP and Vista with ActivePerl 5.10
-and Cygwin Perl 5.10. In these environments Unicode-related issues caused
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
filenames and clipboard text to become jumbled. You have been warned.
=head1 BUGS AND LIMITATIONS
Please report all bugs, features requests and other comments to
-Amir E. Aharoni (ami...@gm...).
+Amir E. Aharoni (ami...@ma...).
=head2 There is no equality between languages
@@ -1662,4 +1686,3 @@
I<Visca la llibertat!>
=cut
-
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2008-11-21 23:07:41
|
Revision: 72
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=72&view=rev
Author: amire80
Date: 2008-11-21 23:07:37 +0000 (Fri, 21 Nov 2008)
Log Message:
-----------
Update language codes.
Modified Paths:
--------------
trunk/no-interwiki/language_codes.txt
Modified: trunk/no-interwiki/language_codes.txt
===================================================================
--- trunk/no-interwiki/language_codes.txt 2008-11-21 22:39:40 UTC (rev 71)
+++ trunk/no-interwiki/language_codes.txt 2008-11-21 23:07:37 UTC (rev 72)
@@ -251,3 +251,12 @@
rn Kirundi
chy Cheyenne
ng Ndonga
+cho Choctaw
+chy Cheyenne
+mh Marshallese
+aa Afar
+kj Kuanyama
+ho Hiri Motu
+mus Muscogee
+kr Kanuri
+
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2008-11-21 22:39:42
|
Revision: 71
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=71&view=rev
Author: amire80
Date: 2008-11-21 22:39:40 +0000 (Fri, 21 Nov 2008)
Log Message:
-----------
POD spelling, critic update.
Modified Paths:
--------------
trunk/no-interwiki/.perlcriticrc
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/.perlcriticrc
===================================================================
--- trunk/no-interwiki/.perlcriticrc 2008-11-21 20:57:24 UTC (rev 70)
+++ trunk/no-interwiki/.perlcriticrc 2008-11-21 22:39:40 UTC (rev 71)
@@ -1,6 +1,7 @@
-#[-CodeLayout::RequireTidyCode]
-#[-Miscellanea::RequireRcsKeywords]
-
# English.pm doesn't support named capture variables (yet)
[Variables::ProhibitPunctuationVars]
allow = %+ $+ @+
+
+# This is probably useful, but appears buggy, so it should remain manual
+[-Documentation::PodSpelling]
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 20:57:24 UTC (rev 70)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 22:39:40 UTC (rev 71)
@@ -1380,7 +1380,7 @@
it will be removed automatically in the future.)
=item * If the page contains no links to the defined languages and does not
-comtain the template, it is automatically added to type "other".
+contain the template, it is automatically added to type "other".
=item * If the page contains no links to the defined languages and
a template with types, it is added to the defined types.
@@ -1606,7 +1606,7 @@
=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty
categories sorting. Memory usage optimization - accumulating information in
files. More generic, but far-from-perfect handling of links to languages
-other than English. Translitetaion with Lingua::Translit. Logging with
+other than English. Transliteration with Lingua::Translit. Logging with
Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not finished:
complete statistics, removal of templates from pages which already have links.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2008-11-21 20:57:27
|
Revision: 70
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=70&view=rev
Author: amire80
Date: 2008-11-21 20:57:24 +0000 (Fri, 21 Nov 2008)
Log Message:
-----------
generic perlcritic location.
Modified Paths:
--------------
trunk/no-interwiki/tidy.sh
Modified: trunk/no-interwiki/tidy.sh
===================================================================
--- trunk/no-interwiki/tidy.sh 2008-08-13 10:30:54 UTC (rev 69)
+++ trunk/no-interwiki/tidy.sh 2008-11-21 20:57:24 UTC (rev 70)
@@ -20,7 +20,7 @@
fi
echo criticizing
-/usr/local/bin/perlcritic -brutal $FN
+perlcritic -brutal $FN
exit $?
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <am...@us...> - 2008-08-13 10:30:44
|
Revision: 69
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=69&view=rev
Author: amire80
Date: 2008-08-13 10:30:54 +0000 (Wed, 13 Aug 2008)
Log Message:
-----------
Exclude namespace before redirect check, makes log shorter.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
trunk/no-interwiki/tidy.sh
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:54:04 UTC (rev 68)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-13 10:30:54 UTC (rev 69)
@@ -380,8 +380,8 @@
# Skipping cases:
next PAGE
if (
- is_redirect($page)
- or not is_in_namespace($page, @INCLUDE_NAMESPACES)
+ not is_in_namespace($page, @INCLUDE_NAMESPACES)
+ or is_redirect($page)
# TODO: Be more precise here.
# Portal pages which have a '/' in their name are probably
Modified: trunk/no-interwiki/tidy.sh
===================================================================
--- trunk/no-interwiki/tidy.sh 2008-08-12 07:54:04 UTC (rev 68)
+++ trunk/no-interwiki/tidy.sh 2008-08-13 10:30:54 UTC (rev 69)
@@ -14,7 +14,7 @@
exit 1
fi
-diff $FN ${FN}.bak
+diff $FN.bak ${FN}
if [ $? -eq 2 ]; then
exit 1
fi
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|