[PerlWikiBot] SF.net SVN: perlwikibot:[73] trunk/no-interwiki/prepare_noiw_list.pl
Status: Pre-Alpha
Brought to you by:
rotemliss
|
From: <am...@us...> - 2009-09-08 19:20:25
|
Revision: 73
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=73&view=rev
Author: amire80
Date: 2009-09-08 19:20:10 +0000 (Tue, 08 Sep 2009)
Log Message:
-----------
cleanup before upgrading to Parse::MediaWikiDump 0.92
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 23:07:37 UTC (rev 72)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73)
@@ -12,7 +12,7 @@
# This program is Free Software; you can redistribute it and/or modify it
# under the same terms as Perl itself.
-# Upgrade! This script actually uses new Perl 5.10 constructs, so you need it
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
use 5.010;
# pragmata
@@ -27,7 +27,6 @@
use English qw(-no_match_vars);
use Carp qw(croak cluck);
use Time::Local;
-use List::Util qw(first);
use Getopt::Long;
use Data::Dumper;
use File::Basename;
@@ -36,14 +35,18 @@
# You must install these modules from CPAN
# Versions before 0.51 have a bug with parsing the namespace of a page
-use Parse::MediaWikiDump 0.51;
+use Readonly;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Readonly;
+use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
+# XXX
+use Devel::Leak;
+use Devel::Size qw(size total_size);
+
#<<< no perltidy
-my %SVN_PROPS = ( ## no critic ValuesAndExpressions::RequireInterpolationOfMetachars
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
HeadURL => '$HeadURL$',
Date => '$Date$',
@@ -53,7 +56,6 @@
? "0.1.9.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
-my %PATTERN;
Readonly my $WIKITEXT_EXT => 'wiki.txt';
Readonly my $OUT_DIR => 'out';
Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted";
@@ -102,15 +104,15 @@
}
# XXX Too coupled to Wikipedia, won't work for other projects.
-$PATTERN{dump_fn} = qr{
+Readonly my $DUMP_FN_RE => qr{
\A # Begin string
(?<wiki_lang>\w+) # Lang code
wiki # Is supposed to be after the lang code
}xms;
-$PATTERN{field_sep} = qr{\t}xms;
+Readonly my $FIELD_SEP_RE => qr{\t}xms;
-$PATTERN{string_skip} = qr{
+Readonly my $STRING_SKIP_RE => qr{
\A # Begin string
\s* # Zero or more spaces
(?:\#.*)? # Comment lines
@@ -122,9 +124,9 @@
or croak('Dump filename must be supplied as an argument.');
my %STRING;
my $WIKI_LANG;
-if ((basename $dump_fn) =~ $PATTERN{dump_fn}) {
+if ((basename $dump_fn) =~ $DUMP_FN_RE) {
$WIKI_LANG = $+{wiki_lang};
- %STRING = get_strings($WIKI_LANG);
+ get_strings($WIKI_LANG);
}
else {
croak("$dump_fn is a weird dump file name.");
@@ -135,8 +137,8 @@
my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
Readonly my %MW_SYNTAX => (
- 'start_tmpl' => '{{',
- 'end_tmpl' => '}}',
+ 'start_template' => '{{',
+ 'end_template' => '}}',
'start_link' => '[[',
'end_link' => ']]',
'param_sep' => q{|},
@@ -146,7 +148,7 @@
);
# This monstrosity basically says: | and optional spaces
-$PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal');
@@ -168,7 +170,7 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
-$PATTERN{heb_date} = qr{
+Readonly my $HEB_DATE_RE => qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
@@ -182,7 +184,7 @@
\z # end
}xms;
-$PATTERN{param} = qr{
+Readonly my $PARAM_RE => qr{
\A # Beginning of a string
\s* # Zero or more space
(?: # No capture
@@ -194,22 +196,22 @@
}xms;
# XXX It should use get_string()
-$PATTERN{simple_no_iw_check} = qr{
+Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{
\Q$STRING{no_iw}\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
# Quite possibly it is not needed anymore.
# Until i get a better regex for matching balancing {{}} ...
-$PATTERN{template} = qr{
+Readonly my $TEMPLATE_RE => qr{
\A # beginning of string
- \Q$MW_SYNTAX{start_tmpl}\E # {{
+ \Q$MW_SYNTAX{start_template}\E # {{
.+ # some chars
- \Q$MW_SYNTAX{end_tmpl}\E # }}
+ \Q$MW_SYNTAX{end_template}\E # }}
\z # end of string
}xms;
-$PATTERN{wikitable} = qr{
+Readonly my $WIKITABLE_RE => qr{
\A
\Q$MW_SYNTAX{start_wikitable}\E
}xms;
@@ -218,7 +220,7 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
-$PATTERN{local_redirect} = qr{
+Readonly my $LOCAL_REDIRECT_RE => qr{
\A # Beginning of string (page)
\# # a # character
$STRING{REDIRECT} # Redirect keyword in local language
@@ -226,33 +228,44 @@
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-$PATTERN{ltr_char} = qr/\P{IsLeftToRight}/xms;
-$PATTERN{true_template} = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
-$PATTERN{section_link} = qr{(?<!&)\#}xms;
-$PATTERN{lowercase_link} = qr{\A[[:lower:]]}xms;
+Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms;
+Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms;
+Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms;
-# XXX get_string() cannot be used here
-if ($STRING{exclude_lowercase}) {
- $PATTERN{exclude_lowercase} = qr{\A[$STRING{exclude_lowercase}]}xms;
-}
-$PATTERN{numbered_file} = qr{
+## no critic (ProhibitEscapedMetacharacters)
+Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (ProhibitEscapedMetacharacters)
+
+# get_string() cannot be used in re
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+Readonly my $EXCLUDE_LOWERCASE_RE => qr{
+ \A # Beginning of foreign article name
+ [$string_exclude_lowercase] # Character class of possibly lowercase chars
+}xms;
+Readonly my $NUMBERED_FILE_RE => qr{
(?: _ \d*)?
\.$WIKITEXT_EXT
}xms;
-$PATTERN{invalid_filename_char} = qr{[\\\n/:*?"<>|]}xms; # "
+Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # "
-# TODO: Check whether it is Neapolitan with its ''
-$PATTERN{character_code_in_link} = qr{
+my $two_digit_charnumber_re = qr{
(?:
[%.] # There are both %C4%B0 and .AA.E0
[[:xdigit:]]{2} # 2 hex digits
)
- |
+}xms;
+
+my $html_charnumber_re = qr{
(?:
&\#\d+; # stuff like Š
)
}xms;
+# TODO: Check whether it is Neapolitan with its ''
+Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{
+ $two_digit_charnumber_re | $html_charnumber_re
+}xms;
+
my %LANG_CODE;
Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt";
open my $lang_code_file, '<', $LANG_CODE_FN
@@ -260,7 +273,7 @@
while (my $line = <$lang_code_file>) {
chomp $line;
my ($code, $name) = split /\t/xms, $line;
- $LANG_CODE{$code} = $name // $code;
+ $LANG_CODE{$code} = $name // $code; # /
}
close $lang_code_file
@@ -268,7 +281,7 @@
Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE;
-$PATTERN{interwiki_link} = qr{
+Readonly my $INTERWIKI_LINK_RE => qr{
\Q$MW_SYNTAX{start_link}\E
(?<lang_code>
$ALT_LANGS
@@ -282,6 +295,7 @@
# Lojban allows lowercase articles
Readonly my @LOWERCASE_LANGS => qw(jbo);
+
Readonly my $COOLING_DAYS => 120;
Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60;
Readonly my $LATEST_COOLING => time - $COOLING_SECONDS;
@@ -302,11 +316,12 @@
}
}
-my $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+my $pmwd = Parse::MediaWikiDump->new();
+my $dump = $pmwd->revisions($dump_fn);
my $namespaces_alt = join $ALT_SEP,
grep { length > 0 } @{ $dump->namespaces_names() };
-$PATTERN{pure_title} = qr{
+Readonly my $PURE_TITLE_RE => qr{
\A
(?:
(?:$namespaces_alt)
@@ -359,12 +374,27 @@
return $page->namespace() || get_string('article space');
}
+sub next_page {
+ my $page_ref = eval { $dump->next(); };
+ if ($EVAL_ERROR) {
+ confess("Failed reading a page: $EVAL_ERROR");
+ }
+ return $page_ref;
+}
+
sub find_iwless {
+ my $leak_handle;
+ my $leak_count;
+
PAGE:
- while (my $page = $dump->page()) {
+ while (my $page_ref = next_page()) {
$page_counter++;
if ($page_counter % $option{page_freq} == 0) {
say $page_counter;
+ # my $lead_count = Devel::Leak::NoteSV($leak_handle);
+ # say "leak count: $lead_count";
+ # say 'Devel::Size size of buck dump: ', total_size($dump);
+ # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref);
}
last PAGE
@@ -374,45 +404,45 @@
next PAGE
if ($page_counter < $option{start_from});
- my $namespace = namespace($page);
- my $page_title = $page->title();
+ my $page_namespace = namespace($page_ref);
+ my $page_title = $page_ref->title();
# Skipping cases:
next PAGE
if (
- not is_in_namespace($page, @INCLUDE_NAMESPACES)
- or is_redirect($page)
+ not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+ or is_redirect($page_ref)
# TODO: Be more precise here.
# Portal pages which have a '/' in their name are probably
# internal and do not need interwiki links.
- or (is_in_namespace($page, 'Portal') and $page_title =~ m{/}xms)
+ or (is_in_namespace($page_ref, 'Portal')
+ and $page_title =~ m{/}xms)
);
- $namespace_count{$namespace}++;
+ $namespace_count{$page_namespace}++;
INFO("\n* processing $page_counter - ", $page_title);
- my $page_text_ref = $page->text();
+ my $page_text_ref = $page_ref->text();
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_tmpl_no_iw =
- (${$page_text_ref} =~ $PATTERN{simple_no_iw_check});
+ my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
- my $has_iw = has_interwiki($page);
+ my $has_iw = has_interwiki($page_ref);
if ($has_iw) {
INFO("has link to $has_iw");
- if ($has_tmpl_no_iw) {
+ if ($has_template_no_iw) {
INFO('has template no_iw. trying to remove ...');
- remove_tmpl_no_iw($page_text_ref);
+ remove_template_no_iw($page_text_ref);
$statistics{'has both valid interwiki and template'}++;
- special_cases_file('outdated_template', $page);
+ special_cases_file('outdated_template', $page_ref);
}
}
else { # does not have iw
- process_iwless_page($page, $has_tmpl_no_iw, $has_iw);
+ process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
}
}
@@ -421,25 +451,24 @@
sub process_iwless_page {
my (
- $page, # object ref
- $has_tmpl_no_iw, # scalar bool
- $has_iw # scalar bool
+ $page_ref, # object ref
+ $has_template_no_iw, # scalar bool
+ $has_iw # scalar bool
) = @_;
INFO(q(does not have iw link.));
- $statistics{'has no interwiki link'}->{ namespace($page) }++;
+ $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
# Now we need to search for no_iw templates
# and parse their parameters - date and type
my @found_templates = ();
- my $page_text_ref = $page->text();
- my $page_title = $page->title();
+ my $page_text_ref = $page_ref->text();
# Optimized - does not start searching,
# if we already know that it is not there
- if ($has_tmpl_no_iw) {
+ if ($has_template_no_iw) {
find_templates($page_text_ref, \@found_templates,
[ get_string('no_iw') ]);
}
@@ -455,7 +484,7 @@
if ($found_templates_count > 1) {
WARN('many templates were found');
$statistics{'many templates'}++;
- special_cases_file('many_templates', $page);
+ special_cases_file('many_templates', $page_ref);
}
else {
INFO('good, found one template');
@@ -476,7 +505,7 @@
if (not defined $date_ref) {
INFO("invalid date: '$date_str'");
$statistics{'invalid date'}++;
- special_cases_file('invalid_date', $page);
+ special_cases_file('invalid_date', $page_ref);
}
elsif (cooling_date_passed($date_ref)) {
INFO('cooling date passed, updating to today ...');
@@ -494,11 +523,11 @@
}
- my @all_types = get_all_types($template->{params}->{type}, $page);
+ my @all_types = get_all_types($template->{params}->{type}, $page_ref);
foreach my $type (@all_types) {
- INFO("adding $page_title to the list as type $type");
- add_to_no_iw_list($page, $type);
+ INFO('adding ' . $page_ref->title() . " to the list as type $type");
+ add_to_no_iw_list($page_ref, $type);
$type_count{$type}++;
}
@@ -508,7 +537,7 @@
sub get_all_types {
my ($type_param, $page) = @_;
- $type_param //= q{};
+ $type_param //= q{}; # /
strip_whitespace($type_param);
my @all_types = split $TYPE_SEP, $type_param;
@@ -545,23 +574,23 @@
sub find_templates {
my (
- $page_text_ref, # string ref
+ $text_ref, # string ref
$found_templates_ref, # array ref
$filter # string array ref
) = @_;
# A reference to an array with one empty string.
# Matching against an empty string will always succeed.
- $filter //= [q{}];
+ $filter //= [q{}]; # /
# Get all highest-level matches
- my @matches = (${$page_text_ref} =~ /$PATTERN{true_template}/xmsgo);
+ my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
MATCH:
foreach my $next_match (@matches) {
- if ($next_match !~ $PATTERN{template}) {
+ if ($next_match !~ $TEMPLATE_RE) {
INFO(q(i thought that it is a template, but it was:));
- if ($next_match =~ $PATTERN{wikitable}) {
+ if ($next_match =~ $WIKITABLE_RE) {
INFO('a wikitable');
}
else {
@@ -574,7 +603,8 @@
foreach my $next_filter (@{$filter}) {
# N.B. - case-insensitive. Wrong, but kinda useful.
- if ($next_match =~ /\A\Q$MW_SYNTAX{'start_tmpl'}$next_filter/xmsi)
+ if ($next_match =~
+ /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
{
# N.B.: parse_template calls find_templates() recursively
@@ -592,7 +622,7 @@
my (
$template, # string ref
$default_param_names, # string array ref
- $subtmpl_filter, # string array ref
+ $subtemplate_filter, # string array ref
) = @_;
# %parsed_template:
@@ -602,16 +632,16 @@
# {subtemplates} - array ref
my (%parsed_template, %parsed_params, @clauses);
- $parsed_template{text} = strip_tmpl_curlies(${$template});
+ $parsed_template{text} = strip_template_curlies(${$template});
# First string of the split is the template name,
# the rest is the params
($parsed_template{name}, @clauses) =
- (split $PATTERN{param_sep}, ${ $parsed_template{text} });
+ (split $PARAM_SEP_RE, ${ $parsed_template{text} });
my $param_counter = 0;
foreach my $clause (@clauses) {
- if ($clause =~ $PATTERN{param}) {
+ if ($clause =~ $PARAM_RE) {
#<<< no perltidy
my ($name, $value) = @+{ qw(param_name value) };
#>>>
@@ -630,7 +660,7 @@
$parsed_params{$name} = $value;
}
else {
- my $error_msg = "Weird - $clause does not look a param";
+ my $error_msg = "Weird - $clause does not look like a param";
INFO($error_msg);
cluck($error_msg);
$statistics{'weird param'}++;
@@ -641,7 +671,7 @@
# Possible recursion
find_templates($parsed_template{text}, $parsed_template{subtemplates},
- $subtmpl_filter);
+ $subtemplate_filter);
return \%parsed_template;
}
@@ -651,7 +681,7 @@
return if (not defined $date_str);
- if ($date_str =~ $PATTERN{heb_date}) {
+ if ($date_str =~ $HEB_DATE_RE) {
INFO("found a valid date: $date_str");
my %parsed_date = (
'sec' => 0, # useful for timelocal
@@ -677,15 +707,15 @@
return;
}
-sub strip_tmpl_curlies {
+sub strip_template_curlies {
my ($template) = @_;
for ($template) {
s{
\A
- \Q$MW_SYNTAX{start_tmpl}\E
+ \Q$MW_SYNTAX{start_template}\E
}{}xms;
s{
- \Q$MW_SYNTAX{end_tmpl}\E
+ \Q$MW_SYNTAX{end_template}\E
\z
}{}xms;
}
@@ -693,7 +723,7 @@
}
# no arg unpacking for simplicity and performance
-sub strip_whitespace { ## no critic Subroutines::RequireArgUnpacking
+sub strip_whitespace { ## no critic (RequireArgUnpacking)
for (@_) {
s/\A\s*//xms;
s/\s*\z//xms;
@@ -705,13 +735,14 @@
my ($page) = @_;
my $page_title = $page->title();
- my $page_text = ${ $page->text() };
- study $page_text;
+ my $page_text = ${ $page->text() }; # XXX
+ study $page_text; # XXX
+
my %iw_links;
my %special_cases;
- while ($page_text =~ /$PATTERN{interwiki_link}/xmsgo) {
+ while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX
my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)};
if (defined $iw_links{$lang_code}) {
$special_cases{double_links}->{$lang_code} = q{};
@@ -722,31 +753,31 @@
# A # sign not after an &.
# After an & it is probably a character number.
- if ($foreign_article =~ $PATTERN{section_link}) {
+ if ($foreign_article =~ $SECTION_LINK_RE) {
$special_cases{section_links}->{$lang_code} = q{};
}
# Char codes are common in section links, so there is no
# need to show them again
- elsif ($foreign_article =~ $PATTERN{character_code_in_link}) {
- $special_cases{charnumber_links}{$lang_code} = q{};
+ elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+ $special_cases{charnumber_links}->{$lang_code} = q{};
}
# Lowercase links
if ( (not $lang_code ~~ @LOWERCASE_LANGS)
- and ($foreign_article =~ $PATTERN{lowercase_link}))
+ and ($foreign_article =~ $LOWERCASE_LINK_RE))
{
my $include_lowercase_link = 1;
# XXX get_string() cannot be used here
if (defined $STRING{exclude_lowercase}
- and $foreign_article =~ $PATTERN{exclude_lowercase})
+ and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
{
$include_lowercase_link = 0;
}
if ($include_lowercase_link) {
- $special_cases{lowercase_links}{$lang_code} = q{};
+ $special_cases{lowercase_links}->{$lang_code} = q{};
}
}
@@ -801,7 +832,7 @@
return;
}
-sub remove_tmpl_no_iw {
+sub remove_template_no_iw {
my ($params) = @_;
INFO( "Supposed to remove the no_iw template now, but ...\n"
. 'This sub is a stub. You can help Wikipedia by expanding it!');
@@ -853,7 +884,7 @@
my ($page_title) = @_;
my $sort_title;
- if ($page_title =~ $PATTERN{pure_title}) {
+ if ($page_title =~ $PURE_TITLE_RE) {
$sort_title = $+{pure_title};
}
else {
@@ -901,7 +932,7 @@
. $MW_SYNTAX{end_link};
if ($option{rtl}) {
- if ($page_title =~ $PATTERN{ltr_char}) {
+ if ($page_title =~ $LTR_CHAR_RE) {
# XXX get_string() cannot be used here
$link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm};
@@ -930,9 +961,9 @@
chomp $type_name;
foreach my $line (@lines) {
chomp $line;
- my ($page_title, $sort_title) = split $PATTERN{field_sep}, $line;
+ my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line;
my $sort_letter = get_sort_letter($sort_title);
- $all_pages_in_type{$sort_letter} //= [];
+ $all_pages_in_type{$sort_letter} //= []; # /
push @{ $all_pages_in_type{$sort_letter} }, $page_title;
}
write_sorted_pages($type_name, \%all_pages_in_type);
@@ -965,8 +996,6 @@
write_page(\$page, \$type_fn, $file_number++);
$section_counter = 0;
- # N.B. Trying to free memory, not guaranteed
- undef $page;
$page = q{};
}
elsif ($section_counter) {
@@ -988,16 +1017,12 @@
[ @all_links_in_letter[ $first_link .. $last_link ] ]);
$page .= $links;
}
-
- # N.B. Trying to free memory, not guaranteed
- undef @all_links_in_letter;
}
# The page may be empty at this point
if ($page) {
write_page(\$page, \$type_fn, $file_number++);
}
- undef $page;
return;
}
@@ -1007,7 +1032,7 @@
my $pretty_file_number = sprintf '%03d', $file_number;
${$type_fn_ref} =~ s{
- $PATTERN{numbered_file}
+ $NUMBERED_FILE_RE
}
{_$pretty_file_number.$WIKITEXT_EXT}xmso;
INFO("creating file ${$type_fn_ref}");
@@ -1036,7 +1061,7 @@
# Custom Unicode character property for finding characters.
# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic NamingConventions::ProhibitMixedCaseSubs
+sub IsLeftToRight { ## no critic (Capitalization)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1053,7 +1078,7 @@
INFO("\nEnglish redirect: $page_title");
return 1;
}
- if (${$page_text_ref} =~ $PATTERN{local_redirect}) {
+ if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
INFO("\nLocal redirect: $page_title");
return 1;
}
@@ -1093,22 +1118,20 @@
close $STRINGS_FILE
or croak(file_error('closing', $STRINGS_FN, 'reading'));
- my %STRING;
-
STRING_LINE:
foreach my $next_string_line (@strings_file_lines) {
# Skip blanks and comments
- next STRING_LINE if ($next_string_line =~ $PATTERN{string_skip});
+ next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
chomp $next_string_line;
- my ($english, $target) = split $PATTERN{field_sep}, $next_string_line;
+ my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
# Fallback to English if no target language string was supplied
$STRING{$english} = $target // $english; # /
}
- return %STRING;
+ return;
}
sub get_string {
@@ -1125,7 +1148,7 @@
my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
- $type_fn =~ s{$PATTERN{invalid_filename_char}}{-}xmsgo;
+ $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
$type_fn = "$dir/$type_fn";
@@ -1138,7 +1161,7 @@
open my $file, '>>:utf8', $fn
or croak(file_error('opening', $fn, 'appending'));
- say {$file} ($line // q{});
+ say {$file} ($line // q{}); # /
close $file
or croak(file_error('closing', $fn, 'appending'));
@@ -1236,11 +1259,11 @@
$first_local_article, @other_local_articles
) = @_;
- $local_multi_links_ref->{$first_local_article} //= {};
+ $local_multi_links_ref->{$first_local_article} //= {}; # /
foreach my $other_local_article (@other_local_articles) {
$local_multi_links_ref->{$first_local_article}
- ->{$other_local_article} //= [];
+ ->{$other_local_article} //= []; # /
push @{ $local_multi_links_ref->{$first_local_article}
->{$other_local_article} }, $foreign_link;
}
@@ -1250,7 +1273,7 @@
sub join_links {
my ($links_ref, $line_end) = @_;
- $line_end //= 1; # /
+ $line_end //= 1; # /
my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ });
return join $link_sep, @{$links_ref};
@@ -1280,10 +1303,11 @@
foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
my $iwless_in_namespace =
$statistics{'has no interwiki link'}->{$namespace};
- ## no critic ValuesAndExpressions::ProhibitMagicNumbers
no integer;
+ ## no critic (ProhibitMagicNumbers)
my $percentage = sprintf '%.2f',
100 * $iwless_in_namespace / $namespace_count{$namespace};
+ ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
use integer;
INFO("$namespace: $iwless_in_namespace, $percentage%");
}
@@ -1344,7 +1368,7 @@
=item * --stop_after=NUMBER Stops processing after page with the given
NUMBER.
-=item * --stop_after=NUMBER Begins processing after page with the given
+=item * --start_from=NUMBER Begins processing after page with the given
NUMBER.
=item * --page_freq=NUMBER Print the page counter every NUMBER of
@@ -1514,7 +1538,7 @@
It checks the syntax, runs perltidy on the code and runs Perl::Critic.
-All the places where P::C has been disabled using "# no critic" are explained.
+All the places where P::C has been disabled using "no critic" are explained.
The time invested in making the code P::C-friendly will be returned as time
saved on debugging. Also consider reading the book "Perl Best Practices" by
@@ -1527,14 +1551,14 @@
This program works best on GNU/Linux, where Perl and the filesystem are
Unicode-friendly.
-This program was also tested on Windows XP and Vista with ActivePerl 5.10
-and Cygwin Perl 5.10. In these environments Unicode-related issues caused
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
filenames and clipboard text to become jumbled. You have been warned.
=head1 BUGS AND LIMITATIONS
Please report all bugs, features requests and other comments to
-Amir E. Aharoni (ami...@gm...).
+Amir E. Aharoni (ami...@ma...).
=head2 There is no equality between languages
@@ -1662,4 +1686,3 @@
I<Visca la llibertat!>
=cut
-
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|