[PerlWikiBot] SF.net SVN: perlwikibot:[74] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
|
From: <am...@us...> - 2009-09-24 22:34:08
|
Revision: 74
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=74&view=rev
Author: amire80
Date: 2009-09-24 22:33:58 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
Fixing a lot of bitrot.
Modified Paths:
--------------
trunk/no-interwiki/prepare_noiw_list.pl
trunk/no-interwiki/tidy.sh
Added Paths:
-----------
trunk/no-interwiki/cv.language_codes.txt
trunk/no-interwiki/cv.strings.txt
Added: trunk/no-interwiki/cv.language_codes.txt
===================================================================
--- trunk/no-interwiki/cv.language_codes.txt (rev 0)
+++ trunk/no-interwiki/cv.language_codes.txt 2009-09-24 22:33:58 UTC (rev 74)
@@ -0,0 +1 @@
+link language_codes.txt
\ No newline at end of file
Property changes on: trunk/no-interwiki/cv.language_codes.txt
___________________________________________________________________
Added: svn:special
+ *
Added: trunk/no-interwiki/cv.strings.txt
===================================================================
--- trunk/no-interwiki/cv.strings.txt (rev 0)
+++ trunk/no-interwiki/cv.strings.txt 2009-09-24 22:33:58 UTC (rev 74)
@@ -0,0 +1,40 @@
+# months
+January кăрлач
+February нарăс
+March пуш
+April ака
+May çу
+June çĕртме
+July утă
+August çурла
+September авăн
+October юпа
+November чӳк
+December раштав
+
+in
+
+no_iw Интервики лартман
+category Категори
+disambig тĕрлĕ пĕлтерĕш
+template шаблон
+
+date дата
+type тĕс
+
+# MW specials
+REDIRECT куçару
+
+# Namespaces
+User Хутшăнакан
+User talk Хутшăнаканăн канашлу страници
+Image Ӳкерчĕк
+Portal Portal
+Category Категори
+article space article space
+
+# Other
+other Ытти
+rlm ‏
+exclude_lowercase ß
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74)
@@ -21,6 +21,7 @@
use integer;
use open ':utf8';
use utf8;
+use charnames ':full';
# Standard library
# These modules should come installed with Perl
@@ -34,17 +35,14 @@
# CPAN
# You must install these modules from CPAN
-# Versions before 0.51 have a bug with parsing the namespace of a page
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
use Readonly;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
-# XXX
-use Devel::Leak;
-use Devel::Size qw(size total_size);
-
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
@@ -56,14 +54,14 @@
? "0.1.9.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
-Readonly my $WIKITEXT_EXT => 'wiki.txt';
-Readonly my $OUT_DIR => 'out';
-Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted";
-Readonly my $MULTI_DIR => "$OUT_DIR/multilinks";
-Readonly my $ALT_SEP => q{|};
-Readonly my $FIELD_SEP => qq{\t};
-Readonly my $LINK_SEP => q{|};
-Readonly my $TYPE_SEP => qr/\s*;\s*/xms;
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $MULTI_DIR = "$OUT_DIR/multilinks";
+my $ALT_SEP = q{|};
+my $FIELD_SEP = qq{\t};
+my $LINK_SEP = q{|};
+my $TYPE_SEP = qr/\s*;\s*/xms;
Readonly my $DEFAULT_MAX_IW_PLACES => 20;
Readonly my $DEFAULT_PAGE_FREQ => 1000;
@@ -104,15 +102,15 @@
}
# XXX Too coupled to Wikipedia, won't work for other projects.
-Readonly my $DUMP_FN_RE => qr{
+my $DUMP_FN_RE = qr{
\A # Begin string
(?<wiki_lang>\w+) # Lang code
wiki # Is supposed to be after the lang code
}xms;
-Readonly my $FIELD_SEP_RE => qr{\t}xms;
+my $FIELD_SEP_RE = qr{\t}xms;
-Readonly my $STRING_SKIP_RE => qr{
+my $STRING_SKIP_RE = qr{
\A # Begin string
\s* # Zero or more spaces
(?:\#.*)? # Comment lines
@@ -136,7 +134,7 @@
# ISO 9 is mostly good for Russian and it is still not perfect ASCII
my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
-Readonly my %MW_SYNTAX => (
+my %MW_SYNTAX = (
'start_template' => '{{',
'end_template' => '}}',
'start_link' => '[[',
@@ -148,15 +146,15 @@
);
# This monstrosity basically says: | and optional spaces
-Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+my $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
-Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal');
+my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal');
#
# Constants for date processing
#
-Readonly my @MONTHS => @STRING{
+my @MONTHS = @STRING{
qw(
January February March
April May June
@@ -170,7 +168,7 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
-Readonly my $HEB_DATE_RE => qr{
+my $HEB_DATE_RE = qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
@@ -184,7 +182,7 @@
\z # end
}xms;
-Readonly my $PARAM_RE => qr{
+my $PARAM_RE = qr{
\A # Beginning of a string
\s* # Zero or more space
(?: # No capture
@@ -196,14 +194,14 @@
}xms;
# XXX It should use get_string()
-Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{
+my $SIMPLE_NO_IW_CHECK_RE = qr{
\Q$STRING{no_iw}\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
# Quite possibly it is not needed anymore.
# Until i get a better regex for matching balancing {{}} ...
-Readonly my $TEMPLATE_RE => qr{
+my $TEMPLATE_RE = qr{
\A # beginning of string
\Q$MW_SYNTAX{start_template}\E # {{
.+ # some chars
@@ -211,7 +209,7 @@
\z # end of string
}xms;
-Readonly my $WIKITABLE_RE => qr{
+my $WIKITABLE_RE = qr{
\A
\Q$MW_SYNTAX{start_wikitable}\E
}xms;
@@ -220,7 +218,7 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
-Readonly my $LOCAL_REDIRECT_RE => qr{
+my $LOCAL_REDIRECT_RE = qr{
\A # Beginning of string (page)
\# # a # character
$STRING{REDIRECT} # Redirect keyword in local language
@@ -228,60 +226,62 @@
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms;
-Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms;
-Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms;
+my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms;
+my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
## no critic (ProhibitEscapedMetacharacters)
-Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
## use critic (ProhibitEscapedMetacharacters)
# get_string() cannot be used in re
my $string_exclude_lowercase = get_string('exclude_lowercase');
-Readonly my $EXCLUDE_LOWERCASE_RE => qr{
+my $EXCLUDE_LOWERCASE_RE = qr{
\A # Beginning of foreign article name
[$string_exclude_lowercase] # Character class of possibly lowercase chars
}xms;
-Readonly my $NUMBERED_FILE_RE => qr{
+my $NUMBERED_FILE_RE = qr{
(?: _ \d*)?
\.$WIKITEXT_EXT
}xms;
-Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # "
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # "
-my $two_digit_charnumber_re = qr{
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
(?:
[%.] # There are both %C4%B0 and .AA.E0
[[:xdigit:]]{2} # 2 hex digits
)
}xms;
-my $html_charnumber_re = qr{
+my $HTML_CHARNUMBER_RE = qr{
(?:
&\#\d+; # stuff like Š
)
}xms;
# TODO: Check whether it is Neapolitan with its ''
-Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{
- $two_digit_charnumber_re | $html_charnumber_re
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+ $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
}xms;
my %LANG_CODE;
-Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt";
-open my $lang_code_file, '<', $LANG_CODE_FN
+my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt";
+open my $lang_code_file, '<', $LANG_CODE_FN ## no critic (RequireBriefOpen)
or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
while (my $line = <$lang_code_file>) {
chomp $line;
my ($code, $name) = split /\t/xms, $line;
- $LANG_CODE{$code} = $name // $code; # /
+ if (defined $code) {
+ $LANG_CODE{$code} = $name // $code; # /
+ }
}
close $lang_code_file
or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
-Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE;
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
-Readonly my $INTERWIKI_LINK_RE => qr{
+my $INTERWIKI_LINK_RE = qr{
\Q$MW_SYNTAX{start_link}\E
(?<lang_code>
$ALT_LANGS
@@ -294,7 +294,7 @@
}xms;
# Lojban allows lowercase articles
-Readonly my @LOWERCASE_LANGS => qw(jbo);
+my @LOWERCASE_LANGS = qw(jbo);
Readonly my $COOLING_DAYS => 120;
Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60;
@@ -321,7 +321,7 @@
my $namespaces_alt = join $ALT_SEP,
grep { length > 0 } @{ $dump->namespaces_names() };
-Readonly my $PURE_TITLE_RE => qr{
+my $PURE_TITLE_RE = qr{
\A
(?:
(?:$namespaces_alt)
@@ -357,7 +357,6 @@
# my @found_lang_codes = sort keys %found_links;
# INFO("found lang_codes: @found_lang_codes");
-say 'looking for multi links';
my $begin_multi_links_time = time;
print_multi_links_by_foreign();
@@ -383,18 +382,11 @@
}
sub find_iwless {
- my $leak_handle;
- my $leak_count;
-
PAGE:
while (my $page_ref = next_page()) {
$page_counter++;
if ($page_counter % $option{page_freq} == 0) {
say $page_counter;
- # my $lead_count = Devel::Leak::NoteSV($leak_handle);
- # say "leak count: $lead_count";
- # say 'Devel::Size size of buck dump: ', total_size($dump);
- # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref);
}
last PAGE
@@ -406,27 +398,49 @@
my $page_namespace = namespace($page_ref);
my $page_title = $page_ref->title();
+ my $page_text_ref = $page_ref->text();
- # Skipping cases:
- next PAGE
- if (
- not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
- or is_redirect($page_ref)
+ if (not defined $page_text_ref) {
+ WARN('ref to page text undefined, skipping');
+ $statistics{'ref to page text undefined'}++;
+ next PAGE;
+ }
- # TODO: Be more precise here.
- # Portal pages which have a '/' in their name are probably
- # internal and do not need interwiki links.
- or (is_in_namespace($page_ref, 'Portal')
- and $page_title =~ m{/}xms)
- );
+ if (not defined ${$page_text_ref}) {
+ WARN('page text undefined, skipping');
+ $statistics{'page text undefined'}++;
+ next PAGE;
+ }
+ if (${$page_text_ref} eq q()) {
+ WARN('page text empty, skipping');
+ $statistics{'page text empty'}++;
+ next PAGE;
+ }
+
+ if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) {
+ next PAGE;
+ }
+
+ if (my $redirect_type = is_redirect($page_ref)) {
+ $statistics{"redirect - $redirect_type"}++;
+ INFO("\n$page_title - $redirect_type redirect");
+ next PAGE;
+ }
+
+ # TODO: Be more precise here.
+ # Portal pages which have a '/' in their name are probably
+ # internal and do not need interwiki links.
+ if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+ next PAGE;
+ }
+
$namespace_count{$page_namespace}++;
INFO("\n* processing $page_counter - ", $page_title);
- my $page_text_ref = $page_ref->text();
-
# A simple sanity check: is the no_iw template anywhere around here?
- my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE);
+ my $has_template_no_iw =
+ (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
# Does the page have interwiki links?
# BIG XXX Actually checks only for English
@@ -792,7 +806,15 @@
}
push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
}
- INFO("iw link count for $page_title: $count_iw");
+ INFO(
+ "iw link count for $page_title"
+ . (
+ $option{'rlm'}
+ ? "\N{LEFT-TO-RIGHT MARK}"
+ : q()
+ )
+ . " is: $count_iw"
+ );
for my $special_case_name (keys %special_cases) {
if (scalar %{ $special_cases{$special_case_name} }) {
@@ -1061,7 +1083,7 @@
# Custom Unicode character property for finding characters.
# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic (Capitalization)
+sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1070,19 +1092,19 @@
}
sub is_redirect {
- my ($page) = @_;
- my $page_title = $page->title();
- my $page_text_ref = $page->text();
+ my ($page) = @_;
if ($page->redirect()) {
- INFO("\nEnglish redirect: $page_title");
- return 1;
+ return 'English';
}
+
+ my $page_text_ref = $page->text();
+
if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
- INFO("\nLocal redirect: $page_title");
- return 1;
+ return 'local';
}
- return 0;
+
+ return q();
}
sub is_in_namespace {
@@ -1290,13 +1312,20 @@
my $max_iw_index = $#{ $statistics{count_iw} };
INFO("max_iw_index: $max_iw_index");
+ MAX_IW:
for my $max_iw_place (0 .. $option{max_iw_places}) {
my @links =
- map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] };
+ map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+ or last MAX_IW;
INFO("# $max_iw_index: " . join_links(\@links, 0));
- # Do nothing, just count down to the next index with a defined list
- while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { }
+ # Do nothing, just count down to the next index with a defined list.
+ # $max_iw_index needs to be checked for nonzero-ness for the rare case
+ # of a very low page count.
+ while ($max_iw_index
+ and not defined $statistics{count_iw}->[ --$max_iw_index ])
+ {
+ }
}
INFO('pages without interwiki links per namespace');
@@ -1337,7 +1366,7 @@
prepare_noiw_list.pl
-version 0.2 - Noa.
+version 0.2.1 - Noa.
=head1 USAGE
@@ -1511,10 +1540,6 @@
This module is used for transliterating filenames to ASCII.
-=item * C<Readonly>
-
-To make Perl::Critic happy :)
-
=back
=head1 HACKING
Modified: trunk/no-interwiki/tidy.sh
===================================================================
--- trunk/no-interwiki/tidy.sh 2009-09-08 19:20:10 UTC (rev 73)
+++ trunk/no-interwiki/tidy.sh 2009-09-24 22:33:58 UTC (rev 74)
@@ -1,6 +1,6 @@
#!/bin/bash
-FN=prepare_noiw_list.pl
+FN=$1
echo checking syntax
perl -c $FN
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|