[PerlWikiBot] SF.net SVN: perlwikibot:[74] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
From: <am...@us...> - 2009-09-24 22:34:08
|
Revision: 74 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=74&view=rev Author: amire80 Date: 2009-09-24 22:33:58 +0000 (Thu, 24 Sep 2009) Log Message: ----------- Fixing a lot of bitrot. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/tidy.sh Added Paths: ----------- trunk/no-interwiki/cv.language_codes.txt trunk/no-interwiki/cv.strings.txt Added: trunk/no-interwiki/cv.language_codes.txt =================================================================== --- trunk/no-interwiki/cv.language_codes.txt (rev 0) +++ trunk/no-interwiki/cv.language_codes.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/cv.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/cv.strings.txt =================================================================== --- trunk/no-interwiki/cv.strings.txt (rev 0) +++ trunk/no-interwiki/cv.strings.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1,40 @@ +# months +January кăрлач +February нарăс +March пуш +April ака +May çу +June çĕртме +July утă +August çурла +September авăн +October юпа +November чӳк +December раштав + +in + +no_iw Интервики лартман +category Категори +disambig тĕрлĕ пĕлтерĕш +template шаблон + +date дата +type тĕс + +# MW specials +REDIRECT куçару + +# Namespaces +User Хутшăнакан +User talk Хутшăнаканăн канашлу страници +Image Ӳкерчĕк +Portal Portal +Category Категори +article space article space + +# Other +other Ытти +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74) @@ -21,6 +21,7 @@ use integer; use open ':utf8'; use utf8; +use charnames ':full'; # Standard library # These modules should come installed with Perl @@ -34,17 +35,14 @@ # CPAN # You must install these modules from CPAN -# Versions before 0.51 have a bug with parsing the namespace of a page +# I only use Readonly where P::C complains about it. Using it with regular +# expressions which have non-ASCII chars produces heisenbugs. use Readonly; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces -# XXX -use Devel::Leak; -use Devel::Size qw(size total_size); - #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -56,14 +54,14 @@ ? "0.1.9.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); -Readonly my $WIKITEXT_EXT => 'wiki.txt'; -Readonly my $OUT_DIR => 'out'; -Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted"; -Readonly my $MULTI_DIR => "$OUT_DIR/multilinks"; -Readonly my $ALT_SEP => q{|}; -Readonly my $FIELD_SEP => qq{\t}; -Readonly my $LINK_SEP => q{|}; -Readonly my $TYPE_SEP => qr/\s*;\s*/xms; +my $WIKITEXT_EXT = 'wiki.txt'; +my $OUT_DIR = 'out'; +my $UNSORTED_DIR = "$OUT_DIR/unsorted"; +my $MULTI_DIR = "$OUT_DIR/multilinks"; +my $ALT_SEP = q{|}; +my $FIELD_SEP = qq{\t}; +my $LINK_SEP = q{|}; +my $TYPE_SEP = qr/\s*;\s*/xms; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -104,15 +102,15 @@ } # XXX Too coupled to Wikipedia, won't work for other projects. -Readonly my $DUMP_FN_RE => qr{ +my $DUMP_FN_RE = qr{ \A # Begin string (?<wiki_lang>\w+) # Lang code wiki # Is supposed to be after the lang code }xms; -Readonly my $FIELD_SEP_RE => qr{\t}xms; +my $FIELD_SEP_RE = qr{\t}xms; -Readonly my $STRING_SKIP_RE => qr{ +my $STRING_SKIP_RE = qr{ \A # Begin string \s* # Zero or more spaces (?:\#.*)? # Comment lines @@ -136,7 +134,7 @@ # ISO 9 is mostly good for Russian and it is still not perfect ASCII my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); -Readonly my %MW_SYNTAX => ( +my %MW_SYNTAX = ( 'start_template' => '{{', 'end_template' => '}}', 'start_link' => '[[', @@ -148,15 +146,15 @@ ); # This monstrosity basically says: | and optional spaces -Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; +my $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; -Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal'); +my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal'); # # Constants for date processing # -Readonly my @MONTHS => @STRING{ +my @MONTHS = @STRING{ qw( January February March April May June @@ -170,7 +168,7 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize -Readonly my $HEB_DATE_RE => qr{ +my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : @@ -184,7 +182,7 @@ \z # end }xms; -Readonly my $PARAM_RE => qr{ +my $PARAM_RE = qr{ \A # Beginning of a string \s* # Zero or more space (?: # No capture @@ -196,14 +194,14 @@ }xms; # XXX It should use get_string() -Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{ +my $SIMPLE_NO_IW_CHECK_RE = qr{ \Q$STRING{no_iw}\E # The string may have spaces }xmsi; # A simplistic template just for testing. # Quite possibly it is not needed anymore. # Until i get a better regex for matching balancing {{}} ... -Readonly my $TEMPLATE_RE => qr{ +my $TEMPLATE_RE = qr{ \A # beginning of string \Q$MW_SYNTAX{start_template}\E # {{ .+ # some chars @@ -211,7 +209,7 @@ \z # end of string }xms; -Readonly my $WIKITABLE_RE => qr{ +my $WIKITABLE_RE = qr{ \A \Q$MW_SYNTAX{start_wikitable}\E }xms; @@ -220,7 +218,7 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. -Readonly my $LOCAL_REDIRECT_RE => qr{ +my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character $STRING{REDIRECT} # Redirect keyword in local language @@ -228,60 +226,62 @@ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms; -Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms; -Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms; +my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $SECTION_LINK_RE = qr{(?<!&)\#}xms; +my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; ## no critic (ProhibitEscapedMetacharacters) -Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; +my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; ## use critic (ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); -Readonly my $EXCLUDE_LOWERCASE_RE => qr{ +my $EXCLUDE_LOWERCASE_RE = qr{ \A # Beginning of foreign article name [$string_exclude_lowercase] # Character class of possibly lowercase chars }xms; -Readonly my $NUMBERED_FILE_RE => qr{ +my $NUMBERED_FILE_RE = qr{ (?: _ \d*)? \.$WIKITEXT_EXT }xms; -Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # " +my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # " -my $two_digit_charnumber_re = qr{ +my $TWO_DIGIT_CHARNUMBER_RE = qr{ (?: [%.] # There are both %C4%B0 and .AA.E0 [[:xdigit:]]{2} # 2 hex digits ) }xms; -my $html_charnumber_re = qr{ +my $HTML_CHARNUMBER_RE = qr{ (?: &\#\d+; # stuff like Š ) }xms; # TODO: Check whether it is Neapolitan with its '' -Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{ - $two_digit_charnumber_re | $html_charnumber_re +my $CHARACTER_CODE_IN_LINK_RE = qr{ + $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE }xms; my %LANG_CODE; -Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt"; -open my $lang_code_file, '<', $LANG_CODE_FN +my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt"; +open my $lang_code_file, '<', $LANG_CODE_FN ## no critic (RequireBriefOpen) or croak(file_error('opening', $LANG_CODE_FN, 'reading')); while (my $line = <$lang_code_file>) { chomp $line; my ($code, $name) = split /\t/xms, $line; - $LANG_CODE{$code} = $name // $code; # / + if (defined $code) { + $LANG_CODE{$code} = $name // $code; # / + } } close $lang_code_file or croak(file_error('closing', $LANG_CODE_FN, 'reading')); -Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE; +my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE; -Readonly my $INTERWIKI_LINK_RE => qr{ +my $INTERWIKI_LINK_RE = qr{ \Q$MW_SYNTAX{start_link}\E (?<lang_code> $ALT_LANGS @@ -294,7 +294,7 @@ }xms; # Lojban allows lowercase articles -Readonly my @LOWERCASE_LANGS => qw(jbo); +my @LOWERCASE_LANGS = qw(jbo); Readonly my $COOLING_DAYS => 120; Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60; @@ -321,7 +321,7 @@ my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @{ $dump->namespaces_names() }; -Readonly my $PURE_TITLE_RE => qr{ +my $PURE_TITLE_RE = qr{ \A (?: (?:$namespaces_alt) @@ -357,7 +357,6 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; print_multi_links_by_foreign(); @@ -383,18 +382,11 @@ } sub find_iwless { - my $leak_handle; - my $leak_count; - PAGE: while (my $page_ref = next_page()) { $page_counter++; if ($page_counter % $option{page_freq} == 0) { say $page_counter; - # my $lead_count = Devel::Leak::NoteSV($leak_handle); - # say "leak count: $lead_count"; - # say 'Devel::Size size of buck dump: ', total_size($dump); - # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref); } last PAGE @@ -406,27 +398,49 @@ my $page_namespace = namespace($page_ref); my $page_title = $page_ref->title(); + my $page_text_ref = $page_ref->text(); - # Skipping cases: - next PAGE - if ( - not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) - or is_redirect($page_ref) + if (not defined $page_text_ref) { + WARN('ref to page text undefined, skipping'); + $statistics{'ref to page text undefined'}++; + next PAGE; + } - # TODO: Be more precise here. - # Portal pages which have a '/' in their name are probably - # internal and do not need interwiki links. - or (is_in_namespace($page_ref, 'Portal') - and $page_title =~ m{/}xms) - ); + if (not defined ${$page_text_ref}) { + WARN('page text undefined, skipping'); + $statistics{'page text undefined'}++; + next PAGE; + } + if (${$page_text_ref} eq q()) { + WARN('page text empty, skipping'); + $statistics{'page text empty'}++; + next PAGE; + } + + if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) { + next PAGE; + } + + if (my $redirect_type = is_redirect($page_ref)) { + $statistics{"redirect - $redirect_type"}++; + INFO("\n$page_title - $redirect_type redirect"); + next PAGE; + } + + # TODO: Be more precise here. + # Portal pages which have a '/' in their name are probably + # internal and do not need interwiki links. + if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + next PAGE; + } + $namespace_count{$page_namespace}++; INFO("\n* processing $page_counter - ", $page_title); - my $page_text_ref = $page_ref->text(); - # A simple sanity check: is the no_iw template anywhere around here? - my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE); + my $has_template_no_iw = + (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English @@ -792,7 +806,15 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO("iw link count for $page_title: $count_iw"); + INFO( + "iw link count for $page_title" + . ( + $option{'rlm'} + ? "\N{LEFT-TO-RIGHT MARK}" + : q() + ) + . " is: $count_iw" + ); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -1061,7 +1083,7 @@ # Custom Unicode character property for finding characters. # The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (Capitalization) +sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1070,19 +1092,19 @@ } sub is_redirect { - my ($page) = @_; - my $page_title = $page->title(); - my $page_text_ref = $page->text(); + my ($page) = @_; if ($page->redirect()) { - INFO("\nEnglish redirect: $page_title"); - return 1; + return 'English'; } + + my $page_text_ref = $page->text(); + if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { - INFO("\nLocal redirect: $page_title"); - return 1; + return 'local'; } - return 0; + + return q(); } sub is_in_namespace { @@ -1290,13 +1312,20 @@ my $max_iw_index = $#{ $statistics{count_iw} }; INFO("max_iw_index: $max_iw_index"); + MAX_IW: for my $max_iw_place (0 .. $option{max_iw_places}) { my @links = - map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }; + map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] } + or last MAX_IW; INFO("# $max_iw_index: " . join_links(\@links, 0)); - # Do nothing, just count down to the next index with a defined list - while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { } + # Do nothing, just count down to the next index with a defined list. + # $max_iw_index needs to be checked for nonzero-ness for the rare case + # of a very low page count. + while ($max_iw_index + and not defined $statistics{count_iw}->[ --$max_iw_index ]) + { + } } INFO('pages without interwiki links per namespace'); @@ -1337,7 +1366,7 @@ prepare_noiw_list.pl -version 0.2 - Noa. +version 0.2.1 - Noa. =head1 USAGE @@ -1511,10 +1540,6 @@ This module is used for transliterating filenames to ASCII. -=item * C<Readonly> - -To make Perl::Critic happy :) - =back =head1 HACKING Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/tidy.sh 2009-09-24 22:33:58 UTC (rev 74) @@ -1,6 +1,6 @@ #!/bin/bash -FN=prepare_noiw_list.pl +FN=$1 echo checking syntax perl -c $FN This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |