[PerlWikiBot] SF.net SVN: perlwikibot:[84] trunk/no-interwiki/Maintenance.pm
From: <am...@us...> - 2009-10-01 11:39:18
Revision: 84
          http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=84&view=rev
Author:   amire80
Date:     2009-10-01 11:39:00 +0000 (Thu, 01 Oct 2009)

Log Message:
-----------
Starting the big refactoring.

Added Paths:
-----------
    trunk/no-interwiki/Maintenance.pm

Added: trunk/no-interwiki/Maintenance.pm
===================================================================
--- trunk/no-interwiki/Maintenance.pm                           (rev 0)
+++ trunk/no-interwiki/Maintenance.pm   2009-10-01 11:39:00 UTC (rev 84)
@@ -0,0 +1,1755 @@
+package Maintenance;
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
+use 5.010;
+
+# pragmata
+use strict;
+use warnings;
+use integer;    # Improves performance
+use open ':utf8';
+use utf8;
+use charnames ':full';
+
+# Standard library
+# These modules should come installed with Perl
+use English qw(-no_match_vars);
+use Carp qw(croak carp cluck confess);    # all four are called below
+use Time::Local;                          # timelocal() - cooling date check
+use File::Basename;                       # basename() - dump file name
+# use Data::Dumper;
+
+# CPAN
+# You must install these modules from CPAN
+
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
+use Readonly;
+# use Pod::Usage;
+use Regexp::Common;             # $RE{balanced} is used to find templates
+# use Lingua::Translit;
+use Log::Log4perl qw(:easy);    # INFO(), WARN() etc. are used throughout
+use Parse::MediaWikiDump 0.91;  # Earlier versions have a different API
+
+our $VERSION = '0.0.1';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+    Revision => '$Revision: 83 $',
+    HeadURL  => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/prepare_noiw_list.pl $',
+    Date     => '$Date: 2009-09-29 15:25:11 +0200 (Tue, 29 Sep 2009) $',
+);
+#>>>
+
+my %LANG_CODE;
+my $WIKI_LANG;
+my $dump_fn;
+
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR      = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $MULTI_DIR    = "$OUT_DIR/multi";  # was used but never defined; name assumed
+my $ALT_SEP      = q{|};
+my $FIELD_SEP    = qq{\t};
+my $LINK_SEP     = q{|};
+my $TYPE_SEP     = qr/\s*;\s*/xms;
+my $HTML_RLM     = '&#8207;';         # HTML entity for RIGHT-TO-LEFT MARK
+
+Readonly our $DEFAULT_PAGE_FREQ             => 1000;
+Readonly our $DEFAULT_MAX_LINKS_PER_SECTION => 100;
+Readonly our $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
+
+# These are referenced below but were not declared anywhere in this
+# revision; minimal declarations so the module compiles under strict.
+# The %option defaults are not yet wired to command-line parsing.
+my $page_counter = 0;
+my (%namespace_count, %type_count, %found_links);
+my %param_name;         # optional aliases for template parameter names
+my $LATEST_COOLING;     # never computed in this revision
+my $LOG_FN;
+my %option = (
+    page_freq             => $DEFAULT_PAGE_FREQ,
+    max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION,
+    max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE,
+    start_from            => 0,
+    stop_after            => 0,
+    max_iw_places         => 5,    # assumed default
+    rtl                   => 0,
+);
+my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal');  # assumed
+
+# XXX Too coupled to Wikipedia, won't work for other projects.
+our $DUMP_FN_RE = qr{
+    \A                   # Begin string
+    (?<wiki_lang>\w+)    # Lang code
+    wiki                 # Is supposed to be after the lang code
+}xms;
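+# For example, a dump named "hewiki-20080420-pages-meta-current.xml"
+# (see USAGE below) matches this regex and captures 'he' as $+{wiki_lang}.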
+
+sub init {
+    my (%args) = @_;
+
+    $dump_fn = $args{dump_fn};
+
+    if ((basename $dump_fn) =~ $DUMP_FN_RE) {
+        $WIKI_LANG = $+{wiki_lang};
+        get_strings($WIKI_LANG);
+    }
+    else {
+        croak("$dump_fn is a weird dump file name.");
+    }
+
+    my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt";
+    open my $lang_code_file, '<', $LANG_CODE_FN
+        or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
+    while (my $line = <$lang_code_file>) {
+        chomp $line;
+        my ($code, $name) = split /\t/xms, $line;
+        if (defined $code) {
+            $LANG_CODE{$code} = $name // $code; # /
+        }
+    }
+    close $lang_code_file
+        or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
+
+    init_logger('outtest.log');
+
+    # TODO: Make smarter, configurable, whatever
+    # $OUT_DIR must be first, because it's the parent
+    foreach my $out_dir ($OUT_DIR, $UNSORTED_DIR, $MULTI_DIR) {
+        if (-d $out_dir) {
+            unlink glob "$out_dir/*$WIKITEXT_EXT";
+        }
+        else {
+            mkdir $out_dir;
+        }
+    }
+
+    return;
+}
+
+sub init_logger {
+    my ($log_fn) = @_;
+    $LOG_FN = $log_fn;    # remembered for print_stats()
+    Log::Log4perl->easy_init(
+        {
+            level  => $INFO,            # print everything
+            file   => ":utf8>$log_fn",  # utf8 is important
+            layout => '%m%n',           # No need to print the date
+        }
+    );
+    return;
+}
+
+our $FIELD_SEP_RE = qr{\t}xms;
+
+our $STRING_SKIP_RE = qr{
+    \A          # Begin string
+    \s*         # Zero or more spaces
+    (?:\#.*)?   # Comment lines
+    \z          # End string
+}xms;
+
+my %STRING;
+
+# XXX - bad i18n
+# ISO 9 is mostly good for Russian and it is still not perfect ASCII
+# my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
+
+our %MW_SYNTAX = (
+    'start_template'  => '{{',
+    'end_template'    => '}}',
+    'start_link'      => '[[',
+    'end_link'        => ']]',
+    'param_sep'       => q{|},
+    'paragraph'       => qq{\n\n},
+    'start_wikitable' => '{|',
+    'namespace_sep'   => q{:},
+);
+
+# This monstrosity basically says: | and optional spaces
+our $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+
+#
+# Constants for date processing
+#
+
+our @MONTHS = @STRING{
+    qw(
+        January February March
+        April May June
+        July August September
+        October November December
+    )
+};
+my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
+my %REV_MONTH;
+Readonly my $LAST_MONTH => 11;
+@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
+
+# XXX Internationalize
+my $string_in = get_string('in');
+our $HEB_DATE_RE = qr{
+    \A                    # begin
+    (?<hour>\d{1,2})      # hour
+    :                     # :
+    (?<min>\d{2}),\s      # minute
+    (?<mday>\d{1,2})\s    # day of month
+    $string_in?           # This preposition appears sometimes
+    (?<mon>$ALT_MONTHS)\s # A name of a month
+    (?<year>\d+?)\s       # Year
+    \([A-Z]{3}\)          # Three letters in parentheses - timezone
+    \z                    # end
+}xms;
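+# Example (illustrative): with the English month names loaded and no 'in'
+# preposition present, a MediaWiki signature timestamp such as
+#     13:07, 1 October 2009 (UTC)
+# is the kind of date this regex is meant to capture, filling the hour,
+# min, mday, mon and year named groups.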
+
+our $PARAM_RE = qr{
+    \A                      # Beginning of a string
+    \s*                     # Zero or more space
+    (?:                     # No capture
+        (?<param_name>\w+)  # Word chars - param name
+        \s*                 # Zero or more space
+        =                   # =
+    )?                      # Param name is optional
+    (?<value>.*)            # value
+}xms;
+
+# A simplistic template regex just for testing.
+# Quite possibly it is not needed anymore.
+# Until I get a better regex for matching balanced {{}} ...
+our $TEMPLATE_RE = qr{
+    \A                              # beginning of string
+    \Q$MW_SYNTAX{start_template}\E  # {{
+    .+                              # some chars
+    \Q$MW_SYNTAX{end_template}\E    # }}
+    \z                              # end of string
+}xms;
+
+our $WIKITABLE_RE = qr{
+    \A
+    \Q$MW_SYNTAX{start_wikitable}\E
+}xms;
+
+# Redirect in local language.
+# Regular expression mostly copied from
+# Parse::MediaWikiDump::page::redirect
+# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
+my $LOCAL_REDIRECT_RE = qr{
+    \A                       # Beginning of string (page)
+    \#                       # a # character
+    $local_redirect          # Redirect keyword in local language
+    \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
+}xmsi;
+
+my $LTR_CHAR_RE       = qr/\P{IsRightToLeft}/xms;    # \P is negation
+my $SECTION_LINK_RE   = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
+
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
+
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+my $EXCLUDE_LOWERCASE_RE = qr{
+    \A                            # Beginning of foreign article name
+    [$string_exclude_lowercase]   # Character class of possibly lowercase chars
+}xms;
+my $NUMBERED_FILE_RE = qr{
+    (?: _ \d*)?
+    \.$WIKITEXT_EXT
+}xms;
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms;    # "
+
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
+    (?:
+        [%.]              # There are both %C4%B0 and .AA.E0
+        [[:xdigit:]]{2}   # 2 hex digits
+    )
+}xms;
+
+my $HTML_CHARNUMBER_RE = qr{
+    (?:
+        &\#\d+;           # stuff like &#352;
+    )
+}xms;
+
+# TODO: Check whether it is Neapolitan with its ''
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+    $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
+}xms;
+
+sub load_lang_codes {
+    my ($wiki_lang) = @_;
+
+    # Stub: the lang code loading still happens in init() in this revision.
+}
+
+my $string_no_iw = get_string('no_iw');
+my $SIMPLE_NO_IW_CHECK_RE = qr{
+    \Q$string_no_iw\E    # The string may have spaces
+}xmsi;
+
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
+
+my $INTERWIKI_LINK_RE = qr{
+    \Q$MW_SYNTAX{start_link}\E
+    (?<lang_code>
+        $ALT_LANGS
+    )
+    :
+    (?<foreign_article>
+        .+?
+    )
+    \Q$MW_SYNTAX{end_link}\E
+}xms;
+
+# Lojban allows lowercase articles
+my @LOWERCASE_LANGS = qw(jbo);
+
+our $the_dump = load_dump();
+
+my $namespaces_alt = join $ALT_SEP,
+    grep { length > 0 } @{ $the_dump->namespaces_names() };
+my $PURE_TITLE_RE = qr{
+    \A
+    (?:
+        (?:$namespaces_alt)
+        $MW_SYNTAX{namespace_sep}
+    )?                     # Namespace name is optional
+    (?<pure_title>.+)
+    \z
+}xms;
+
+our %statistics = (count_iw => []);
+
+# File-scope driver code, left over from the script this module is being
+# refactored from; it still runs on load.
+my $begin_time = time;
+process_dump();
+my $total_time = time - $begin_time;
+
+print_stats();
+
+create_no_iw_pages();
+
+INFO(q{});
+
+# my @found_lang_codes = sort keys %found_links;
+# INFO("found lang_codes: @found_lang_codes");
+
+my $begin_multi_links_time = time;
+
+say "\nlisting multi links by language";
+print_multi_links_by_foreign();
+
+say "\nlisting multi links by local articles";
+print_multi_links_by_local();
+
+my $total_multi_links_time = time - $begin_multi_links_time;
+INFO("total multi links time: $total_multi_links_time");
+
+exit;
+
+sub load_dump {
+    Readonly my $WORKING_PMWD_VER => 0.91;
+    Readonly my $FUTURE_PMWD_VER  => 0.94;
+
+    my $dump;
+
+    if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
+        $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+    }
+    else {
+        if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
+            carp( 'You are running Parse::MediaWikiDump version '
+                . $Parse::MediaWikiDump::VERSION
+                . ".\n Redirect handling may be broken\n");
+        }
+        my $pmwd = Parse::MediaWikiDump->new();
+        $dump = $pmwd->revisions($dump_fn);
+    }
+
+    # This is the only currently known value
+    # but there could be more in the future
+    if ($dump->case() ne 'first-letter') {
+        croak(q{unable to handle any case setting besides 'first-letter'});
+    }
+
+    return $dump;
+}
+
+sub namespace {
+    my ($page) = @_;
+    return $page->namespace() || get_string('article space');
+}
+
+sub next_page {
+    my ($dump) = @_;
+    my $page_ref = eval { $dump->next(); };
+    if ($EVAL_ERROR) {
+        confess("Failed reading a page: $EVAL_ERROR");
+    }
+    return $page_ref;
+}
+
+sub process_dump {
+    PAGE:
+    while (my $page_ref = next_page($the_dump)) {
+        $page_counter++;
+        if ($page_counter % $option{page_freq} == 0) {
+            say $page_counter;
+        }
+
+        if (    $option{stop_after}
+            and $page_counter > $option{stop_after})
+        {
+            last PAGE;
+        }
+
+        my $page_text_ref = $page_ref->text();
+
+        if (
+               $page_counter < $option{start_from}
+            or not defined ${$page_text_ref}   # must be tested before redirect
+            or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+            or is_redirect($page_ref)
+        )
+        {
+            next PAGE;
+        }
+
+        my $page_namespace = namespace($page_ref);
+        $namespace_count{$page_namespace}++;
+
+        my $page_title = $page_ref->title();
+
+        INFO("\n* processing $page_counter - ", $page_title);
+
+        # TODO: Be more precise here.
+        # Portal pages which have a '/' in their name are probably
+        # internal and do not need interwiki links.
+        if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+            INFO('internal portal, skipping');
+            $statistics{'internal portal'}++;
+            next PAGE;
+        }
+
+        # A simple sanity check: is the no_iw template anywhere around here?
+        my $has_template_no_iw =
+            (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+
+        # Does the page have interwiki links?
+        # BIG XXX Actually checks only for English
+        my $has_iw = has_interwiki($page_ref);
+
+        if ($has_iw) {
+            INFO("has link to $has_iw");
+            if ($has_template_no_iw) {
+                INFO('has template no_iw. trying to remove ...');
+                remove_template_no_iw($page_text_ref);
+                $statistics{'has both valid interwiki and template'}++;
+                special_cases_file('outdated_template', $page_ref);
+            }
+        }
+        else {    # does not have iw
+            process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
+        }
+    }
+
+    return;
+}
+
+sub process_iwless_page {
+    my (
+        $page_ref,              # object ref
+        $has_template_no_iw,    # scalar bool
+        $has_iw                 # scalar bool
+    ) = @_;
+
+    INFO(q(does not have iw link.));
+    $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+
+    # Now we need to search for no_iw templates
+    # and parse their parameters - date and type
+
+    my @found_templates = ();
+
+    my $page_text_ref = $page_ref->text();
+
+    # Optimized - does not start searching,
+    # if we already know that it is not there
+    if ($has_template_no_iw) {
+        find_templates($page_text_ref, \@found_templates,
+            [ get_string('no_iw') ]);
+    }
+
+    my $found_templates_count = scalar @found_templates;
+    INFO("found templates: $found_templates_count");
+
+    my $template;
+    if ($found_templates_count) {
+
+        # Assume the first one is the right one
+        $template = $found_templates[0];
+        if ($found_templates_count > 1) {
+            WARN('many templates were found');
+            $statistics{'many templates'}++;
+            special_cases_file('many_templates', $page_ref);
+        }
+        else {
+            INFO('good, found one template');
+            $statistics{'one template'}++;
+        }
+    }
+    else {
+        INFO('no templates found');
+        $statistics{'no templates found'}++;
+    }
+
+    if (defined $template) {
+        INFO('has template no_iw');
+        my $date_str = $template->{params}->{date};
+        if (defined $date_str) {
+            INFO('checking cooling date');
+            my $date_ref = parse_date($date_str);
+            if (not defined $date_ref) {
+                INFO("invalid date: '$date_str'");
+                $statistics{'invalid date'}++;
+                special_cases_file('invalid_date', $page_ref);
+            }
+            elsif (cooling_date_passed($date_ref)) {
+                INFO('cooling date passed, updating to today ...');
+                update_cooling_date($page_text_ref);
+                $statistics{'cooling date passed'}++;
+            }
+            else {
+                INFO(q(cooling date did not pass.));
+                $statistics{q(cooling date did not pass)}++;
+            }
+        }
+        else {
+            INFO('date not defined');
+        }
+
+    }
+
+    my @all_types = get_all_types($template->{params}->{type}, $page_ref);
+
+    foreach my $type (@all_types) {
+        INFO('adding ' . $page_ref->title() . " to the list as type $type");
+        add_to_no_iw_list($page_ref, $type);
+        $type_count{$type}++;
+    }
+
+    return 1;
+}
+
+sub get_all_types {
+    my ($type_param, $page) = @_;
+
+    $type_param //= q{};    # /
+    strip_whitespace($type_param);
+
+    my @all_types = split $TYPE_SEP, $type_param;
+
+    my $page_title = $page->title();
+    if (is_category($page)) {
+        INFO("$page_title is a category");
+        push @all_types, get_string('Category');
+        $statistics{'categories'}++;
+    }
+
+    if (is_in_namespace($page, 'Portal')) {
+        INFO("$page_title is a portal");
+        push @all_types, get_string('Portal');
+        $statistics{'portal'}++;
+    }
+
+    if (is_disambig($page)) {
+        INFO("$page_title is a disambiguation");
+        push @all_types, get_string('disambig');
+        $statistics{'disambig'}++;
+    }
+
+    # Still nothing?
+    if (not scalar @all_types) {
+        my $other_type = get_string('other');
+        INFO("$page_title does not have any type, adding to $other_type");
+        @all_types = ($other_type);
+        $statistics{'automatically added to type other'}++;
+    }
+
+    return @all_types;
+}
+
+sub find_templates {
+    my (
+        $text_ref,               # string ref
+        $found_templates_ref,    # array ref
+        $filter                  # string array ref
+    ) = @_;
+
+    # A reference to an array with one empty string.
+    # Matching against an empty string will always succeed.
+    $filter //= [q{}];    # /
+
+    # Get all highest-level matches
+    my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
+
+    MATCH:
+    foreach my $next_match (@matches) {
+        if ($next_match !~ $TEMPLATE_RE) {
+            INFO(q(I thought that it was a template, but it was:));
+            if ($next_match =~ $WIKITABLE_RE) {
+                INFO('a wikitable');
+            }
+            else {
+                INFO("something else: $next_match");
+            }
+            INFO(q{});
+            next MATCH;
+        }
+
+        foreach my $next_filter (@{$filter}) {
+
+            # N.B. - case-insensitive. Wrong, but kinda useful.
+            if ($next_match =~
+                /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
+            {
+
+                # N.B.: parse_template calls find_templates() recursively
+                my $parsed_template =
+                    parse_template(\$next_match, [qw(date type)], $filter);
+                push @{$found_templates_ref}, $parsed_template;
+            }
+        }
+    }
+
+    return $found_templates_ref;
+}
+
+sub parse_template {
+    my (
+        $template,               # string ref
+        $default_param_names,    # string array ref
+        $subtemplate_filter,     # string array ref
+    ) = @_;
+
+    # %parsed_template:
+    # {text} - string ref; might be big
+    # {name} - string
+    # {params} - hash ref
+    # {subtemplates} - array ref
+    my (%parsed_template, %parsed_params, @clauses);
+
+    $parsed_template{text} = strip_template_curlies(${$template});
+
+    # First string of the split is the template name,
+    # the rest is the params
+    ($parsed_template{name}, @clauses) =
+        (split $PARAM_SEP_RE, ${ $parsed_template{text} });
+
+    my $param_counter = 0;
+    foreach my $clause (@clauses) {
+        if ($clause =~ $PARAM_RE) {
+            #<<< no perltidy
+            my ($name, $value) = @+{ qw(param_name value) };
+            #>>>
+            if (defined $name) {
+                if (defined $param_name{$name}) {
+                    $name = $param_name{$name};
+                }
+            }
+            else {
+
+                # Get next default param name.
+                # If a default name is not defined, just use the number
+                $name = $default_param_names->[$param_counter]
+                    || $param_counter;
+            }
+            $parsed_params{$name} = $value;
+        }
+        else {
+            my $error_msg = "Weird - $clause does not look like a param";
+            INFO($error_msg);
+            cluck($error_msg);
+            $statistics{'weird param'}++;
+        }
+        $param_counter++;
+    }
+    $parsed_template{params} = \%parsed_params;
+
+    # Possible recursion
+    find_templates($parsed_template{text}, $parsed_template{subtemplates},
+        $subtemplate_filter);
+
+    return \%parsed_template;
+}
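+# Illustrative sketch (the 'no_iw' name here is hypothetical - the real
+# template name is localized per wiki): for input text like
+#     {{no_iw|type=history|date=12:34, 5 October 2009 (UTC)}}
+# the returned hash is shaped roughly like:
+#     name   => 'no_iw',
+#     text   => a reference to the text between the curlies,
+#     params => { type => 'history', date => '12:34, 5 October 2009 (UTC)' },
+#     subtemplates => parsed inner templates, when any match the filter.
+# An unnamed first parameter would get the default name 'date', the second
+# 'type', per the [qw(date type)] list passed in by find_templates().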
+
+sub parse_date {
+    my ($date_str) = @_;
+
+    return if (not defined $date_str);
+
+    if ($date_str =~ $HEB_DATE_RE) {
+        INFO("found a valid date: $date_str");
+        my %parsed_date = (
+            'sec' => 0,    # useful for timelocal
+        );
+        foreach my $date_part (qw(hour min mday mon year)) {
+            $parsed_date{$date_part} = $+{$date_part};
+        }
+        $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} };
+
+        # strip leading zeros
+        foreach (values %parsed_date) {
+            s{
+                \A
+                0+
+                (?<number>\d+)
+            }
+            {$+{number}}xms;
+        }
+        return \%parsed_date;
+    }
+
+    # Returns undef for an invalid date
+    return;
+}
+
+sub strip_template_curlies {
+    my ($template) = @_;
+    for ($template) {
+        s{
+            \A
+            \Q$MW_SYNTAX{start_template}\E
+        }{}xms;
+        s{
+            \Q$MW_SYNTAX{end_template}\E
+            \z
+        }{}xms;
+    }
+    return \$template;
+}
+
+# no arg unpacking for simplicity and performance
+sub strip_whitespace {    ## no critic (RequireArgUnpacking)
+    for (@_) {
+        s/\A\s*//xms;
+        s/\s*\z//xms;
+    }
+    return;
+}
+
+sub has_interwiki {
+    my ($page) = @_;
+
+    my $page_title = $page->title();
+    my $page_text  = ${ $page->text() };    # XXX
+
+    study $page_text;                       # XXX
+
+    my %iw_links;
+    my %special_cases;
+
+    while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) {    # XXX
+        my ($lang_code, $foreign_article) =
+            @+{qw(lang_code foreign_article)};
+        if (defined $iw_links{$lang_code}) {
+            $special_cases{double_links}->{$lang_code} = q{};
+        }
+        else {
+            $iw_links{$lang_code} = $foreign_article;
+        }
+
+        # A # sign not after an &.
+        # After an & it is probably a character number.
+        if ($foreign_article =~ $SECTION_LINK_RE) {
+            $special_cases{section_links}->{$lang_code} = q{};
+        }
+
+        # Char codes are common in section links, so there is no
+        # need to show them again
+        elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+            $special_cases{charnumber_links}->{$lang_code} = q{};
+        }
+
+        # Lowercase links
+        if (    (not $lang_code ~~ @LOWERCASE_LANGS)
+            and ($foreign_article =~ $LOWERCASE_LINK_RE))
+        {
+            my $include_lowercase_link = 1;
+
+            if (defined get_string('exclude_lowercase', 'if defined')
+                and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
+            {
+                $include_lowercase_link = 0;
+            }
+
+            if ($include_lowercase_link) {
+                $special_cases{lowercase_links}->{$lang_code} = q{};
+            }
+        }
+
+        $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{};
+    }
+
+    my @all_langs = keys %iw_links;
+    my $count_iw  = scalar @all_langs;
+    if ($count_iw) {
+        if (not defined $statistics{count_iw}->[$count_iw]) {
+            $statistics{count_iw}->[$count_iw] = [];
+        }
+        push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
+    }
+    INFO("iw link count for $page_title is: $count_iw");
+
+    for my $special_case_name (keys %special_cases) {
+        if (scalar %{ $special_cases{$special_case_name} }) {
+            special_cases_file($special_case_name, $page,
+                $special_cases{$special_case_name});
+        }
+    }
+
+    # BIG XXX Still very stupid, but getting better
+    if (defined $iw_links{en}) {
+        return 'en';
+    }
+
+    return q{};
+}
+
+sub special_cases_file {
+    my ($special_case_name, $page, $special_cases_ref) = @_;
+    $special_cases_ref //= {};    # /
+
+    my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
+    if ($special_case_langs) {
+        $special_case_langs = " ($special_case_langs)";
+    }
+
+    my $special_case_fn = make_type_fn($special_case_name, 1);
+
+    if (not -e $special_case_fn) {
+        append_to_file($special_case_fn, $special_case_name);
+    }
+
+    my $page_title = $page->title();
+    my $link       = make_link($page_title);
+    my $line =
+          $link
+        . $special_case_langs
+        . $FIELD_SEP
+        . get_sort_title($page_title);
+
+    append_to_file($special_case_fn, $line);
+
+    return;
+}
+
+sub remove_template_no_iw {
+    my ($params) = @_;
+    INFO( "Supposed to remove the no_iw template now, but ...\n"
+        . 'This sub is a stub. You can help Wikipedia by expanding it!');
+
+    return 0;
+}
+
+sub cooling_date_passed {
+    my ($date_ref) = @_;
+
+    my @page_times = @{$date_ref}{qw(sec min hour mday mon year)};
+    INFO("page times: @page_times");
+
+    my $pageseconds = timelocal(@page_times);
+    return $pageseconds < $LATEST_COOLING;
+}
+
+sub update_cooling_date {
+    my ($params) = @_;
+
+    INFO( "Supposed to update cooling date now, but ...\n"
+        . "This sub is a stub. You can help Wikipedia by expanding it!\n"
+    );
+
+    return 0;
+}
+
+# Just a debugging thingie
+sub print_parsed_template {
+    my ($template_ref) = @_;    # hash ref
+
+    INFO('text:');
+    INFO(${ $template_ref->{text} });
+    INFO('name:');
+    INFO($template_ref->{name});
+    INFO('params:');
+    foreach my $next_param (sort keys %{ $template_ref->{params} }) {
+        INFO("$next_param: $template_ref->{params}->{$next_param}");
+    }
+
+    if (defined $template_ref->{subtemplates}) {
+        INFO("subtemplates: $template_ref->{subtemplates}");
+    }
+
+    return;
+}
+
+sub get_sort_title {
+    my ($page_title) = @_;
+
+    my $sort_title;
+    if ($page_title =~ $PURE_TITLE_RE) {
+        $sort_title = $+{pure_title};
+    }
+    else {
+        INFO( 'Something badly weird happened - '
+            . "$page_title has no pure title");
+        croak('A page has no pure title');
+    }
+
+    return $sort_title;
+}
"$page_title has no pure title"); + croak('A page has no pure title'); + } + + return $sort_title; +} + +sub add_to_no_iw_list { + my ( + $page, # ref + $page_type, # string + ) = @_; + + my $page_title = $page->title(); + INFO("Adding $page_title to no_iw_list"); + + my $sort_title = get_sort_title($page_title); + INFO("full title: $page_title"); + INFO("sort title: $sort_title"); + + my $unsorted_type_fn = make_type_fn($page_type, 1); + if (not -e $unsorted_type_fn) { + append_to_file($unsorted_type_fn, $page_type); + } + my $link = make_link($page_title); + INFO("link to page: $link"); + my $line = $link . $FIELD_SEP . $sort_title; + + append_to_file($unsorted_type_fn, $line); + + return; +} + +sub make_link { + my ($page_title) = @_; + + my $link_to_page = + $MW_SYNTAX{start_link} + . $MW_SYNTAX{namespace_sep} + . $page_title + . $MW_SYNTAX{end_link}; + + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; + } + + return $link_to_page; +} + +sub create_no_iw_pages { + my ($params) = @_; + + INFO("\ncreating no_iw pages"); + + # Run over page types + UNSORTED_TYPE_FN: + foreach my $unsorted_type_fn (glob "$UNSORTED_DIR/*") { + my %all_pages_in_type = (); + open my $unsorted_type_file, '<', $unsorted_type_fn + or croak(file_error('opening', $unsorted_type_fn, 'reading')); + my @lines = <$unsorted_type_file>; + close $unsorted_type_file + or croak(file_error('closing', $unsorted_type_fn, 'reading')); + + my $type_name = shift @lines; + chomp $type_name; + foreach my $line (@lines) { + chomp $line; + my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line; + my $sort_letter = get_sort_letter($sort_title); + $all_pages_in_type{$sort_letter} //= []; # / + push @{ $all_pages_in_type{$sort_letter} }, $page_title; + } + write_sorted_pages($type_name, \%all_pages_in_type); + } + return; +} + +sub write_sorted_pages { + my ($type_name, $type_tree_ref) = @_; + + my $type_fn = make_type_fn($type_name); + + my $section_counter = 0; + my $page = q{}; + my $file_number = 1; + + LETTER: + foreach my $next_letter (sort keys %{$type_tree_ref}) { + my @all_links_in_letter = + sort @{ $type_tree_ref->{$next_letter} }; + + my $links_count = scalar @all_links_in_letter; + my $sections = + ($links_count - 1) / $option{max_links_per_section} + 1; + my $links_per_section = $links_count / $sections; + + SECTION: + foreach my $section_num (1 .. $sections) { + if ($section_counter == $option{max_sections_per_page}) { + write_page(\$page, \$type_fn, $file_number++); + $section_counter = 0; + + $page = q{}; + } + elsif ($section_counter) { + $page .= $MW_SYNTAX{paragraph}; + } + $section_counter++; + my $title = $next_letter; + if ($sections > 1) { + $title .= " $section_num"; + } + $page .= mw_heading($title); + + my $first_link = ($section_num - 1) * $links_per_section; + my $last_link = + ($section_num == $sections) + ? $links_count - 1 + : $first_link + $links_per_section - 1; + my $links = join_links( + [ @all_links_in_letter[ $first_link .. 
+
+sub is_redirect {
+    my ($page) = @_;
+
+    if ($page->redirect()) {
+        return 'English';
+    }
+
+    my $page_text_ref = $page->text();
+
+    if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+        return 'local';
+    }
+
+    return q();
+}
+
+sub is_in_namespace {
+    my ($page, @namespaces) = @_;
+
+    return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+}
+
+sub is_category {
+    my ($page) = @_;
+    return is_in_namespace($page, 'Category');
+}
+
+sub is_disambig {
+    my ($page) = @_;
+    my $found_templates =
+        find_templates($page->text(), [], [ get_string('disambig') ]);
+    return scalar @{$found_templates};
+}
+
+#
+# This is poor man's gettext.
+# TODO: Replace it with Locale::Maketext
+#
+sub get_strings {
+    my ($lang) = @_;
+
+    my $STRINGS_FN = "$lang.strings.txt";
+
+    open my $STRINGS_FILE, '<:utf8', $STRINGS_FN
+        or croak(file_error('opening', $STRINGS_FN, 'reading'));
+    my @strings_file_lines = <$STRINGS_FILE>;
+    close $STRINGS_FILE
+        or croak(file_error('closing', $STRINGS_FN, 'reading'));
+
+    STRING_LINE:
+    foreach my $next_string_line (@strings_file_lines) {
+
+        # Skip blanks and comments
+        next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
+
+        chomp $next_string_line;
+        my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
+
+        # Fall back to English if no target language string was supplied
+        $STRING{$english} = $target // $english;    # /
+    }
+
+    return;
+}
+
+sub get_string {
+    my ($english, $if_defined) = @_;
+    return $if_defined
+        ? ($STRING{$english})
+        : ($STRING{$english} //= $english);    # /
+}
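+# For example, get_string('Category') returns the localized namespace name
+# loaded from the strings file, or 'Category' itself when no translation
+# was supplied; with $if_defined set, a missing string yields undef instead
+# of being cached as its own translation.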
+
+sub make_type_fn {
+    my ($type, $unsorted) = @_;
+    $unsorted //= 0;    # /
+
+    #my $transliterated_type = $TRANSLITERATOR->translit($type);
+    my $transliterated_type = $type;
+
+    my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
+
+    $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
+    my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
+    $type_fn = "$dir/$type_fn";
+
+    return $type_fn;
+}
+
+sub append_to_file {
+    my ($fn, $line) = @_;
+
+    open my $file, '>>:utf8', $fn
+        or croak(file_error('opening', $fn, 'appending'));
+
+    say {$file} ($line // q{});    # /
+
+    close $file
+        or croak(file_error('closing', $fn, 'appending'));
+
+    return;
+}
+
+# It appears simple, but in non-alphabetic languages such as Chinese
+# it may be different, so it will sit here ready for better i18n.
+sub get_sort_letter {
+    my ($string) = @_;
+    return substr $string, 0, 1;
+}
+
+sub file_error {
+    my ($operation, $fn, $access_type) = @_;
+    my $string = "error $operation $fn for $access_type: $OS_ERROR";
+    return $string;
+}
+
+sub format_link_table {    ## no critic (RequireArgUnpacking)
+    return sprintf '%-15s %8d', @_;
+}
+
+sub print_multi_links_by_foreign {
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+            if (scalar @local_articles > 1) {
+                my $links = join q{ | }, sort map { make_link($_) }
+                    keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+                my $foreign_title =
+                    make_link($lang_code
+                        . $MW_SYNTAX{namespace_sep}
+                        . $foreign_article);
+                append_to_file($filename, "* '''$foreign_title''' - $links");
+            }
+        }
+    }
+
+    return;
+}
+
+sub print_multi_links_by_local {
+    my %local_multi_links;
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+            if (scalar @local_articles > 1) {
+                add_local_multi(
+                    \%local_multi_links,
+                    make_link(
+                        $lang_code
+                            . $MW_SYNTAX{namespace_sep}
+                            . $foreign_article
+                    ),
+                    @local_articles
+                );
+            }
+        }
+    }
+
+    say 'writing local multilinks file';
+    my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
+    foreach my $local_multi_article (sort keys %local_multi_links) {
+        append_to_file($filename,
+            '* ' . mw_bold(make_link($local_multi_article)));
+
+        foreach my $other_local_article (
+            sort keys %{ $local_multi_links{$local_multi_article} })
+        {
+            append_to_file($filename,
+                '** ' . make_link($other_local_article));
+            my $foreign_articles = join_links(
+                $local_multi_links{$local_multi_article}
+                    ->{$other_local_article},
+                0
+            );
+            append_to_file($filename, "*** $foreign_articles");
+        }
+        append_to_file($filename);
+    }
+
+    return;
+}
+
+sub add_local_multi {
+    my (
+        $local_multi_links_ref, $foreign_link,
+        $first_local_article,   @other_local_articles
+    ) = @_;
+
+    $local_multi_links_ref->{$first_local_article} //= {};    # /
+
+    foreach my $other_local_article (@other_local_articles) {
+        $local_multi_links_ref->{$first_local_article}
+            ->{$other_local_article} //= [];    # /
+        push @{ $local_multi_links_ref->{$first_local_article}
+                ->{$other_local_article} }, $foreign_link;
+    }
+
+    return;
+}
+
+sub join_links {
+    my ($links_ref, $line_end) = @_;
+    $line_end //= 1;    # /
+
+    my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ });
+    return join $link_sep, @{$links_ref};
+}
+
+sub print_stats {
+    INFO("\nSUMMARY");
+    INFO("total time: $total_time");
+    foreach my $stat_type (sort keys %statistics) {
+        if (not ref $statistics{$stat_type}) {
+            INFO("$stat_type: $statistics{$stat_type}");
+        }
+    }
+
+    my $max_iw_index = $#{ $statistics{count_iw} };
+    INFO("max_iw_index: $max_iw_index");
+    MAX_IW:
+    for my $max_iw_place (0 .. $option{max_iw_places}) {
+        my @links =
+            map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+            or last MAX_IW;
+        INFO("# $max_iw_index: " . join_links(\@links, 0));
+
+        # Do nothing, just count down to the next index with a defined list.
+        # $max_iw_index needs to be checked for nonzero-ness for the rare case
+        # of a very low page count.
+        while ($max_iw_index
+            and not defined $statistics{count_iw}->[ --$max_iw_index ])
+        {
+        }
+    }
+
+    INFO('pages without interwiki links per namespace');
+    foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
+        my $iwless_in_namespace =
+            $statistics{'has no interwiki link'}->{$namespace};
+
+        no integer;
+        ## no critic (ProhibitMagicNumbers)
+        my $percentage = sprintf '%.2f',
+            100 * $iwless_in_namespace / $namespace_count{$namespace};
+        ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
+        use integer;
+
+        INFO("$namespace: $iwless_in_namespace, $percentage%");
+    }
+
+    INFO("\nNAMESPACES");
+    foreach my $namespace (sort keys %namespace_count) {
+        INFO("$namespace: $namespace_count{$namespace}");
+    }
+
+    INFO("\nTYPES");
+    foreach my $type (sort keys %type_count) {
+        INFO("$type: $type_count{$type}");
+    }
+
+    my $dump_size = -s $dump_fn;
+    my $log_size  = -s $LOG_FN;
+    INFO("dump size: $dump_size");
+    INFO("log size: $log_size");
+    if ($log_size > 0) {
+        my $dump_log_ratio = $dump_size / $log_size;
+        INFO("dump/log ratio: $dump_log_ratio");
+    }
+    else {
+        WARN("weird: the size of log file $LOG_FN is 0");
+    }
+
+    return;
+}
+
+__END__
+
+=head1 NAME
+
+C<prepare_noiw_list.pl> - Searches a MediaWiki dump for pages without
+interlanguage links and prepares categorized lists of those pages.
+
+=head1 VERSION
+
+This documentation refers to
+
+prepare_noiw_list.pl
+
+version 0.2.1 - Noa.
+
+=head1 USAGE
+
+=over
+
+=item * C<prepare_noiw_list.pl ./big-files/ruwiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --rtl ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --stop_after=20000 ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=back
+
+=head1 REQUIRED ARGUMENTS
+
+=over
+
+=item * MediaWiki dump file name is required
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item * --rtl makes special fixes for right-to-left languages.
+
+=item * --stop_after=NUMBER Stops processing after the page with the given
+NUMBER.
+
+=item * --start_from=NUMBER Begins processing at the page with the given
+NUMBER.
+
+=item * --page_freq=NUMBER Print the page counter every NUMBER of
+pages. 1000 by default.
+
+=item * --max_links_per_section Maximum number of links per section in
+the output page. Default is 100.
+
+=item * --max_sections_per_page Maximum number of sections per output
+page. Default is 20.
+
+=item * --max_iw_places Number of places to print in the statistics of
+pages with the most interlanguage links.
+
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
+=back
+
+=head1 DESCRIPTION
+
+The main goal of this program is to find pages which do not have
+interwiki (interlanguage) links to certain languages.
+
+This program scans a MediaWiki XML dump file. It searches every page for
+interwiki links and for a special hidden template, which defines to which
+types of interwiki-less pages the page belongs. An example of such a
+template appears after this list.
+
+=over
+
+=item * If the page contains links to the defined languages and does not
+contain the "no interwiki" template, its processing stops.
+
+=item * If the page contains links to the defined languages and contains
+this template, it is logged, so the template can be removed. (It is planned
+that it will be removed automatically in the future.)
+
+=item * If the page contains no links to the defined languages and does not
+contain the template, it is automatically added to type "other".
+
+=item * If the page contains no links to the defined languages and contains
+a template with types, it is added to the defined types.
+
+=back
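+
+For illustration, a hedged example of what the hidden template may look
+like (the template name and the parameter values are localized per wiki,
+so the exact form varies):
+
+    {{no_iw|type=history; geography|date=13:07, 1 October 2009 (UTC)}}
+
+Multiple types are separated by a semicolon, which matches the type
+separator the parser uses.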
+
+Pages without links are added to nicely formatted lists according
+to their type.
+
+This program also collects some information on the way about problematic
+cases - more than one "no interwiki" template, invalid templates, invalid
+or redundant interwiki links etc.
+
+You can get a dump to use with this program at
+L<http://download.wikimedia.org/>.
+
+=head1 DIAGNOSTICS
+
+=head2 Invalid command line options.
+
+See L</"REQUIRED ARGUMENTS"> and L</"OPTIONS">.
+
+=head2 FILENAME is a weird dump file name.
+
+The dump file does not appear to have a standard name of the kind that
+appears at L<http://download.wikimedia.org/>.
+
+=head2 error opening FILENAME ...
+
+=head2 error closing FILENAME ...
+
+Check the corresponding error.
+
+=head2 unable to handle any case setting besides 'first-letter'
+
+Something is weird with the dump. See the documentation of
+L<Parse::MediaWikiDump> and MediaWiki.
+
+=head2 A page has no pure title
+
+Something is particularly weird with the name of a page. The program cannot
+separate its name from its namespace. It can also be a bug in this program.
+
+=head2 Weird - STRING does not look like a param
+
+STRING is supposed to be a parameter in a template, but it does not look
+like one. It could be an error in the template, and also a bug in this
+program (the parser that this program employs is rather stupid).
+
+=head2 Unicode character 0xNUMBER is illegal
+
+This is a standard Perl warning. It may appear if a page or its title has
+funky Unicode characters which should not be there according to the Unicode
+standard (to be more precise, according to the implementation of this
+standard in your version of perl). Most probably these characters are not
+supposed to be in the page and should be fixed, but otherwise this issue
+is not supposed to affect the functionality of this program significantly.
+
+This was reported as a MediaWiki bug:
+L<https://bugzilla.wikimedia.org/show_bug.cgi?id=14600>
+
+=head1 EXIT STATUS
+
+Nothing interesting, read the log :)
+
+=head1 CONFIGURATION
+
+=head2 Language codes file
+
+The file name has the form C<LANGUAGE_CODE.language_codes.txt>, where
+LANGUAGE_CODE is the code of the wiki which is being processed. It includes
+a list of the language codes which are searched, the full name of each
+language and the group to which the language belongs.
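+
+A minimal illustrative example (columns separated by a single tab; these
+rows are made up, not a required set):
+
+    en	English	Germanic
+    ru	Russian	Slavic
+
+Only the code and the name are currently used by the loader; a missing
+name falls back to the code itself.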
+
+=head2 Localized strings file
+
+The file name has the form C<LANGUAGE_CODE.strings.txt>, where LANGUAGE_CODE
+is the code of the wiki which is being processed. This file includes pairs
+of strings - the English name of a feature and its localized counterpart.
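+
+A minimal illustrative example (tab-separated English/localized pairs; the
+Russian values here are only for demonstration):
+
+    REDIRECT	перенаправление
+    Category	Категория
+
+A line with no localized value falls back to the English string.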
+
+=head2 Log
+
+The log is called outtest.log.
+
+=head1 DEPENDENCIES
+
+This module requires these CPAN modules:
+
+=over
+
+=item * C<Parse::MediaWikiDump>
+
+This module is used for reading pages from the XML dump.
+
+=item * C<Regexp::Common>
+
+This module is used for searching for templates which use balanced
+parentheses, but pretty badly. :)
+
+=item * C<Log::Log4perl>
+
+This module provides easy global logging.
+
+=item * C<Lingua::Translit>
+
+This module is used for transliterating filenames to ASCII.
+
+=item * C<Readonly>
+
+To make Perl::Critic happy :)
+
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but I
+love style guides.
+
+=back
+
+=head1 HACKING
+
+=head2 Perl 5.10
+
+This program requires Perl 5.10. It has new clean and useful syntax, which
+makes the programs easier to hack, maintain and debug. It is useless to try
+to run it on an older version, unless you want to waste your time
+backporting. Please upgrade your Perl installation if you still have 5.8 or
+(horrors!) something older.
+
+=head2 Perl Best Practices, Perl::Critic and perltidy
+
+Great effort has been put into making this source code pass the
+Perl::Critic tests in 'brutal' mode as cleanly as possible. It also uses
+perltidy for automatic code formatting. If you modify it, do yourself a
+favor: install Perl::Critic and regularly test the code using this command:
+
+    ./tidy.sh prepare_noiw_list.pl
+
+It checks the syntax, runs perltidy on the code and runs Perl::Critic.
+
+All the places where P::C has been disabled using "no critic" are explained.
+
+The time invested in making the code P::C-friendly will be returned as time
+saved on debugging. Also consider reading the book "Perl Best Practices" by
+Damian Conway if you have not already.
+
+=head1 INCOMPATIBILITIES
+
+=head2 Unicode issues
+
+This program works best on GNU/Linux, where Perl and the filesystem are
+Unicode-friendly.
+
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
+filenames and clipboard text to become jumbled. You have been warned.
+
+=head1 BUGS AND LIMITATIONS
+
+Please report all bugs, feature requests and other comments to
+Amir E. Aharoni (ami...@ma...).
+
+=head2 There is no equality between languages
+
+Currently this program actually only lists pages which do not have
+an interwiki link to the English Wikipedia. This is obviously not useful on
+the English Wikipedia and is conceptually problematic on other Wikipedias,
+too. This is being fixed, but it is not simple to do it Right.
+
+Goal: v0.4 Reut
+
+=head2 Internationalization is far from perfect
+
+Date handling and string localization are very primitive. There are plans
+to upgrade them to smarter modules such as Locale::Maketext.
+
+Goal: v0.6 Itay
+
+=head2 MediaWiki parsing is ad hoc
+
+This program only does very rudimentary and ad hoc MediaWiki syntax parsing.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Templates are removed semi-automatically
+
+Templates on pages which already have the needed links are not removed
+automatically. A list of them is created, and a bot can run on it and
+remove the outdated templates. This could be done automatically in the
+future.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Cooling date
+
+The implementation of the cooling date is very rudimentary.
+
+Goal: v0.4 Reut
+
+=head2 Major refactoring is needed
+
+=over
+
+=item * The main code is on the brink of exceeding the complexity threshold
+that P::C accepts.
+
+=item * There is no separation of searching and formatting. There are two
+main functions here: C<process_dump()> and C<create_no_iw_pages()>. They
+are doing separate things and should run from different programs.
+
+=item * Statistics and multi links are just dumped into the log.
+
+=item * At least some of the code can be rewritten as classes that inherit
+from L<Parse::MediaWikiDump>.
+
+=back
+
+Goal: v0.8 Moshe
+
+=head2 There is no test suite
+
+That can be done after proper modularization. Also, a local test MediaWiki
+server would be needed.
+
+Goal: v1.0 Drora
+
+=head1 HISTORY
+
+=over
+
+=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty
+categories sorting. Memory usage optimization - accumulating information in
+files. More generic, but far-from-perfect handling of links to languages
+other than English. Transliteration with Lingua::Translit. Logging with
+Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not
+finished: complete statistics, removal of templates from pages which
+already have links.
+
+=item * B<0.1 - First and unnamed Amir E. Aharoni's version>: Types
+introduced. Conceptual l10n, but only tested on he.wiki. Still en links
+only. Informative, though low-tech, logging.
+
+=item * B<0.0 - Felagund's version>: no types, he.wiki only, en links only,
+but it uploaded the lists automatically...
+
+=back
+
+=head1 AUTHOR
+
+=over
+
+=item * Creator is Nadav Perez (Felagund)
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Felagund>
+
+=back
+
+=item * It is (roughly) based on another bot by Guy Shaked (Costello).
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Costello>
+
+=back
+
+=item * Then Amir E. Aharoni (Amire80) came and mutilated it beyond
+recognition.
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Amire80>
+
+=item * English: L<http://en.wikipedia.org/wiki/User:Amire80>
+
+=item * Russian: L<http://ru.wikipedia.org/wiki/User:Amire80>
+
+=back
+
+=back
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
+I<Visca la llibertat!>
+
+=cut
+
+1;

Property changes on: trunk/no-interwiki/Maintenance.pm
___________________________________________________________________
Added: svn:executable
   + *