[PerlWikiBot] SF.net SVN: perlwikibot:[84] trunk/no-interwiki/Maintenance.pm
Status: Pre-Alpha
Brought to you by:
rotemliss
|
From: <am...@us...> - 2009-10-01 11:39:18
|
Revision: 84
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=84&view=rev
Author: amire80
Date: 2009-10-01 11:39:00 +0000 (Thu, 01 Oct 2009)
Log Message:
-----------
Starting the big refactoring.
Added Paths:
-----------
trunk/no-interwiki/Maintenance.pm
Added: trunk/no-interwiki/Maintenance.pm
===================================================================
--- trunk/no-interwiki/Maintenance.pm (rev 0)
+++ trunk/no-interwiki/Maintenance.pm 2009-10-01 11:39:00 UTC (rev 84)
@@ -0,0 +1,1755 @@
+package Maintenance;
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
+use 5.010;
+
+# pragmata
+use strict;
+use warnings;
+use integer; # Improves performance
+use open ':utf8';
+use utf8;
+use charnames ':full';
+
+# Standard library
+# These modules should come installed with Perl
+use English qw(-no_match_vars);
+use Carp qw(carp cluck confess croak);
+use Time::Local;
+# use Data::Dumper;
+use File::Basename;
+
+# CPAN
+# You must install these modules from CPAN
+
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
+use Readonly;
+# use Pod::Usage;
+use Regexp::Common;
+# use Lingua::Translit;
+use Log::Log4perl qw(:easy);
+use Parse::MediaWikiDump 0.91; # Earlier versions have a different API
+
+our $VERSION = '0.0.1';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+ Revision => '$Revision: 83 $',
+ HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/prepare_noiw_list.pl $',
+ Date => '$Date: 2009-09-29 15:25:11 +0200 (Tue, 29 Sep 2009) $',
+);
+#>>>
+
+my $LANG_CODE;
+my %STRINGS;
+my %LANG_CODE;
+my $WIKI_LANG;
+
+# Initialize the module: derive the wiki language from the dump file
+# name, load the localized strings and interwiki language codes, set up
+# logging, and (re)create the output directories.
+# BUG FIXES: the original interpolated the undeclared $dump_fn and
+# $wiki_lang (should be $args{dump_fn} and $WIKI_LANG) and called the
+# nonexistent load_strings() (the loader in this file is get_strings()).
+sub init {
+    my (%args) = @_;
+
+    if ((basename $args{dump_fn}) =~ $DUMP_FN_RE) {
+        $WIKI_LANG = $+{wiki_lang};
+        get_strings($WIKI_LANG);
+    }
+    else {
+        croak("$args{dump_fn} is a weird dump file name.");
+    }
+
+    # Load the table of interwiki language codes for this wiki.
+    my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt";
+    open my $lang_code_file, '<', $LANG_CODE_FN
+        or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
+    while (my $line = <$lang_code_file>) {
+        chomp $line;
+        my ($code, $name) = split /\t/xms, $line;
+        if (defined $code) {
+            $LANG_CODE{$code} = $name // $code; # /
+        }
+    }
+    close $lang_code_file
+        or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
+
+    init_logger('outtest.log');
+
+    # TODO: Make smarter, configurable, whatever
+    # $OUT_DIR must be first, because it's the parent
+    # NOTE(review): $OUT_DIR and friends are declared with `my` *after*
+    # this sub in the file, so they are undef if init() runs at load
+    # time — confirm the intended declaration order.
+    foreach my $out_dir ($OUT_DIR, $UNSORTED_DIR, $MULTI_DIR) {
+        if (-d $out_dir) {
+            unlink glob "$out_dir/*$WIKITEXT_EXT";
+        }
+        else {
+            mkdir $out_dir;
+        }
+    }
+
+    return;
+}
+
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $ALT_SEP = q{|};
+my $FIELD_SEP = qq{\t};
+my $LINK_SEP = q{|};
+my $TYPE_SEP = qr/\s*;\s*/xms;
+my $HTML_RLM = '‏';
+
+Readonly our $DEFAULT_PAGE_FREQ => 1000;
+Readonly our $DEFAULT_MAX_LINKS_PER_SECTION => 100;
+Readonly our $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
+
+# Set up Log::Log4perl in "easy" mode: INFO level, UTF-8 log file,
+# message-only layout.  $INFO comes from Log::Log4perl's :easy export.
+# NOTE(review): requires Log::Log4perl to actually be loaded — confirm.
+sub init_logger {
+    my ($log_fn) = @_;
+    Log::Log4perl->easy_init(
+        {
+            level => $INFO, # print everything
+            file => ":utf8>$log_fn", # utf is important
+            layout => '%m%n', # No need to print the date
+        }
+    );
+    return;
+}
+
+# XXX Too coupled to Wikipedia, won't work for other projects.
+our $DUMP_FN_RE = qr{
+ \A # Begin string
+ (?<wiki_lang>\w+) # Lang code
+ wiki # Is supposed to be after the lang code
+}xms;
+
+our $FIELD_SEP_RE = qr{\t}xms;
+
+our $STRING_SKIP_RE = qr{
+ \A # Begin string
+ \s* # Zero or more spaces
+ (?:\#.*)? # Comment lines
+ \z # End string
+}xms;
+
+my %STRING;
+
+# XXX - bad i18n
+# ISO 9 is mostly good for Russian and it is still not perfect ASCII
+# my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
+
+our %MW_SYNTAX = (
+ 'start_template' => '{{',
+ 'end_template' => '}}',
+ 'start_link' => '[[',
+ 'end_link' => ']]',
+ 'param_sep' => q{|},
+ 'paragraph' => qq{\n\n},
+ 'start_wikitable' => '{|',
+ 'namespace_sep' => q{:},
+);
+
+# This monstrosity basically says: | and optional spaces
+our $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+
+#
+# Constants for date processing
+#
+
+our @MONTHS = @STRING{
+ qw(
+ January February March
+ April May June
+ July August September
+ October November December
+ )
+ };
+my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
+my %REV_MONTH;
+Readonly my $LAST_MONTH => 11;
+@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
+
+# XXX Internationalize
+my $string_in = get_string('in');
+our $HEB_DATE_RE = qr{
+ \A # begin
+ (?<hour>\d{1,2}) # hour
+ : # :
+ (?<min>\d{2}),\s # minute
+ (?<mday>\d{1,2})\s # day of month
+ $string_in? # This preposition appears sometimes
+ (?<mon>$ALT_MONTHS)\s # A name of a month
+ (?<year>\d+?)\s # Year
+ \([A-Z]{3}\) # Three letters in brackets - timezone
+ \z # end
+}xms;
+
+our $PARAM_RE = qr{
+ \A # Beginning of a string
+ \s* # Zero or more space
+ (?: # No capture
+ (?<param_name>\w+) # Word chars - param name
+ \s* # Zero or more space
+ = # =
+ )? # Param name is optional
+ (?<value>.*) # value
+}xms;
+
+# A simplistic template just for testing.
+# Quite possibly it is not needed anymore.
+# Until i get a better regex for matching balancing {{}} ...
+our $TEMPLATE_RE = qr{
+ \A # beginning of string
+ \Q$MW_SYNTAX{start_template}\E # {{
+ .+ # some chars
+ \Q$MW_SYNTAX{end_template}\E # }}
+ \z # end of string
+}xms;
+
+our $WIKITABLE_RE = qr{
+ \A
+ \Q$MW_SYNTAX{start_wikitable}\E
+}xms;
+
+# Redirect in local language.
+# Regular expression mostly copied from
+# Parse::MediaWikiDump::page::redirect
+# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
+my $LOCAL_REDIRECT_RE = qr{
+ \A # Beginning of string (page)
+ \# # a # character
+ $local_redirect # Redirect keyword in local language
+ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
+}xmsi;
+
+my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation
+my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
+
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
+
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+my $EXCLUDE_LOWERCASE_RE = qr{
+ \A # Beginning of foreign article name
+ [$string_exclude_lowercase] # Character class of possibly lowercase chars
+}xms;
+my $NUMBERED_FILE_RE = qr{
+ (?: _ \d*)?
+ \.$WIKITEXT_EXT
+}xms;
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # "
+
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
+ (?:
+ [%.] # There are both %C4%B0 and .AA.E0
+ [[:xdigit:]]{2} # 2 hex digits
+ )
+}xms;
+
+my $HTML_CHARNUMBER_RE = qr{
+ (?:
+ &\#\d+; # stuff like Š
+ )
+}xms;
+
+# TODO: Check whether it is Neapolitan with its ''
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+ $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
+}xms;
+
+# Stub left over from the refactoring: language-code loading currently
+# happens inline in init().  Does nothing yet.
+sub load_lang_codes {
+    my ($wiki_lang) = @_;
+
+}
+
+my $string_no_iw = get_string('no_iw');
+my $SIMPLE_NO_IW_CHECK_RE = qr{
+ \Q$string_no_iw\E # The string may have spaces
+}xmsi;
+
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
+
+my $INTERWIKI_LINK_RE = qr{
+ \Q$MW_SYNTAX{start_link}\E
+ (?<lang_code>
+ $ALT_LANGS
+ )
+ :
+ (?<foreign_article>
+ .+?
+ )
+ \Q$MW_SYNTAX{end_link}\E
+}xms;
+
+# Lojban allows lowercase articles
+my @LOWERCASE_LANGS = qw(jbo);
+
+# Module-level driver: load the dump, build a title-parsing regex from
+# the dump's own namespace names, process every page, print statistics
+# and the no-interwiki / multi-link reports, then exit.
+# NOTE(review): this code runs when the module is loaded (use/require),
+# which is unusual for a .pm file — presumably a refactoring leftover
+# from prepare_noiw_list.pl; confirm.
+our $the_dump = load_dump();
+
+my $namespaces_alt = join $ALT_SEP,
+    grep { length > 0 } @{ $the_dump->namespaces_names() };
+my $PURE_TITLE_RE = qr{
+    \A
+    (?:
+        (?:$namespaces_alt)
+        $MW_SYNTAX{namespace_sep}
+    )?                          # Namespace name is optional
+    (?<pure_title>.+)
+    \z
+}xms;
+
+our %statistics = (count_iw => []);
+
+my $begin_time = time;
+process_dump();
+my $total_time = time - $begin_time;
+
+print_stats();
+
+create_no_iw_pages();
+
+INFO(q{});
+
+# my @found_lang_codes = sort keys %found_links;
+# INFO("found lang_codes: @found_lang_codes");
+
+my $begin_multi_links_time = time;
+
+say "\nlisting multi links by language";
+print_multi_links_by_foreign();
+
+say "\nlisting multi links by local articles";
+print_multi_links_by_local();
+
+my $total_multi_links_time = time - $begin_multi_links_time;
+INFO("total multi links time: $total_multi_links_time");
+
+exit;
+
+# Open the MediaWiki XML dump with Parse::MediaWikiDump and return the
+# dump object.  The known-good version (0.91) uses the older Pages API;
+# any other version uses the newer revisions() API, with a warning for
+# versions below 0.94 whose redirect handling may be broken.
+# NOTE(review): $dump_fn is not declared anywhere in this chunk —
+# confirm it exists elsewhere.
+sub load_dump {
+    Readonly my $WORKING_PMWD_VER => 0.91;
+    Readonly my $FUTURE_PMWD_VER => 0.94;
+
+    my $dump;
+
+    if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
+        $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+    }
+    else {
+        if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
+            carp( 'You are running Parse::MediaWikiDump version '
+                    . $Parse::MediaWikiDump::VERSION
+                    . ".\n Redirect handling may be broken\n");
+        }
+        my $pmwd = Parse::MediaWikiDump->new();
+        $dump = $pmwd->revisions($dump_fn);
+    }
+
+    # This is the only currently known value
+    # but there could be more in the future
+    if ($dump->case() ne 'first-letter') {
+        croak(q{unable to handle any case setting besides 'first-letter'});
+    }
+
+    return $dump;
+}
+
+# Return the page's namespace name, falling back to the localized
+# "article space" label for the main (empty) namespace.
+sub namespace {
+    my ($page) = @_;
+    my $ns_name = $page->namespace();
+    return $ns_name ? $ns_name : get_string('article space');
+}
+
+# Fetch the next page object from the dump, converting any parse error
+# into an exception with a stack backtrace.  Returns undef at end of
+# dump (whatever $dump->next() returns).
+sub next_page {
+    my ($dump) = @_;
+    my $page_ref = eval { $dump->next(); };
+    if ($EVAL_ERROR) {
+        confess("Failed reading a page: $EVAL_ERROR");
+    }
+    return $page_ref;
+}
+
+# Main loop over the dump: honor the start_from/stop_after options,
+# skip empty pages, excluded namespaces, redirects and internal portal
+# subpages, then check each remaining page for interwiki links and
+# dispatch interwiki-less pages to process_iwless_page().
+# NOTE(review): $page_counter, %option, @INCLUDE_NAMESPACES and
+# %namespace_count are not declared anywhere in this chunk — confirm
+# they exist elsewhere in the refactored file.
+sub process_dump {
+  PAGE:
+    while (my $page_ref = next_page($the_dump)) {
+        $page_counter++;
+        if ($page_counter % $option{page_freq} == 0) {
+            say $page_counter;
+        }
+
+        if (    $option{stop_after}
+            and $page_counter > $option{stop_after})
+        {
+            last PAGE;
+        }
+
+        my $page_text_ref = $page_ref->text();
+
+        if (
+            $page_counter < $option{start_from}
+            or not defined ${$page_text_ref} # must be tested before redirect
+            or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+            or is_redirect($page_ref)
+            )
+        {
+            next PAGE;
+        }
+
+        my $page_namespace = namespace($page_ref);
+        $namespace_count{$page_namespace}++;
+
+        my $page_title = $page_ref->title();
+
+        INFO("\n* processing $page_counter - ", $page_title);
+
+        # TODO: Be more precise here.
+        # Portal pages which have a '/' in their name are probably
+        # internal and do not need interwiki links.
+        if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+            INFO('internal portal, skipping');
+            $statistics{'internal portal'}++;
+            next PAGE;
+        }
+
+        # A simple sanity check: is the no_iw template anywhere around here?
+        my $has_template_no_iw =
+            (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+
+        # Does the page have interwiki links?
+        # BIG XXX Actually checks only for English
+        my $has_iw = has_interwiki($page_ref);
+
+        if ($has_iw) {
+            INFO("has link to $has_iw");
+            if ($has_template_no_iw) {
+                INFO('has template no_iw. trying to remove ...');
+                remove_template_no_iw($page_text_ref);
+                $statistics{'has both valid interwiki and template'}++;
+                special_cases_file('outdated_template', $page_ref);
+            }
+        }
+        else { # does not have iw
+            process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
+        }
+    }
+
+    return;
+}
+
+# Handle a page with no (English) interwiki link: look for no_iw
+# templates in its text, validate and possibly update the template's
+# cooling date, and add the page to the per-type no-interwiki lists.
+# NOTE(review): when no template was found, $template stays undef and
+# the $template->{params}->{type} deref near the end relies on
+# autovivification returning undef — confirm this is intended.
+sub process_iwless_page {
+    my (
+        $page_ref, # object ref
+        $has_template_no_iw, # scalar bool
+        $has_iw # scalar bool
+    ) = @_;
+
+    INFO(q(does not have iw link.));
+    $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+
+    # Now we need to search for no_iw templates
+    # and parse their parameters - date and type
+
+    my @found_templates = ();
+
+    my $page_text_ref = $page_ref->text();
+
+    # Optimized - does not start searching,
+    # if we already know that it is not there
+    if ($has_template_no_iw) {
+        find_templates($page_text_ref, \@found_templates,
+            [ get_string('no_iw') ]);
+    }
+
+    my $found_templates_count = scalar @found_templates;
+    INFO("found templates: $found_templates_count");
+
+    my $template;
+    if ($found_templates_count) {
+
+        # Assume the first one is the right one
+        $template = $found_templates[0];
+        if ($found_templates_count > 1) {
+            WARN('many templates were found');
+            $statistics{'many templates'}++;
+            special_cases_file('many_templates', $page_ref);
+        }
+        else {
+            INFO('good, found one template');
+            $statistics{'one template'}++;
+        }
+    }
+    else {
+        INFO('no templates found');
+        $statistics{'no templates found'}++;
+    }
+
+    if (defined $template) {
+        INFO('has template no_iw');
+        my $date_str = $template->{params}->{date};
+        if (defined $date_str) {
+            INFO('checking cooling date');
+            my $date_ref = parse_date($date_str);
+            if (not defined $date_ref) {
+                INFO("invalid date: '$date_str'");
+                $statistics{'invalid date'}++;
+                special_cases_file('invalid_date', $page_ref);
+            }
+            elsif (cooling_date_passed($date_ref)) {
+                INFO('cooling date passed, updating to today ...');
+                update_cooling_date($page_text_ref);
+                $statistics{'cooling date passed'}++;
+            }
+            else {
+                INFO(q(cooling date did not pass.));
+                $statistics{q(cooling date did not pass)}++;
+            }
+        }
+        else {
+            INFO('date not defined');
+        }
+
+    }
+
+    my @all_types = get_all_types($template->{params}->{type}, $page_ref);
+
+    foreach my $type (@all_types) {
+        INFO('adding ' . $page_ref->title() . " to the list as type $type");
+        add_to_no_iw_list($page_ref, $type);
+        $type_count{$type}++;
+    }
+
+    return 1;
+}
+
+# Collect all list types for a page: the ones given in the template's
+# "type" parameter (split on ';'), plus automatic types for categories,
+# portals and disambiguation pages.  Falls back to the localized
+# "other" type when nothing else matched.
+sub get_all_types {
+    my ($type_param, $page) = @_;
+
+    $type_param //= q{}; # /
+    strip_whitespace($type_param);
+
+    my @all_types = split $TYPE_SEP, $type_param;
+
+    my $page_title = $page->title();
+    if (is_category($page)) {
+        INFO("$page_title is a category");
+        push @all_types, get_string('Category');
+        $statistics{'categories'}++;
+    }
+
+    if (is_in_namespace($page, 'Portal')) {
+        INFO("$page_title is a portal");
+        push @all_types, get_string('Portal');
+        $statistics{'portal'}++;
+    }
+
+    if (is_disambig($page)) {
+        INFO("$page_title is a disambiguation");
+        push @all_types, get_string('disambig');
+        $statistics{'disambig'}++;
+    }
+
+    # Still nothing?
+    if (not scalar @all_types) {
+        my $other_type = get_string('other');
+        INFO("$page_title does not have any type, adding to $other_type");
+        @all_types = ($other_type);
+        $statistics{'automatically added to type other'}++;
+    }
+
+    return @all_types;
+}
+
+# Scan wikitext for top-level {{...}} constructs, discard wikitables
+# and other false positives, and parse every template whose name starts
+# with one of the filter strings (case-insensitively, via parse_template,
+# which recurses back here for subtemplates) into $found_templates_ref.
+# Returns $found_templates_ref.
+sub find_templates {
+    my (
+        $text_ref, # string ref
+        $found_templates_ref, # array ref
+        $filter # string array ref
+    ) = @_;
+
+    # A reference to an array with one empty string.
+    # Matching against an empty string will always succeed.
+    $filter //= [q{}]; # /
+
+    # Get all highest-level matches
+    my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
+
+    MATCH:
+    foreach my $next_match (@matches) {
+        if ($next_match !~ $TEMPLATE_RE) {
+            INFO(q(i thought that it is a template, but it was:));
+            if ($next_match =~ $WIKITABLE_RE) {
+                INFO('a wikitable');
+            }
+            else {
+                INFO("something else: $next_match");
+            }
+            INFO(q{});
+            next MATCH;
+        }
+
+        foreach my $next_filter (@{$filter}) {
+
+            # N.B. - case-insensitive. Wrong, but kinda useful.
+            if ($next_match =~
+                /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
+            {
+
+                # N.B.: parse_template calls find_templates() recursively
+                my $parsed_template =
+                    parse_template(\$next_match, [qw(date type)], $filter);
+                push @{$found_templates_ref}, $parsed_template;
+            }
+        }
+    }
+
+    return $found_templates_ref;
+}
+
+# Parse one template's text into { text, name, params, subtemplates }.
+# Unnamed parameters get names from $default_param_names, or their
+# ordinal number when that list runs out.  Recurses into subtemplates
+# via find_templates().
+# NOTE(review): %param_name (used to canonicalize parameter names) is
+# not declared anywhere in this chunk, and the recursive call passes
+# $parsed_template{subtemplates} while it is still undef, so the found
+# subtemplates are not stored back — confirm both.
+sub parse_template {
+    my (
+        $template, # string ref
+        $default_param_names, # string array ref
+        $subtemplate_filter, # string array ref
+    ) = @_;
+
+    # %parsed_template:
+    # {text} - string ref; might be big
+    # {name} - string
+    # {params} - hash ref
+    # {subtemplates} - array ref
+    my (%parsed_template, %parsed_params, @clauses);
+
+    $parsed_template{text} = strip_template_curlies(${$template});
+
+    # First string of the split is the template name,
+    # the rest is the params
+    ($parsed_template{name}, @clauses) =
+        (split $PARAM_SEP_RE, ${ $parsed_template{text} });
+
+    my $param_counter = 0;
+    foreach my $clause (@clauses) {
+        if ($clause =~ $PARAM_RE) {
+            #<<< no perltidy
+            my ($name, $value) = @+{ qw(param_name value) };
+            #>>>
+            if (defined $name) {
+                if (defined $param_name{$name}) {
+                    $name = $param_name{$name};
+                }
+            }
+            else {
+
+                # Get next default param name.
+                # If a default name is not defined, just use the number
+                $name = $default_param_names->[$param_counter]
+                    || $param_counter;
+            }
+            $parsed_params{$name} = $value;
+        }
+        else {
+            my $error_msg = "Weird - $clause does not look like a param";
+            INFO($error_msg);
+            cluck($error_msg);
+            $statistics{'weird param'}++;
+        }
+        $param_counter++;
+    }
+    $parsed_template{params} = \%parsed_params;
+
+    # Possible recursion
+    find_templates($parsed_template{text}, $parsed_template{subtemplates},
+        $subtemplate_filter);
+
+    return \%parsed_template;
+}
+
+# Parse a localized timestamp ("HH:MM, D Month YYYY (TZ)", per
+# $HEB_DATE_RE) into a hash ref with sec/min/hour/mday/mon/year keys;
+# mon is converted to 0-based via %REV_MONTH and leading zeros are
+# stripped from every field.  Returns undef for non-matching input.
+sub parse_date {
+    my ($date_str) = @_;
+
+    return if (not defined $date_str);
+
+    if ($date_str =~ $HEB_DATE_RE) {
+        INFO("found a valid date: $date_str");
+        my %parsed_date = (
+            'sec' => 0, # useful for timelocal
+        );
+        foreach my $date_part (qw(hour min mday mon year)) {
+            $parsed_date{$date_part} = $+{$date_part};
+        }
+        $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} };
+
+        # strip leading zeros
+        foreach (values %parsed_date) {
+            s{
+                \A
+                0+
+                (?<number>\d+)
+            }
+            {$+{number}}xms;
+        }
+        return \%parsed_date;
+    }
+
+    # Returns undef for an invalid date
+    return;
+}
+
+# Strip the leading '{{' and trailing '}}' from a template's wikitext.
+# Works on a copy of the argument and returns a reference to it.
+sub strip_template_curlies {
+    my ($template) = @_;
+
+    $template =~ s{\A \Q$MW_SYNTAX{start_template}\E}{}xms;
+    $template =~ s{\Q$MW_SYNTAX{end_template}\E \z}{}xms;
+
+    return \$template;
+}
+
+# Trim leading and trailing whitespace from every argument, in place.
+# @_ is deliberately not unpacked: its elements alias the caller's
+# variables, which is what makes in-place editing work.
+sub strip_whitespace { ## no critic (RequireArgUnpacking)
+    foreach my $str (@_) {
+        $str =~ s{\A \s+}{}xms;
+        $str =~ s{\s+ \z}{}xms;
+    }
+    return;
+}
+
+# Scan a page's text for interwiki links, record special cases (double
+# links, section links, char-code links, lowercase links), accumulate
+# per-language statistics, and return 'en' if an English link exists,
+# or the empty string otherwise.
+# NOTE(review): %found_links is not declared anywhere in this chunk —
+# confirm it exists elsewhere.
+sub has_interwiki {
+    my ($page) = @_;
+
+    my $page_title = $page->title();
+    my $page_text = ${ $page->text() }; # XXX
+
+    study $page_text; # XXX
+
+    my %iw_links;
+    my %special_cases;
+
+    while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX
+        my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)};
+        if (defined $iw_links{$lang_code}) {
+            $special_cases{double_links}->{$lang_code} = q{};
+        }
+        else {
+            $iw_links{$lang_code} = $foreign_article;
+        }
+
+        # A # sign not after an &.
+        # After an & it is probably a character number.
+        if ($foreign_article =~ $SECTION_LINK_RE) {
+            $special_cases{section_links}->{$lang_code} = q{};
+        }
+
+        # Char codes are common in section links, so there is no
+        # need to show them again
+        elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+            $special_cases{charnumber_links}->{$lang_code} = q{};
+        }
+
+        # Lowercase links
+        # BUG FIX: replaced the deprecated smartmatch (~~, removed in
+        # Perl 5.38) with an explicit membership test.
+        if ( (not grep { $_ eq $lang_code } @LOWERCASE_LANGS)
+            and ($foreign_article =~ $LOWERCASE_LINK_RE))
+        {
+            my $include_lowercase_link = 1;
+
+            if (defined get_string('exclude_lowercase', 'if defined')
+                and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
+            {
+                $include_lowercase_link = 0;
+            }
+
+            if ($include_lowercase_link) {
+                $special_cases{lowercase_links}->{$lang_code} = q{};
+            }
+        }
+
+        $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{};
+    }
+
+    my @all_langs = keys %iw_links;
+    my $count_iw = scalar @all_langs;
+    if ($count_iw) {
+        if (not defined $statistics{count_iw}->[$count_iw]) {
+            $statistics{count_iw}->[$count_iw] = [];
+        }
+        push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
+    }
+    INFO("iw link count for $page_title is: $count_iw");
+
+    for my $special_case_name (keys %special_cases) {
+        if (scalar %{ $special_cases{$special_case_name} }) {
+            special_cases_file($special_case_name, $page,
+                $special_cases{$special_case_name});
+        }
+    }
+
+    # BIG XXX Still very stupid, but getting better
+    if (defined $iw_links{en}) {
+        return 'en';
+    }
+
+    return q{};
+}
+
+# Append a page (its link, the affected language codes and its sort
+# title) to the special-cases report file named after
+# $special_case_name; on first use the case name is written as the
+# file's first line.
+sub special_cases_file {
+    my ($special_case_name, $page, $special_cases_ref) = @_;
+    $special_cases_ref //= {}; # /
+
+    my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
+    if ($special_case_langs) {
+        $special_case_langs = " ($special_case_langs)";
+    }
+
+    my $special_case_fn = make_type_fn($special_case_name, 1);
+
+    if (not -e $special_case_fn) {
+        append_to_file($special_case_fn, $special_case_name);
+    }
+
+    my $page_title = $page->title();
+    my $link = make_link($page_title);
+    my $line =
+        $link
+        . $special_case_langs
+        . $FIELD_SEP
+        . get_sort_title($page_title);
+
+    append_to_file($special_case_fn, $line);
+
+    return;
+}
+
+# Stub: removal of the no_iw template is not implemented yet.
+# Always returns 0 (nothing was removed).
+sub remove_template_no_iw {
+    my ($params) = @_;
+    my $stub_msg = "Supposed to remove the no_iw template now, but ...\n"
+        . 'This sub is a stub. You can help Wikipedia by expanding it!';
+    INFO($stub_msg);
+
+    return 0;
+}
+
+# True if the page's parsed no_iw timestamp is older than the cooling
+# cutoff.
+# NOTE(review): timelocal needs Time::Local and $LATEST_COOLING is not
+# declared anywhere in this chunk — confirm both exist.
+sub cooling_date_passed {
+    my ($date_ref) = @_;
+
+    my @page_times = @{$date_ref}{qw(sec min hour mday mon year)};
+    INFO("page times: @page_times");
+
+    my $pageseconds = timelocal(@page_times);
+    return $pageseconds < $LATEST_COOLING;
+}
+
+# Stub: updating the cooling date is not implemented yet.
+# Always returns 0 (nothing was updated).
+sub update_cooling_date {
+    my ($params) = @_;
+
+    my $stub_msg = "Supposed to update cooling date now, but ...\n"
+        . "This sub is a stub. You can help Wikipedia by expanding it!\n";
+    INFO($stub_msg);
+
+    return 0;
+}
+
+# Just a debugging thingie
+# Log a parsed template's text, name, parameters and, if present, its
+# subtemplates (as produced by parse_template()).
+sub print_parsed_template {
+    my ($template_ref) = @_; # hash ref
+
+    INFO('text:');
+    INFO(${ $template_ref->{text} });
+    INFO('name:');
+    INFO($template_ref->{name});
+    INFO('params:');
+    foreach my $next_param (sort keys %{ $template_ref->{params} }) {
+        INFO("$next_param: $template_ref->{params}->{$next_param}");
+    }
+
+    if (defined $template_ref->{subtemplates}) {
+        INFO("subtemplates: $template_ref->{subtemplates}");
+    }
+
+    return;
+}
+
+# Strip an optional localized namespace prefix from a page title and
+# return the bare title used for sorting.  Croaks if $PURE_TITLE_RE
+# unexpectedly fails to match.
+sub get_sort_title {
+    my ($page_title) = @_;
+
+    my $sort_title;
+    if ($page_title =~ $PURE_TITLE_RE) {
+        $sort_title = $+{pure_title};
+    }
+    else {
+        INFO( 'Something badly weird happened - '
+                . "$page_title has no pure title");
+        croak('A page has no pure title');
+    }
+
+    return $sort_title;
+}
+
+# Append a page's wikilink and sort title (tab-separated) to the
+# unsorted list file for the given type; on first use the type name is
+# written as the file's first line.
+sub add_to_no_iw_list {
+    my (
+        $page, # ref
+        $page_type, # string
+    ) = @_;
+
+    my $page_title = $page->title();
+    INFO("Adding $page_title to no_iw_list");
+
+    my $sort_title = get_sort_title($page_title);
+    INFO("full title: $page_title");
+    INFO("sort title: $sort_title");
+
+    my $unsorted_type_fn = make_type_fn($page_type, 1);
+    if (not -e $unsorted_type_fn) {
+        append_to_file($unsorted_type_fn, $page_type);
+    }
+    my $link = make_link($page_title);
+    INFO("link to page: $link");
+    my $line = $link . $FIELD_SEP . $sort_title;
+
+    append_to_file($unsorted_type_fn, $line);
+
+    return;
+}
+
+# Build a wikilink to a page ("[[:<title>]]" — the leading ':' keeps
+# category/image links rendering as plain links).  In RTL mode, titles
+# containing LTR characters are wrapped in RLM entities so they display
+# correctly.
+# NOTE(review): %option is not declared anywhere in this chunk —
+# confirm it exists elsewhere.
+sub make_link {
+    my ($page_title) = @_;
+
+    my $link_to_page =
+        $MW_SYNTAX{start_link}
+        . $MW_SYNTAX{namespace_sep}
+        . $page_title
+        . $MW_SYNTAX{end_link};
+
+    if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) {
+        $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM;
+    }
+
+    return $link_to_page;
+}
+
+# Read every unsorted type file (first line: type name; then
+# "link<TAB>sort-title" lines), bucket the links by their sort letter,
+# and hand each type's tree to write_sorted_pages().
+sub create_no_iw_pages {
+    my ($params) = @_;
+
+    INFO("\ncreating no_iw pages");
+
+    # Run over page types
+    UNSORTED_TYPE_FN:
+    foreach my $unsorted_type_fn (glob "$UNSORTED_DIR/*") {
+        my %all_pages_in_type = ();
+        open my $unsorted_type_file, '<', $unsorted_type_fn
+            or croak(file_error('opening', $unsorted_type_fn, 'reading'));
+        my @lines = <$unsorted_type_file>;
+        close $unsorted_type_file
+            or croak(file_error('closing', $unsorted_type_fn, 'reading'));
+
+        my $type_name = shift @lines;
+        chomp $type_name;
+        foreach my $line (@lines) {
+            chomp $line;
+            my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line;
+            my $sort_letter = get_sort_letter($sort_title);
+            $all_pages_in_type{$sort_letter} //= []; # /
+            push @{ $all_pages_in_type{$sort_letter} }, $page_title;
+        }
+        write_sorted_pages($type_name, \%all_pages_in_type);
+    }
+    return;
+}
+
+# Write a type's links as wiki pages: one heading per sort letter,
+# split into numbered sections when a letter has more links than
+# max_links_per_section, and starting a new numbered output file after
+# max_sections_per_page sections.  The section arithmetic relies on the
+# file-wide "use integer" (truncating division).
+sub write_sorted_pages {
+    my ($type_name, $type_tree_ref) = @_;
+
+    my $type_fn = make_type_fn($type_name);
+
+    my $section_counter = 0;
+    my $page = q{};
+    my $file_number = 1;
+
+    LETTER:
+    foreach my $next_letter (sort keys %{$type_tree_ref}) {
+        my @all_links_in_letter =
+            sort @{ $type_tree_ref->{$next_letter} };
+
+        my $links_count = scalar @all_links_in_letter;
+        my $sections =
+            ($links_count - 1) / $option{max_links_per_section} + 1;
+        my $links_per_section = $links_count / $sections;
+
+        SECTION:
+        foreach my $section_num (1 .. $sections) {
+            if ($section_counter == $option{max_sections_per_page}) {
+                write_page(\$page, \$type_fn, $file_number++);
+                $section_counter = 0;
+
+                $page = q{};
+            }
+            elsif ($section_counter) {
+                $page .= $MW_SYNTAX{paragraph};
+            }
+            $section_counter++;
+            my $title = $next_letter;
+            if ($sections > 1) {
+                $title .= " $section_num";
+            }
+            $page .= mw_heading($title);
+
+            my $first_link = ($section_num - 1) * $links_per_section;
+            my $last_link =
+                ($section_num == $sections)
+                ? $links_count - 1
+                : $first_link + $links_per_section - 1;
+            my $links = join_links(
+                [ @all_links_in_letter[ $first_link .. $last_link ] ]);
+            $page .= $links;
+        }
+    }
+
+    # The page may be empty at this point
+    if ($page) {
+        write_page(\$page, \$type_fn, $file_number++);
+    }
+
+    return;
+}
+
+# Write one accumulated page: replace the numeric suffix in the type
+# file name (in place, via the scalar ref) with the zero-padded
+# $file_number and append the page text to that file.
+sub write_page {
+    my ($page_ref, $type_fn_ref, $file_number) = @_;
+
+    my $pretty_file_number = sprintf '%03d', $file_number;
+    ${$type_fn_ref} =~ s{
+        $NUMBERED_FILE_RE
+    }
+    {_$pretty_file_number.$WIKITEXT_EXT}xmso;
+    INFO("creating file ${$type_fn_ref}");
+    append_to_file(${$type_fn_ref}, ${$page_ref});
+
+    return;
+}
+
+# Format a MediaWiki section heading, e.g. "== Title ==\n".
+# $level defaults to 2; the trailing newline is mandatory in wikitext.
+sub mw_heading {
+    my ($text, $level) = @_;
+
+    my $marker = q{=} x ($level // 2); # /
+    return sprintf "%s %s %s\n", $marker, $text, $marker;
+}
+
+# Wrap text in MediaWiki bold markup (triple apostrophes).
+sub mw_bold {
+    my ($text) = @_;
+    my $bold_marker = q{'} x 3;
+    return $bold_marker . $text . $bold_marker;
+}
+
+# Custom Unicode character property, which is like \w, but for Hebrew.
+# The custom is to give custom Unicode character classes CamelCase names.
+# P::C policy ProhibitMixedCaseSubs is deprecated.
+# Matches Hebrew letters plus whitespace and punctuation; used through
+# the negated form \P{IsRightToLeft} to spot LTR characters in titles.
+sub IsRightToLeft { ## no critic (Capitalization)
+    return <<'END';
++utf8::InHebrew
++utf8::IsSpace
++utf8::IsPunct
+END
+}
+
+# Classify a page as a redirect: 'English' when Parse::MediaWikiDump
+# flags it, 'local' when the localized #REDIRECT keyword matches the
+# text, or the empty string for a normal page.
+sub is_redirect {
+    my ($page) = @_;
+
+    if ($page->redirect()) {
+        return 'English';
+    }
+
+    my $page_text_ref = $page->text();
+
+    if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+        return 'local';
+    }
+
+    return q();
+}
+
+# True if the page's namespace matches any of the given (English) names
+# after localization via get_string().
+# BUG FIX: replaced the deprecated smartmatch (~~, removed in Perl
+# 5.38) with an explicit membership test.
+sub is_in_namespace {
+    my ($page, @namespaces) = @_;
+
+    my $page_namespace = namespace($page);
+    return scalar grep { $page_namespace eq get_string($_) } @namespaces;
+}
+
+# True if the page lives in the (localized) Category namespace.
+sub is_category {
+    my ($page) = @_;
+    my $in_category_ns = is_in_namespace($page, 'Category');
+    return $in_category_ns;
+}
+
+# True if the page's text contains the localized disambiguation
+# template.
+sub is_disambig {
+    my ($page) = @_;
+    my $found_templates =
+        find_templates($page->text(), [], [ get_string('disambig') ]);
+    return scalar @{$found_templates};
+}
+
+#
+# This is poor man's gettext.
+# TODO: Replace it with Locale::Maketext
+#
+# Poor man's gettext loader: read "<lang>.strings.txt"
+# (English<TAB>translation per line; blanks and '#' comments skipped)
+# into the file-level %STRING hash, falling back to the English key
+# when no translation is supplied.
+sub get_strings {
+    my ($lang) = @_;
+
+    my $STRINGS_FN = "$lang.strings.txt";
+
+    open my $STRINGS_FILE, '<:utf8', $STRINGS_FN
+        or croak(file_error('opening', $STRINGS_FN, 'reading'));
+    my @strings_file_lines = <$STRINGS_FILE>;
+    close $STRINGS_FILE
+        or croak(file_error('closing', $STRINGS_FN, 'reading'));
+
+    STRING_LINE:
+    foreach my $next_string_line (@strings_file_lines) {
+
+        # Skip blanks and comments
+        next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
+
+        chomp $next_string_line;
+        my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
+
+        # Fallback to English if no target language string was supplied
+        $STRING{$english} = $target // $english; # /
+    }
+
+    return;
+}
+
+# Poor man's gettext lookup: return the localized string for an English
+# key.  With $if_defined true, return undef when no translation exists;
+# otherwise cache and return the English key itself as the fallback.
+sub get_string {
+    my ($english, $if_defined) = @_;
+    if ($if_defined) {
+        return $STRING{$english};
+    }
+    return ($STRING{$english} //= $english); # /
+}
+
+# Build the output file name for a type, replacing characters that are
+# invalid in file names with '-'.  The file goes to the unsorted
+# directory when $unsorted is true, otherwise to the main output dir.
+sub make_type_fn {
+    my ($type, $unsorted) = @_;
+    $unsorted //= 0; # /
+
+    #my $transliterated_type = $TRANSLITERATOR->translit($type);
+    my $transliterated_type = $type;
+
+    my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
+
+    $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
+    my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
+    $type_fn = "$dir/$type_fn";
+
+    return $type_fn;
+}
+
+# Append a line (default: the empty string) to a file, UTF-8 encoded,
+# with a trailing newline.  Croaks on any I/O failure.
+sub append_to_file {
+    my ($fn, $line) = @_;
+
+    open my $file, '>>:utf8', $fn
+        or croak(file_error('opening', $fn, 'appending'));
+
+    say {$file} ($line // q{}); # /
+
+    # BUG FIX: the close error message said 'appeding'.
+    close $file
+        or croak(file_error('closing', $fn, 'appending'));
+
+    return;
+}
+
+# It appears simple, but in non-alphabetic languages such as Chinese
+# it may be different, so it will sit here ready for better i18n.
+# Return the character under which a title is sorted — its first one.
+# Kept as a sub because non-alphabetic scripts (e.g. Chinese) may need
+# different rules later.
+sub get_sort_letter {
+    my ($string) = @_;
+    my $sort_letter = substr $string, 0, 1;
+    return $sort_letter;
+}
+
+# Build a uniform error message for a failed file operation, including
+# the OS error text ($OS_ERROR, a.k.a. $!).
+sub file_error {
+    my ($operation, $fn, $access_type) = @_;
+    return "error $operation $fn for $access_type: $OS_ERROR";
+}
+
+# Format one "language-code  count" row for the console link table.
+# Args are deliberately not unpacked: they go straight to sprintf.
+sub format_link_table { ## no critic (RequireArgUnpacking)
+    my $row_format = '%-15s %8d';
+    return sprintf $row_format, @_;
+}
+
+# For every language, write the foreign articles that are linked from
+# more than one local article — together with those local links — into
+# a per-language file under $MULTI_DIR.
+# NOTE(review): %found_links and $MULTI_DIR are not declared anywhere
+# in this chunk — confirm they exist elsewhere.
+sub print_multi_links_by_foreign {
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+            if (scalar @local_articles > 1) {
+                my $links = join q{ | }, sort map { make_link($_) }
+                    keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+                my $foreign_title =
+                    make_link($lang_code
+                        . $MW_SYNTAX{namespace_sep}
+                        . $foreign_article);
+                append_to_file($filename, "* '''$foreign_title''' - $links");
+            }
+        }
+    }
+
+    return;
+}
+
+# Group multi-linked foreign articles by their first local article and
+# write a nested list (local article / each other local article / the
+# foreign links shared between them) to LOCAL.<ext> under $MULTI_DIR.
+sub print_multi_links_by_local {
+    my %local_multi_links;
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+            if (scalar @local_articles > 1) {
+                add_local_multi(
+                    \%local_multi_links,
+                    make_link(
+                        $lang_code
+                            . $MW_SYNTAX{namespace_sep}
+                            . $foreign_article
+                    ),
+                    @local_articles
+                );
+            }
+        }
+    }
+
+    say 'writing local multilinks file';
+    my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
+    foreach my $local_multi_article (sort keys %local_multi_links) {
+        append_to_file($filename,
+            '* ' . mw_bold(make_link($local_multi_article)));
+
+        foreach my $other_local_article (
+            sort keys %{ $local_multi_links{$local_multi_article} })
+        {
+            append_to_file($filename,
+                '** ' . make_link($other_local_article));
+            my $foreign_articles = join_links(
+                $local_multi_links{$local_multi_article}
+                    ->{$other_local_article},
+                0
+            );
+            append_to_file($filename, "*** $foreign_articles");
+        }
+        append_to_file($filename);
+    }
+
+    return;
+}
+
+# Record that $foreign_link is shared by several local articles: under
+# the first local article, remember every other local article together
+# with all the foreign links they have in common.
+sub add_local_multi {
+    my (
+        $local_multi_links_ref, $foreign_link,
+        $first_local_article, @other_local_articles
+    ) = @_;
+
+    my $first_entry =
+        ($local_multi_links_ref->{$first_local_article} //= {}); # /
+
+    foreach my $other_article (@other_local_articles) {
+        my $shared_links = ($first_entry->{$other_article} //= []); # /
+        push @{$shared_links}, $foreign_link;
+    }
+
+    return;
+}
+
+# Join a list of links with the link separator.  By default each
+# separator ends its line (" |\n"); with $line_end false it is padded
+# with spaces instead (" | ").
+sub join_links {
+    my ($links_ref, $line_end) = @_;
+
+    my $sep_tail = ($line_end // 1) ? "\n" : q{ }; # /
+    return join q{ } . $LINK_SEP . $sep_tail, @{$links_ref};
+}
+
+# Log a summary of the run: scalar counters, the pages with the most
+# interwiki links, per-namespace percentages of interwiki-less pages,
+# namespace and type totals, and the dump/log size ratio.
+# NOTE(review): %option, %namespace_count, %type_count, $dump_fn and
+# $LOG_FN are not declared anywhere in this chunk — confirm they exist.
+sub print_stats {
+    INFO("\nSUMMARY");
+    INFO("total time: $total_time");
+    foreach my $stat_type (sort keys %statistics) {
+        if (not ref $statistics{$stat_type}) {
+            INFO("$stat_type: $statistics{$stat_type}");
+        }
+    }
+
+    my $max_iw_index = $#{ $statistics{count_iw} };
+    INFO("max_iw_index: $max_iw_index");
+    MAX_IW:
+    for my $max_iw_place (0 .. $option{max_iw_places}) {
+        my @links =
+            map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+            or last MAX_IW;
+        INFO("# $max_iw_index: " . join_links(\@links, 0));
+
+        # Do nothing, just count down to the next index with a defined list.
+        # $max_iw_index needs to be checked for nonzero-ness for the rare case
+        # of a very low page count.
+        while ($max_iw_index
+            and not defined $statistics{count_iw}->[ --$max_iw_index ])
+        {
+        }
+    }
+
+    INFO('pages without interwiki links per namespace');
+    foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
+        my $iwless_in_namespace =
+            $statistics{'has no interwiki link'}->{$namespace};
+
+        no integer; # the percentage needs floating-point division
+        ## no critic (ProhibitMagicNumbers)
+        my $percentage = sprintf '%.2f',
+            100 * $iwless_in_namespace / $namespace_count{$namespace};
+        ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
+        use integer;
+
+        INFO("$namespace: $iwless_in_namespace, $percentage%");
+    }
+
+    INFO("\nNAMESPACES");
+    foreach my $namespace (sort keys %namespace_count) {
+        INFO("$namespace: $namespace_count{$namespace}");
+    }
+
+    INFO("\nTYPES");
+    foreach my $type (sort keys %type_count) {
+        INFO("$type: $type_count{$type}");
+    }
+
+    # BUG FIX: these two INFO lines interpolated the file *names*
+    # ($dump_fn, $LOG_FN) instead of the sizes just computed.
+    my $dump_size = -s $dump_fn;
+    my $log_size = -s $LOG_FN;
+    INFO("dump size: $dump_size");
+    INFO("log size: $log_size");
+    if ($log_size > 0) {
+        my $dump_log_ratio = $dump_size / $log_size;
+        INFO("dump/log ratio: $dump_log_ratio");
+    }
+    else {
+        WARN("weird: log file size $LOG_FN is 0");
+    }
+
+    return;
+}
+
+__END__
+
+=head1 NAME
+
+C<prepare_noiw_list.pl> - Searches a MediaWiki dump for pages without
+interlanguage links and prepares categorized lists of those pages.
+
+=head1 VERSION
+
+This documentation refers to
+
+prepare_noiw_list.pl
+
+version 0.2.1 - Noa.
+
+=head1 USAGE
+
+=over
+
+=item * C<prepare_noiw_list.pl ./big-files/ruwiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --rtl ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --stop_after=20000 ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=back
+
+=head1 REQUIRED ARGUMENTS
+
+=over
+
+=item * MediaWiki dump file name is required
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item * --rtl makes special fixes for right-to-left languages.
+
+=item * --stop_after=NUMBER Stops processing after page with the given
+NUMBER.
+
+=item * --start_from=NUMBER Begins processing after page with the given
+NUMBER.
+
+=item * --page_freq=NUMBER Print the page counter every NUMBER of
+pages. 1000 by default.
+
+=item * --max_links_per_section Maximum number of links per section in
+the output page. Default is 100.
+
+=item * --max_sections_per_page Maximum number of sections per output
+page. Default is 20.
+
+=item * --max_iw_places Number of places to print in the statistics of
+pages with the most interlanguage links.
+
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
+=back
+
+=head1 DESCRIPTION
+
+The main goal of this program is to find pages which do not have
+interwiki (interlanguage) links to certain languages.
+
+This program scans a MediaWiki XML dump file. It searches every page for
+interwiki links and for a special hidden template, which defines to which
+types of interwiki-less pages it belongs.
+
+=over
+
+=item * If the page contains links to the defined languages and does not
+contain the "no interwiki" template, its processing stops.
+
+=item * If the page contains links to the defined languages and contains this
+template, it is logged, so the template can be removed. (It is planned that
+it will be removed automatically in the future.)
+
+=item * If the page contains no links to the defined languages and does not
+contain the template, it is automatically added to type "other".
+
+=item * If the page contains no links to the defined languages and
+a template with types, it is added to the defined types.
+
+=back
+
+Pages without links are added to nicely formatted lists according
+to their type.
+
+This program also collects some information on the way about problematic
+cases - more than one "no interwiki" template, invalid templates, invalid
+or redundant interwiki links etc.
+
+You can get a dump for using this program at L<http://download.wikimedia.org/>.
+
+=head1 DIAGNOSTICS
+
+=head2 Invalid command line options.
+
+See L</"REQUIRED ARGUMENTS"> and L</"OPTIONS">.
+
+=head2 FILENAME is a weird dump file name.
+
+The dump file does not appear to have a standard name that appears
+at L<http://download.wikimedia.org/>.
+
+=head2 error opening FILENAME ...
+
+=head2 error closing FILENAME ...
+
+Check the corresponding error.
+
+=head2 unable to handle any case setting besides 'first-letter'
+
+Something is weird with the dump. See the documentation of
+L<Parse::MediaWikiDump> and MediaWiki.
+
+=head2 A page has no pure title
+
+Something is particularly weird with the name of a page. The program cannot
+separate its name from its namespace. It can also be a bug in this program.
+
+=head2 Some weirdness happened - STRING does not look a param
+
+STRING is supposed to be a parameter in a template, but it does not look like
+one. It could be an error in the template, and also a bug in this program
+(the parser that this program employs is rather stupid).
+
+=head2 Unicode character 0xNUMBER is illegal
+
+This is a standard Perl warning. It may appear if a page or its title have
+funky Unicode characters which should not be there according to the Unicode
+standard (to be more precise, according to the implementation of this
+standard in your version of perl). Most probably these characters are not
+supposed to be in the page and should be fixed, but otherwise this issue
+is not supposed to affect the functionality of this program significantly.
+
+This was reported as a MediaWiki bug:
+L<https://bugzilla.wikimedia.org/show_bug.cgi?id=14600>
+
+=head1 EXIT STATUS
+
+Nothing interesting, read the log :)
+
+=head1 CONFIGURATION
+
+=head2 Language codes file
+
+The file name has the form C<LANGUAGE_CODE.language_codes.txt>, where
+LANGUAGE_CODE is the code of the wiki which is being processed. It includes
+a list of language codes which are searched, the full name of the language
+and the group to which this language belongs.
+
+=head2 Localized strings file
+
+The file name has the form C<LANGUAGE_CODE.strings.txt>, where LANGUAGE_CODE
+is the code of the wiki which is being processed. This file includes pairs
+of strings - the English name of a feature and its localized counterpart.
+
+=head2 Log
+
+The log is called outtest.log.
+
+=head1 DEPENDENCIES
+
+This module requires these CPAN modules:
+
+=over
+
+=item * C<Parse::MediaWikiDump>
+
+This module is used for reading pages from the XML dump.
+
+=item * C<Regexp::Common>
+
+This module is used for searching templates which use balanced parentheses,
+but pretty badly. :)
+
+=item * C<Log::Log4perl>
+
+This module provides easy global logging.
+
+=item * C<Lingua::Translit>
+
+This module is used for transliterating filenames to ASCII.
+
+=item * C<Readonly>
+
+To make Perl::Critic happy :)
+
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but I love style guides.
+
+=back
+
+=head1 HACKING
+
+=head2 Perl 5.10
+
+This program requires Perl 5.10. It has new clean and useful syntax, which
+makes the programs easier to hack, maintain and debug. It is useless to try
+and run it on an older version, unless you want to waste your time
+backporting. Please upgrade your Perl installation if you still have 5.8 or
+(horrors!) something older.
+
+=head2 Perl Best Practices, Perl::Critic and perltidy
+
+Great effort has been put into making this source code pass as cleanly as
+possible the Perl::Critic tests in the 'brutal' mode. It also uses perltidy
+for automatic code formatting. If you modify it, do yourself a favor, install
+Perl::Critic and regularly test it using this command:
+
+./tidy.sh prepare_noiw_list.pl
+
+It checks the syntax, runs perltidy on the code and runs Perl::Critic.
+
+All the places where P::C has been disabled using "no critic" are explained.
+
+The time invested in making the code P::C-friendly will be returned as time
+saved on debugging. Also consider reading the book "Perl Best Practices" by
+Damian Conway if you have not already.
+
+=head1 INCOMPATIBILITIES
+
+=head2 Unicode issues
+
+This program works best on GNU/Linux, where Perl and the filesystem are
+Unicode-friendly.
+
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
+filenames and clipboard text to become jumbled. You have been warned.
+
+=head1 BUGS AND LIMITATIONS
+
+Please report all bugs, features requests and other comments to
+Amir E. Aharoni (ami...@ma...).
+
+=head2 There is no equality between languages
+
+Currently this program actually only lists pages which do not have
+an interwiki link to the English Wikipedia. This is obviously not useful on
+the English Wikipedia and is conceptually problematic on other Wikipedias,
+too. This is being fixed, but it is not simple to do it Right.
+
+Goal: 0.4 Reut
+
+=head2 Internationalization is far from perfect
+
+Date handling and string localization are very primitive. There are plans
+to upgrade them to smarter modules such as Locale::Maketext.
+
+Goal: 0.6 Itay
+
+=head2 MediaWiki parsing is ad hoc
+
+This program only does very rudimentary and ad hoc MediaWiki syntax parsing.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Templates are removed semi-automatically
+
+Templates on pages which already have needed links are not removed
+automatically. A list of them is created and a bot can run on it and remove
+the outdated templates. This can be done automatically.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Cooling date
+
+The implementation of the cooling date is very rudimentary.
+
+Goal version: v0.4 Reut
+
+=head2 Major refactoring is needed
+
+=over
+
+=item * The main code is on the brink of passing the threshold for complexity that
+P::C accepts.
+
+=item * There is no separation of searching and formatting. There are two main
+functions here: C<process_dump()> and C<create_no_iw_pages()>. They are doing
+separate things and should run from different programs.
+
+=item * Statistics and multi links are just slapped to the log.
+
+=item * At least some of the code can be rewritten as classes that inherit
+from L<Parse::MediaWikiDump>.
+
+=back
+
+Goal: v0.8 Moshe
+
+=head2 There is no test suite
+
+That can be done after proper modularization. Also, a local test MediaWiki
+server would be needed.
+
+Goal: v1.0 Drora
+
+=head1 HISTORY
+
+=over
+
+=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty
+categories sorting. Memory usage optimization - accumulating information in
+files. More generic, but far-from-perfect handling of links to languages
+other than English. Transliteration with Lingua::Translit. Logging with
+Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not finished:
+complete statistics, removal of templates from pages which already have links.
+
+=item * B<0.1 - First and unnamed Amir E. Aharoni's version>: Types
+introduced. Conceptual l10n, but only tested on he.wiki. Still en links
+only. Informative, though lo-tec logging.
+
+=item * B<0.0 - Felagund's version>: no types, he.wiki only, en links only,
+but it uploaded the lists automatically...
+
+=back
+
+=head1 AUTHOR
+
+=over
+
+=item * Creator is Nadav Perez (Felagund)
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Felagund>
+
+=back
+
+=item * It is (roughly) based on another bot by Guy Shaked (Costello).
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Costello>
+
+=back
+
+=item * Then Amir E. Aharoni (Amire80) came and mutilated it beyond
+recognition.
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Amire80>
+
+=item * English: L<http://en.wikipedia.org/wiki/User:Amire80>
+
+=item * Russian: L<http://ru.wikipedia.org/wiki/User:Amire80>
+
+=back
+
+=back
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
+I<Visca la llibertat!>
+
+=cut
+
+1;
+
Property changes on: trunk/no-interwiki/Maintenance.pm
___________________________________________________________________
Added: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|