[PerlWikiBot] SF.net SVN: perlwikibot:[84] trunk/no-interwiki/Maintenance.pm
From: <am...@us...> - 2009-10-01 11:39:18
Revision: 84
          http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=84&view=rev
Author:   amire80
Date:     2009-10-01 11:39:00 +0000 (Thu, 01 Oct 2009)

Log Message:
-----------
Starting the big refactoring.

Added Paths:
-----------
    trunk/no-interwiki/Maintenance.pm

Added: trunk/no-interwiki/Maintenance.pm
===================================================================
--- trunk/no-interwiki/Maintenance.pm                           (rev 0)
+++ trunk/no-interwiki/Maintenance.pm   2009-10-01 11:39:00 UTC (rev 84)
@@ -0,0 +1,1755 @@
+package Maintenance;
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
+# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
+use 5.010;
+
+# pragmata
+use strict;
+use warnings;
+use integer;    # Improves performance
+use open ':utf8';
+use utf8;
+use charnames ':full';
+
+# Standard library
+# These modules should come installed with Perl
+use English qw(-no_match_vars);
+use Carp qw(croak carp cluck confess);    # all four are called below
+use Time::Local;                          # timelocal() - cooling date check
+use File::Basename;                       # basename() - dump file name
+# use Data::Dumper;
+
+# CPAN
+# You must install these modules from CPAN
+
+# I only use Readonly where P::C complains about it. Using it with regular
+# expressions which have non-ASCII chars produces heisenbugs.
+use Readonly;
+# use Pod::Usage;
+use Regexp::Common;             # $RE{balanced} is used to find templates
+# use Lingua::Translit;
+use Log::Log4perl qw(:easy);    # INFO(), WARN() etc. are used throughout
+use Parse::MediaWikiDump 0.91;  # Earlier versions have a different API
+
+our $VERSION = '0.0.1';
+#<<< no perltidy
+my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
+    Revision => '$Revision: 83 $',
+    HeadURL  => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/prepare_noiw_list.pl $',
+    Date     => '$Date: 2009-09-29 15:25:11 +0200 (Tue, 29 Sep 2009) $',
+);
+#>>>
+
+my %LANG_CODE;
+my $WIKI_LANG;
+my $dump_fn;
+
+my $WIKITEXT_EXT = 'wiki.txt';
+my $OUT_DIR      = 'out';
+my $UNSORTED_DIR = "$OUT_DIR/unsorted";
+my $MULTI_DIR    = "$OUT_DIR/multi";  # was used but never defined; name assumed
+my $ALT_SEP      = q{|};
+my $FIELD_SEP    = qq{\t};
+my $LINK_SEP     = q{|};
+my $TYPE_SEP     = qr/\s*;\s*/xms;
+my $HTML_RLM     = '&#8207;';         # HTML entity for RIGHT-TO-LEFT MARK
+
+Readonly our $DEFAULT_PAGE_FREQ             => 1000;
+Readonly our $DEFAULT_MAX_LINKS_PER_SECTION => 100;
+Readonly our $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
+
+# These are referenced below but were not declared anywhere in this
+# revision; minimal declarations so the module compiles under strict.
+# The %option defaults are not yet wired to command-line parsing.
+my $page_counter = 0;
+my (%namespace_count, %type_count, %found_links);
+my %param_name;         # optional aliases for template parameter names
+my $LATEST_COOLING;     # never computed in this revision
+my $LOG_FN;
+my %option = (
+    page_freq             => $DEFAULT_PAGE_FREQ,
+    max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION,
+    max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE,
+    start_from            => 0,
+    stop_after            => 0,
+    max_iw_places         => 5,    # assumed default
+    rtl                   => 0,
+);
+my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal');  # assumed
+
+# XXX Too coupled to Wikipedia, won't work for other projects.
+our $DUMP_FN_RE = qr{
+    \A                   # Begin string
+    (?<wiki_lang>\w+)    # Lang code
+    wiki                 # Is supposed to be after the lang code
+}xms;
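+# For example, a dump named "hewiki-20080420-pages-meta-current.xml"
+# (see USAGE below) matches this regex and captures 'he' as $+{wiki_lang}.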
+
+sub init {
+    my (%args) = @_;
+
+    $dump_fn = $args{dump_fn};
+
+    if ((basename $dump_fn) =~ $DUMP_FN_RE) {
+        $WIKI_LANG = $+{wiki_lang};
+        get_strings($WIKI_LANG);
+    }
+    else {
+        croak("$dump_fn is a weird dump file name.");
+    }
+
+    my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt";
+    open my $lang_code_file, '<', $LANG_CODE_FN
+        or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
+    while (my $line = <$lang_code_file>) {
+        chomp $line;
+        my ($code, $name) = split /\t/xms, $line;
+        if (defined $code) {
+            $LANG_CODE{$code} = $name // $code; # /
+        }
+    }
+    close $lang_code_file
+        or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
+
+    init_logger('outtest.log');
+
+    # TODO: Make smarter, configurable, whatever
+    # $OUT_DIR must be first, because it's the parent
+    foreach my $out_dir ($OUT_DIR, $UNSORTED_DIR, $MULTI_DIR) {
+        if (-d $out_dir) {
+            unlink glob "$out_dir/*$WIKITEXT_EXT";
+        }
+        else {
+            mkdir $out_dir;
+        }
+    }
+
+    return;
+}
+
+sub init_logger {
+    my ($log_fn) = @_;
+    $LOG_FN = $log_fn;    # remembered for print_stats()
+    Log::Log4perl->easy_init(
+        {
+            level  => $INFO,            # print everything
+            file   => ":utf8>$log_fn",  # utf8 is important
+            layout => '%m%n',           # No need to print the date
+        }
+    );
+    return;
+}
+
+our $FIELD_SEP_RE = qr{\t}xms;
+
+our $STRING_SKIP_RE = qr{
+    \A          # Begin string
+    \s*         # Zero or more spaces
+    (?:\#.*)?   # Comment lines
+    \z          # End string
+}xms;
+
+my %STRING;
+
+# XXX - bad i18n
+# ISO 9 is mostly good for Russian and it is still not perfect ASCII
+# my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
+
+our %MW_SYNTAX = (
+    'start_template'  => '{{',
+    'end_template'    => '}}',
+    'start_link'      => '[[',
+    'end_link'        => ']]',
+    'param_sep'       => q{|},
+    'paragraph'       => qq{\n\n},
+    'start_wikitable' => '{|',
+    'namespace_sep'   => q{:},
+);
+
+# This monstrosity basically says: | and optional spaces
+our $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+
+#
+# Constants for date processing
+#
+
+our @MONTHS = @STRING{
+    qw(
+        January February March
+        April May June
+        July August September
+        October November December
+    )
+};
+my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
+my %REV_MONTH;
+Readonly my $LAST_MONTH => 11;
+@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
+
+# XXX Internationalize
+my $string_in = get_string('in');
+our $HEB_DATE_RE = qr{
+    \A                    # begin
+    (?<hour>\d{1,2})      # hour
+    :                     # :
+    (?<min>\d{2}),\s      # minute
+    (?<mday>\d{1,2})\s    # day of month
+    $string_in?           # This preposition appears sometimes
+    (?<mon>$ALT_MONTHS)\s # A name of a month
+    (?<year>\d+?)\s       # Year
+    \([A-Z]{3}\)          # Three letters in parentheses - timezone
+    \z                    # end
+}xms;
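+# Example (illustrative): with the English month names loaded and no 'in'
+# preposition present, a MediaWiki signature timestamp such as
+#     13:07, 1 October 2009 (UTC)
+# is the kind of date this regex is meant to capture, filling the hour,
+# min, mday, mon and year named groups.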
+
+our $PARAM_RE = qr{
+    \A                      # Beginning of a string
+    \s*                     # Zero or more space
+    (?:                     # No capture
+        (?<param_name>\w+)  # Word chars - param name
+        \s*                 # Zero or more space
+        =                   # =
+    )?                      # Param name is optional
+    (?<value>.*)            # value
+}xms;
+
+# A simplistic template regex just for testing.
+# Quite possibly it is not needed anymore.
+# Until I get a better regex for matching balanced {{}} ...
+our $TEMPLATE_RE = qr{
+    \A                              # beginning of string
+    \Q$MW_SYNTAX{start_template}\E  # {{
+    .+                              # some chars
+    \Q$MW_SYNTAX{end_template}\E    # }}
+    \z                              # end of string
+}xms;
+
+our $WIKITABLE_RE = qr{
+    \A
+    \Q$MW_SYNTAX{start_wikitable}\E
+}xms;
+
+# Redirect in local language.
+# Regular expression mostly copied from
+# Parse::MediaWikiDump::page::redirect
+# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
+my $LOCAL_REDIRECT_RE = qr{
+    \A                       # Beginning of string (page)
+    \#                       # a # character
+    $local_redirect          # Redirect keyword in local language
+    \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
+}xmsi;
+
+my $LTR_CHAR_RE       = qr/\P{IsRightToLeft}/xms;    # \P is negation
+my $SECTION_LINK_RE   = qr{(?<!&)\#}xms;
+my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
+
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
+my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
+
+my $string_exclude_lowercase = get_string('exclude_lowercase');
+my $EXCLUDE_LOWERCASE_RE = qr{
+    \A                            # Beginning of foreign article name
+    [$string_exclude_lowercase]   # Character class of possibly lowercase chars
+}xms;
+my $NUMBERED_FILE_RE = qr{
+    (?: _ \d*)?
+    \.$WIKITEXT_EXT
+}xms;
+my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms;    # "
+
+my $TWO_DIGIT_CHARNUMBER_RE = qr{
+    (?:
+        [%.]              # There are both %C4%B0 and .AA.E0
+        [[:xdigit:]]{2}   # 2 hex digits
+    )
+}xms;
+
+my $HTML_CHARNUMBER_RE = qr{
+    (?:
+        &\#\d+;           # stuff like &#352;
+    )
+}xms;
+
+# TODO: Check whether it is Neapolitan with its ''
+my $CHARACTER_CODE_IN_LINK_RE = qr{
+    $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE
+}xms;
+
+sub load_lang_codes {
+    my ($wiki_lang) = @_;
+
+    # Stub: the lang code loading still happens in init() in this revision.
+}
+
+my $string_no_iw = get_string('no_iw');
+my $SIMPLE_NO_IW_CHECK_RE = qr{
+    \Q$string_no_iw\E    # The string may have spaces
+}xmsi;
+
+my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE;
+
+my $INTERWIKI_LINK_RE = qr{
+    \Q$MW_SYNTAX{start_link}\E
+    (?<lang_code>
+        $ALT_LANGS
+    )
+    :
+    (?<foreign_article>
+        .+?
+    )
+    \Q$MW_SYNTAX{end_link}\E
+}xms;
+
+# Lojban allows lowercase articles
+my @LOWERCASE_LANGS = qw(jbo);
+
+our $the_dump = load_dump();
+
+my $namespaces_alt = join $ALT_SEP,
+    grep { length > 0 } @{ $the_dump->namespaces_names() };
+my $PURE_TITLE_RE = qr{
+    \A
+    (?:
+        (?:$namespaces_alt)
+        $MW_SYNTAX{namespace_sep}
+    )?                     # Namespace name is optional
+    (?<pure_title>.+)
+    \z
+}xms;
+
+our %statistics = (count_iw => []);
+
+# File-scope driver code, left over from the script this module is being
+# refactored from; it still runs on load.
+my $begin_time = time;
+process_dump();
+my $total_time = time - $begin_time;
+
+print_stats();
+
+create_no_iw_pages();
+
+INFO(q{});
+
+# my @found_lang_codes = sort keys %found_links;
+# INFO("found lang_codes: @found_lang_codes");
+
+my $begin_multi_links_time = time;
+
+say "\nlisting multi links by language";
+print_multi_links_by_foreign();
+
+say "\nlisting multi links by local articles";
+print_multi_links_by_local();
+
+my $total_multi_links_time = time - $begin_multi_links_time;
+INFO("total multi links time: $total_multi_links_time");
+
+exit;
+
+sub load_dump {
+    Readonly my $WORKING_PMWD_VER => 0.91;
+    Readonly my $FUTURE_PMWD_VER  => 0.94;
+
+    my $dump;
+
+    if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) {
+        $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+    }
+    else {
+        if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) {
+            carp( 'You are running Parse::MediaWikiDump version '
+                . $Parse::MediaWikiDump::VERSION
+                . ".\n Redirect handling may be broken\n");
+        }
+        my $pmwd = Parse::MediaWikiDump->new();
+        $dump = $pmwd->revisions($dump_fn);
+    }
+
+    # This is the only currently known value
+    # but there could be more in the future
+    if ($dump->case() ne 'first-letter') {
+        croak(q{unable to handle any case setting besides 'first-letter'});
+    }
+
+    return $dump;
+}
+
+sub namespace {
+    my ($page) = @_;
+    return $page->namespace() || get_string('article space');
+}
+
+sub next_page {
+    my ($dump) = @_;
+    my $page_ref = eval { $dump->next(); };
+    if ($EVAL_ERROR) {
+        confess("Failed reading a page: $EVAL_ERROR");
+    }
+    return $page_ref;
+}
+
+sub process_dump {
+    PAGE:
+    while (my $page_ref = next_page($the_dump)) {
+        $page_counter++;
+        if ($page_counter % $option{page_freq} == 0) {
+            say $page_counter;
+        }
+
+        if (    $option{stop_after}
+            and $page_counter > $option{stop_after})
+        {
+            last PAGE;
+        }
+
+        my $page_text_ref = $page_ref->text();
+
+        if (
+               $page_counter < $option{start_from}
+            or not defined ${$page_text_ref}   # must be tested before redirect
+            or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)
+            or is_redirect($page_ref)
+        )
+        {
+            next PAGE;
+        }
+
+        my $page_namespace = namespace($page_ref);
+        $namespace_count{$page_namespace}++;
+
+        my $page_title = $page_ref->title();
+
+        INFO("\n* processing $page_counter - ", $page_title);
+
+        # TODO: Be more precise here.
+        # Portal pages which have a '/' in their name are probably
+        # internal and do not need interwiki links.
+        if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) {
+            INFO('internal portal, skipping');
+            $statistics{'internal portal'}++;
+            next PAGE;
+        }
+
+        # A simple sanity check: is the no_iw template anywhere around here?
+        my $has_template_no_iw =
+            (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE);
+
+        # Does the page have interwiki links?
+        # BIG XXX Actually checks only for English
+        my $has_iw = has_interwiki($page_ref);
+
+        if ($has_iw) {
+            INFO("has link to $has_iw");
+            if ($has_template_no_iw) {
+                INFO('has template no_iw. trying to remove ...');
+                remove_template_no_iw($page_text_ref);
+                $statistics{'has both valid interwiki and template'}++;
+                special_cases_file('outdated_template', $page_ref);
+            }
+        }
+        else {    # does not have iw
+            process_iwless_page($page_ref, $has_template_no_iw, $has_iw);
+        }
+    }
+
+    return;
+}
+
+sub process_iwless_page {
+    my (
+        $page_ref,              # object ref
+        $has_template_no_iw,    # scalar bool
+        $has_iw                 # scalar bool
+    ) = @_;
+
+    INFO(q(does not have iw link.));
+    $statistics{'has no interwiki link'}->{ namespace($page_ref) }++;
+
+    # Now we need to search for no_iw templates
+    # and parse their parameters - date and type
+
+    my @found_templates = ();
+
+    my $page_text_ref = $page_ref->text();
+
+    # Optimized - does not start searching,
+    # if we already know that it is not there
+    if ($has_template_no_iw) {
+        find_templates($page_text_ref, \@found_templates,
+            [ get_string('no_iw') ]);
+    }
+
+    my $found_templates_count = scalar @found_templates;
+    INFO("found templates: $found_templates_count");
+
+    my $template;
+    if ($found_templates_count) {
+
+        # Assume the first one is the right one
+        $template = $found_templates[0];
+        if ($found_templates_count > 1) {
+            WARN('many templates were found');
+            $statistics{'many templates'}++;
+            special_cases_file('many_templates', $page_ref);
+        }
+        else {
+            INFO('good, found one template');
+            $statistics{'one template'}++;
+        }
+    }
+    else {
+        INFO('no templates found');
+        $statistics{'no templates found'}++;
+    }
+
+    if (defined $template) {
+        INFO('has template no_iw');
+        my $date_str = $template->{params}->{date};
+        if (defined $date_str) {
+            INFO('checking cooling date');
+            my $date_ref = parse_date($date_str);
+            if (not defined $date_ref) {
+                INFO("invalid date: '$date_str'");
+                $statistics{'invalid date'}++;
+                special_cases_file('invalid_date', $page_ref);
+            }
+            elsif (cooling_date_passed($date_ref)) {
+                INFO('cooling date passed, updating to today ...');
+                update_cooling_date($page_text_ref);
+                $statistics{'cooling date passed'}++;
+            }
+            else {
+                INFO(q(cooling date did not pass.));
+                $statistics{q(cooling date did not pass)}++;
+            }
+        }
+        else {
+            INFO('date not defined');
+        }
+
+    }
+
+    my @all_types = get_all_types($template->{params}->{type}, $page_ref);
+
+    foreach my $type (@all_types) {
+        INFO('adding ' . $page_ref->title() . " to the list as type $type");
+        add_to_no_iw_list($page_ref, $type);
+        $type_count{$type}++;
+    }
+
+    return 1;
+}
+
+sub get_all_types {
+    my ($type_param, $page) = @_;
+
+    $type_param //= q{};    # /
+    strip_whitespace($type_param);
+
+    my @all_types = split $TYPE_SEP, $type_param;
+
+    my $page_title = $page->title();
+    if (is_category($page)) {
+        INFO("$page_title is a category");
+        push @all_types, get_string('Category');
+        $statistics{'categories'}++;
+    }
+
+    if (is_in_namespace($page, 'Portal')) {
+        INFO("$page_title is a portal");
+        push @all_types, get_string('Portal');
+        $statistics{'portal'}++;
+    }
+
+    if (is_disambig($page)) {
+        INFO("$page_title is a disambiguation");
+        push @all_types, get_string('disambig');
+        $statistics{'disambig'}++;
+    }
+
+    # Still nothing?
+    if (not scalar @all_types) {
+        my $other_type = get_string('other');
+        INFO("$page_title does not have any type, adding to $other_type");
+        @all_types = ($other_type);
+        $statistics{'automatically added to type other'}++;
+    }
+
+    return @all_types;
+}
+
+sub find_templates {
+    my (
+        $text_ref,               # string ref
+        $found_templates_ref,    # array ref
+        $filter                  # string array ref
+    ) = @_;
+
+    # A reference to an array with one empty string.
+    # Matching against an empty string will always succeed.
+    $filter //= [q{}];    # /
+
+    # Get all highest-level matches
+    my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo);
+
+    MATCH:
+    foreach my $next_match (@matches) {
+        if ($next_match !~ $TEMPLATE_RE) {
+            INFO(q(I thought that it was a template, but it was:));
+            if ($next_match =~ $WIKITABLE_RE) {
+                INFO('a wikitable');
+            }
+            else {
+                INFO("something else: $next_match");
+            }
+            INFO(q{});
+            next MATCH;
+        }
+
+        foreach my $next_filter (@{$filter}) {
+
+            # N.B. - case-insensitive. Wrong, but kinda useful.
+            if ($next_match =~
+                /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi)
+            {
+
+                # N.B.: parse_template calls find_templates() recursively
+                my $parsed_template =
+                    parse_template(\$next_match, [qw(date type)], $filter);
+                push @{$found_templates_ref}, $parsed_template;
+            }
+        }
+    }
+
+    return $found_templates_ref;
+}
+
+sub parse_template {
+    my (
+        $template,               # string ref
+        $default_param_names,    # string array ref
+        $subtemplate_filter,     # string array ref
+    ) = @_;
+
+    # %parsed_template:
+    # {text} - string ref; might be big
+    # {name} - string
+    # {params} - hash ref
+    # {subtemplates} - array ref
+    my (%parsed_template, %parsed_params, @clauses);
+
+    $parsed_template{text} = strip_template_curlies(${$template});
+
+    # First string of the split is the template name,
+    # the rest is the params
+    ($parsed_template{name}, @clauses) =
+        (split $PARAM_SEP_RE, ${ $parsed_template{text} });
+
+    my $param_counter = 0;
+    foreach my $clause (@clauses) {
+        if ($clause =~ $PARAM_RE) {
+            #<<< no perltidy
+            my ($name, $value) = @+{ qw(param_name value) };
+            #>>>
+            if (defined $name) {
+                if (defined $param_name{$name}) {
+                    $name = $param_name{$name};
+                }
+            }
+            else {
+
+                # Get next default param name.
+                # If a default name is not defined, just use the number
+                $name = $default_param_names->[$param_counter]
+                    || $param_counter;
+            }
+            $parsed_params{$name} = $value;
+        }
+        else {
+            my $error_msg = "Weird - $clause does not look like a param";
+            INFO($error_msg);
+            cluck($error_msg);
+            $statistics{'weird param'}++;
+        }
+        $param_counter++;
+    }
+    $parsed_template{params} = \%parsed_params;
+
+    # Possible recursion
+    find_templates($parsed_template{text}, $parsed_template{subtemplates},
+        $subtemplate_filter);
+
+    return \%parsed_template;
+}
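+# Illustrative sketch (the 'no_iw' name here is hypothetical - the real
+# template name is localized per wiki): for input text like
+#     {{no_iw|type=history|date=12:34, 5 October 2009 (UTC)}}
+# the returned hash is shaped roughly like:
+#     name   => 'no_iw',
+#     text   => a reference to the text between the curlies,
+#     params => { type => 'history', date => '12:34, 5 October 2009 (UTC)' },
+#     subtemplates => parsed inner templates, when any match the filter.
+# An unnamed first parameter would get the default name 'date', the second
+# 'type', per the [qw(date type)] list passed in by find_templates().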
+
+sub parse_date {
+    my ($date_str) = @_;
+
+    return if (not defined $date_str);
+
+    if ($date_str =~ $HEB_DATE_RE) {
+        INFO("found a valid date: $date_str");
+        my %parsed_date = (
+            'sec' => 0,    # useful for timelocal
+        );
+        foreach my $date_part (qw(hour min mday mon year)) {
+            $parsed_date{$date_part} = $+{$date_part};
+        }
+        $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} };
+
+        # strip leading zeros
+        foreach (values %parsed_date) {
+            s{
+                \A
+                0+
+                (?<number>\d+)
+            }
+            {$+{number}}xms;
+        }
+        return \%parsed_date;
+    }
+
+    # Returns undef for an invalid date
+    return;
+}
+
+sub strip_template_curlies {
+    my ($template) = @_;
+    for ($template) {
+        s{
+            \A
+            \Q$MW_SYNTAX{start_template}\E
+        }{}xms;
+        s{
+            \Q$MW_SYNTAX{end_template}\E
+            \z
+        }{}xms;
+    }
+    return \$template;
+}
+
+# no arg unpacking for simplicity and performance
+sub strip_whitespace {    ## no critic (RequireArgUnpacking)
+    for (@_) {
+        s/\A\s*//xms;
+        s/\s*\z//xms;
+    }
+    return;
+}
+
+sub has_interwiki {
+    my ($page) = @_;
+
+    my $page_title = $page->title();
+    my $page_text  = ${ $page->text() };    # XXX
+
+    study $page_text;                       # XXX
+
+    my %iw_links;
+    my %special_cases;
+
+    while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) {    # XXX
+        my ($lang_code, $foreign_article) =
+            @+{qw(lang_code foreign_article)};
+        if (defined $iw_links{$lang_code}) {
+            $special_cases{double_links}->{$lang_code} = q{};
+        }
+        else {
+            $iw_links{$lang_code} = $foreign_article;
+        }
+
+        # A # sign not after an &.
+        # After an & it is probably a character number.
+        if ($foreign_article =~ $SECTION_LINK_RE) {
+            $special_cases{section_links}->{$lang_code} = q{};
+        }
+
+        # Char codes are common in section links, so there is no
+        # need to show them again
+        elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) {
+            $special_cases{charnumber_links}->{$lang_code} = q{};
+        }
+
+        # Lowercase links
+        if (    (not $lang_code ~~ @LOWERCASE_LANGS)
+            and ($foreign_article =~ $LOWERCASE_LINK_RE))
+        {
+            my $include_lowercase_link = 1;
+
+            if (defined get_string('exclude_lowercase', 'if defined')
+                and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
+            {
+                $include_lowercase_link = 0;
+            }
+
+            if ($include_lowercase_link) {
+                $special_cases{lowercase_links}->{$lang_code} = q{};
+            }
+        }
+
+        $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{};
+    }
+
+    my @all_langs = keys %iw_links;
+    my $count_iw  = scalar @all_langs;
+    if ($count_iw) {
+        if (not defined $statistics{count_iw}->[$count_iw]) {
+            $statistics{count_iw}->[$count_iw] = [];
+        }
+        push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
+    }
+    INFO("iw link count for $page_title is: $count_iw");
+
+    for my $special_case_name (keys %special_cases) {
+        if (scalar %{ $special_cases{$special_case_name} }) {
+            special_cases_file($special_case_name, $page,
+                $special_cases{$special_case_name});
+        }
+    }
+
+    # BIG XXX Still very stupid, but getting better
+    if (defined $iw_links{en}) {
+        return 'en';
+    }
+
+    return q{};
+}
+
+sub special_cases_file {
+    my ($special_case_name, $page, $special_cases_ref) = @_;
+    $special_cases_ref //= {};    # /
+
+    my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
+    if ($special_case_langs) {
+        $special_case_langs = " ($special_case_langs)";
+    }
+
+    my $special_case_fn = make_type_fn($special_case_name, 1);
+
+    if (not -e $special_case_fn) {
+        append_to_file($special_case_fn, $special_case_name);
+    }
+
+    my $page_title = $page->title();
+    my $link       = make_link($page_title);
+    my $line =
+          $link
+        . $special_case_langs
+        . $FIELD_SEP
+        . get_sort_title($page_title);
+
+    append_to_file($special_case_fn, $line);
+
+    return;
+}
+
+sub remove_template_no_iw {
+    my ($params) = @_;
+    INFO( "Supposed to remove the no_iw template now, but ...\n"
+        . 'This sub is a stub. You can help Wikipedia by expanding it!');
+
+    return 0;
+}
+
+sub cooling_date_passed {
+    my ($date_ref) = @_;
+
+    my @page_times = @{$date_ref}{qw(sec min hour mday mon year)};
+    INFO("page times: @page_times");
+
+    my $pageseconds = timelocal(@page_times);
+    return $pageseconds < $LATEST_COOLING;
+}
+
+sub update_cooling_date {
+    my ($params) = @_;
+
+    INFO( "Supposed to update cooling date now, but ...\n"
+        . "This sub is a stub. You can help Wikipedia by expanding it!\n"
+    );
+
+    return 0;
+}
+
+# Just a debugging thingie
+sub print_parsed_template {
+    my ($template_ref) = @_;    # hash ref
+
+    INFO('text:');
+    INFO(${ $template_ref->{text} });
+    INFO('name:');
+    INFO($template_ref->{name});
+    INFO('params:');
+    foreach my $next_param (sort keys %{ $template_ref->{params} }) {
+        INFO("$next_param: $template_ref->{params}->{$next_param}");
+    }
+
+    if (defined $template_ref->{subtemplates}) {
+        INFO("subtemplates: $template_ref->{subtemplates}");
+    }
+
+    return;
+}
+
+sub get_sort_title {
+    my ($page_title) = @_;
+
+    my $sort_title;
+    if ($page_title =~ $PURE_TITLE_RE) {
+        $sort_title = $+{pure_title};
+    }
+    else {
+        INFO( 'Something badly weird happened - '
+            . "$page_title has no pure title");
+        croak('A page has no pure title');
+    }
+
+    return $sort_title;
+}
"$page_title has no pure title"); + croak('A page has no pure title'); + } + + return $sort_title; +} + +sub add_to_no_iw_list { + my ( + $page, # ref + $page_type, # string + ) = @_; + + my $page_title = $page->title(); + INFO("Adding $page_title to no_iw_list"); + + my $sort_title = get_sort_title($page_title); + INFO("full title: $page_title"); + INFO("sort title: $sort_title"); + + my $unsorted_type_fn = make_type_fn($page_type, 1); + if (not -e $unsorted_type_fn) { + append_to_file($unsorted_type_fn, $page_type); + } + my $link = make_link($page_title); + INFO("link to page: $link"); + my $line = $link . $FIELD_SEP . $sort_title; + + append_to_file($unsorted_type_fn, $line); + + return; +} + +sub make_link { + my ($page_title) = @_; + + my $link_to_page = + $MW_SYNTAX{start_link} + . $MW_SYNTAX{namespace_sep} + . $page_title + . $MW_SYNTAX{end_link}; + + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; + } + + return $link_to_page; +} + +sub create_no_iw_pages { + my ($params) = @_; + + INFO("\ncreating no_iw pages"); + + # Run over page types + UNSORTED_TYPE_FN: + foreach my $unsorted_type_fn (glob "$UNSORTED_DIR/*") { + my %all_pages_in_type = (); + open my $unsorted_type_file, '<', $unsorted_type_fn + or croak(file_error('opening', $unsorted_type_fn, 'reading')); + my @lines = <$unsorted_type_file>; + close $unsorted_type_file + or croak(file_error('closing', $unsorted_type_fn, 'reading')); + + my $type_name = shift @lines; + chomp $type_name; + foreach my $line (@lines) { + chomp $line; + my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line; + my $sort_letter = get_sort_letter($sort_title); + $all_pages_in_type{$sort_letter} //= []; # / + push @{ $all_pages_in_type{$sort_letter} }, $page_title; + } + write_sorted_pages($type_name, \%all_pages_in_type); + } + return; +} + +sub write_sorted_pages { + my ($type_name, $type_tree_ref) = @_; + + my $type_fn = make_type_fn($type_name); + + my $section_counter = 0; + my $page = q{}; + my $file_number = 1; + + LETTER: + foreach my $next_letter (sort keys %{$type_tree_ref}) { + my @all_links_in_letter = + sort @{ $type_tree_ref->{$next_letter} }; + + my $links_count = scalar @all_links_in_letter; + my $sections = + ($links_count - 1) / $option{max_links_per_section} + 1; + my $links_per_section = $links_count / $sections; + + SECTION: + foreach my $section_num (1 .. $sections) { + if ($section_counter == $option{max_sections_per_page}) { + write_page(\$page, \$type_fn, $file_number++); + $section_counter = 0; + + $page = q{}; + } + elsif ($section_counter) { + $page .= $MW_SYNTAX{paragraph}; + } + $section_counter++; + my $title = $next_letter; + if ($sections > 1) { + $title .= " $section_num"; + } + $page .= mw_heading($title); + + my $first_link = ($section_num - 1) * $links_per_section; + my $last_link = + ($section_num == $sections) + ? $links_count - 1 + : $first_link + $links_per_section - 1; + my $links = join_links( + [ @all_links_in_letter[ $first_link .. 
+
+sub is_redirect {
+    my ($page) = @_;
+
+    if ($page->redirect()) {
+        return 'English';
+    }
+
+    my $page_text_ref = $page->text();
+
+    if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) {
+        return 'local';
+    }
+
+    return q();
+}
+
+sub is_in_namespace {
+    my ($page, @namespaces) = @_;
+
+    return namespace($page) ~~ [ map { get_string($_) } @namespaces ];
+}
+
+sub is_category {
+    my ($page) = @_;
+    return is_in_namespace($page, 'Category');
+}
+
+sub is_disambig {
+    my ($page) = @_;
+    my $found_templates =
+        find_templates($page->text(), [], [ get_string('disambig') ]);
+    return scalar @{$found_templates};
+}
+
+#
+# This is poor man's gettext.
+# TODO: Replace it with Locale::Maketext
+#
+sub get_strings {
+    my ($lang) = @_;
+
+    my $STRINGS_FN = "$lang.strings.txt";
+
+    open my $STRINGS_FILE, '<:utf8', $STRINGS_FN
+        or croak(file_error('opening', $STRINGS_FN, 'reading'));
+    my @strings_file_lines = <$STRINGS_FILE>;
+    close $STRINGS_FILE
+        or croak(file_error('closing', $STRINGS_FN, 'reading'));
+
+    STRING_LINE:
+    foreach my $next_string_line (@strings_file_lines) {
+
+        # Skip blanks and comments
+        next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE);
+
+        chomp $next_string_line;
+        my ($english, $target) = split $FIELD_SEP_RE, $next_string_line;
+
+        # Fall back to English if no target language string was supplied
+        $STRING{$english} = $target // $english;    # /
+    }
+
+    return;
+}
+
+sub get_string {
+    my ($english, $if_defined) = @_;
+    return $if_defined
+        ? ($STRING{$english})
+        : ($STRING{$english} //= $english);    # /
+}
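+# For example, get_string('Category') returns the localized namespace name
+# loaded from the strings file, or 'Category' itself when no translation
+# was supplied; with $if_defined set, a missing string yields undef instead
+# of being cached as its own translation.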
+
+sub make_type_fn {
+    my ($type, $unsorted) = @_;
+    $unsorted //= 0;    # /
+
+    #my $transliterated_type = $TRANSLITERATOR->translit($type);
+    my $transliterated_type = $type;
+
+    my $type_fn = "$transliterated_type.$WIKITEXT_EXT";
+
+    $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo;
+    my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR;
+    $type_fn = "$dir/$type_fn";
+
+    return $type_fn;
+}
+
+sub append_to_file {
+    my ($fn, $line) = @_;
+
+    open my $file, '>>:utf8', $fn
+        or croak(file_error('opening', $fn, 'appending'));
+
+    say {$file} ($line // q{});    # /
+
+    close $file
+        or croak(file_error('closing', $fn, 'appending'));
+
+    return;
+}
+
+# It appears simple, but in non-alphabetic languages such as Chinese
+# it may be different, so it will sit here ready for better i18n.
+sub get_sort_letter {
+    my ($string) = @_;
+    return substr $string, 0, 1;
+}
+
+sub file_error {
+    my ($operation, $fn, $access_type) = @_;
+    my $string = "error $operation $fn for $access_type: $OS_ERROR";
+    return $string;
+}
+
+sub format_link_table {    ## no critic (RequireArgUnpacking)
+    return sprintf '%-15s %8d', @_;
+}
+
+sub print_multi_links_by_foreign {
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+            if (scalar @local_articles > 1) {
+                my $links = join q{ | }, sort map { make_link($_) }
+                    keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+                my $foreign_title =
+                    make_link($lang_code
+                        . $MW_SYNTAX{namespace_sep}
+                        . $foreign_article);
+                append_to_file($filename, "* '''$foreign_title''' - $links");
+            }
+        }
+    }
+
+    return;
+}
+
+sub print_multi_links_by_local {
+    my %local_multi_links;
+    LANG_CODE:
+    foreach my $lang_code (sort keys %found_links) {
+        my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+        say format_link_table($lang_code, scalar @foreign_articles);
+
+        FOREIGN_ARTICLE:
+        foreach my $foreign_article (@foreign_articles) {
+            my @local_articles =
+                keys %{ $found_links{$lang_code}->{$foreign_article} };
+
+            if (scalar @local_articles > 1) {
+                add_local_multi(
+                    \%local_multi_links,
+                    make_link(
+                        $lang_code
+                            . $MW_SYNTAX{namespace_sep}
+                            . $foreign_article
+                    ),
+                    @local_articles
+                );
+            }
+        }
+    }
+
+    say 'writing local multilinks file';
+    my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
+    foreach my $local_multi_article (sort keys %local_multi_links) {
+        append_to_file($filename,
+            '* ' . mw_bold(make_link($local_multi_article)));
+
+        foreach my $other_local_article (
+            sort keys %{ $local_multi_links{$local_multi_article} })
+        {
+            append_to_file($filename,
+                '** ' . make_link($other_local_article));
+            my $foreign_articles = join_links(
+                $local_multi_links{$local_multi_article}
+                    ->{$other_local_article},
+                0
+            );
+            append_to_file($filename, "*** $foreign_articles");
+        }
+        append_to_file($filename);
+    }
+
+    return;
+}
+
+sub add_local_multi {
+    my (
+        $local_multi_links_ref, $foreign_link,
+        $first_local_article,   @other_local_articles
+    ) = @_;
+
+    $local_multi_links_ref->{$first_local_article} //= {};    # /
+
+    foreach my $other_local_article (@other_local_articles) {
+        $local_multi_links_ref->{$first_local_article}
+            ->{$other_local_article} //= [];    # /
+        push @{ $local_multi_links_ref->{$first_local_article}
+                ->{$other_local_article} }, $foreign_link;
+    }
+
+    return;
+}
+
+sub join_links {
+    my ($links_ref, $line_end) = @_;
+    $line_end //= 1;    # /
+
+    my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ });
+    return join $link_sep, @{$links_ref};
+}
+
+sub print_stats {
+    INFO("\nSUMMARY");
+    INFO("total time: $total_time");
+    foreach my $stat_type (sort keys %statistics) {
+        if (not ref $statistics{$stat_type}) {
+            INFO("$stat_type: $statistics{$stat_type}");
+        }
+    }
+
+    my $max_iw_index = $#{ $statistics{count_iw} };
+    INFO("max_iw_index: $max_iw_index");
+    MAX_IW:
+    for my $max_iw_place (0 .. $option{max_iw_places}) {
+        my @links =
+            map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }
+            or last MAX_IW;
+        INFO("# $max_iw_index: " . join_links(\@links, 0));
+
+        # Do nothing, just count down to the next index with a defined list.
+        # $max_iw_index needs to be checked for nonzero-ness for the rare case
+        # of a very low page count.
+        while ($max_iw_index
+            and not defined $statistics{count_iw}->[ --$max_iw_index ])
+        {
+        }
+    }
+
+    INFO('pages without interwiki links per namespace');
+    foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) {
+        my $iwless_in_namespace =
+            $statistics{'has no interwiki link'}->{$namespace};
+
+        no integer;
+        ## no critic (ProhibitMagicNumbers)
+        my $percentage = sprintf '%.2f',
+            100 * $iwless_in_namespace / $namespace_count{$namespace};
+        ## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
+        use integer;
+
+        INFO("$namespace: $iwless_in_namespace, $percentage%");
+    }
+
+    INFO("\nNAMESPACES");
+    foreach my $namespace (sort keys %namespace_count) {
+        INFO("$namespace: $namespace_count{$namespace}");
+    }
+
+    INFO("\nTYPES");
+    foreach my $type (sort keys %type_count) {
+        INFO("$type: $type_count{$type}");
+    }
+
+    my $dump_size = -s $dump_fn;
+    my $log_size  = -s $LOG_FN;
+    INFO("dump size: $dump_size");
+    INFO("log size: $log_size");
+    if ($log_size > 0) {
+        my $dump_log_ratio = $dump_size / $log_size;
+        INFO("dump/log ratio: $dump_log_ratio");
+    }
+    else {
+        WARN("weird: the size of log file $LOG_FN is 0");
+    }
+
+    return;
+}
+
+__END__
+
+=head1 NAME
+
+C<prepare_noiw_list.pl> - Searches a MediaWiki dump for pages without
+interlanguage links and prepares categorized lists of those pages.
+
+=head1 VERSION
+
+This documentation refers to
+
+prepare_noiw_list.pl
+
+version 0.2.1 - Noa.
+
+=head1 USAGE
+
+=over
+
+=item * C<prepare_noiw_list.pl ./big-files/ruwiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --rtl ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=item * C<prepare_noiw_list.pl --stop_after=20000 ./big-files/hewiki-20080420-pages-meta-current.xml>
+
+=back
+
+=head1 REQUIRED ARGUMENTS
+
+=over
+
+=item * MediaWiki dump file name is required
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item * --rtl makes special fixes for right-to-left languages.
+
+=item * --stop_after=NUMBER Stops processing after the page with the given
+NUMBER.
+
+=item * --start_from=NUMBER Begins processing at the page with the given
+NUMBER.
+
+=item * --page_freq=NUMBER Print the page counter every NUMBER of
+pages. 1000 by default.
+
+=item * --max_links_per_section Maximum number of links per section in
+the output page. Default is 100.
+
+=item * --max_sections_per_page Maximum number of sections per output
+page. Default is 20.
+
+=item * --max_iw_places Number of places to print in the statistics of
+pages with the most interlanguage links.
+
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
+=back
+
+=head1 DESCRIPTION
+
+The main goal of this program is to find pages which do not have
+interwiki (interlanguage) links to certain languages.
+
+This program scans a MediaWiki XML dump file. It searches every page for
+interwiki links and for a special hidden template, which defines to which
+types of interwiki-less pages the page belongs. An example of such a
+template appears after this list.
+
+=over
+
+=item * If the page contains links to the defined languages and does not
+contain the "no interwiki" template, its processing stops.
+
+=item * If the page contains links to the defined languages and contains
+this template, it is logged, so the template can be removed. (It is planned
+that it will be removed automatically in the future.)
+
+=item * If the page contains no links to the defined languages and does not
+contain the template, it is automatically added to type "other".
+
+=item * If the page contains no links to the defined languages and contains
+a template with types, it is added to the defined types.
+
+=back
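+
+For illustration, a hedged example of what the hidden template may look
+like (the template name and the parameter values are localized per wiki,
+so the exact form varies):
+
+    {{no_iw|type=history; geography|date=13:07, 1 October 2009 (UTC)}}
+
+Multiple types are separated by a semicolon, which matches the type
+separator the parser uses.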
+
+Pages without links are added to nicely formatted lists according
+to their type.
+
+This program also collects some information on the way about problematic
+cases - more than one "no interwiki" template, invalid templates, invalid
+or redundant interwiki links etc.
+
+You can get a dump to use with this program at
+L<http://download.wikimedia.org/>.
+
+=head1 DIAGNOSTICS
+
+=head2 Invalid command line options.
+
+See L</"REQUIRED ARGUMENTS"> and L</"OPTIONS">.
+
+=head2 FILENAME is a weird dump file name.
+
+The dump file does not appear to have a standard name of the kind that
+appears at L<http://download.wikimedia.org/>.
+
+=head2 error opening FILENAME ...
+
+=head2 error closing FILENAME ...
+
+Check the corresponding error.
+
+=head2 unable to handle any case setting besides 'first-letter'
+
+Something is weird with the dump. See the documentation of
+L<Parse::MediaWikiDump> and MediaWiki.
+
+=head2 A page has no pure title
+
+Something is particularly weird with the name of a page. The program cannot
+separate its name from its namespace. It can also be a bug in this program.
+
+=head2 Weird - STRING does not look like a param
+
+STRING is supposed to be a parameter in a template, but it does not look
+like one. It could be an error in the template, and also a bug in this
+program (the parser that this program employs is rather stupid).
+
+=head2 Unicode character 0xNUMBER is illegal
+
+This is a standard Perl warning. It may appear if a page or its title has
+funky Unicode characters which should not be there according to the Unicode
+standard (to be more precise, according to the implementation of this
+standard in your version of perl). Most probably these characters are not
+supposed to be in the page and should be fixed, but otherwise this issue
+is not supposed to affect the functionality of this program significantly.
+
+This was reported as a MediaWiki bug:
+L<https://bugzilla.wikimedia.org/show_bug.cgi?id=14600>
+
+=head1 EXIT STATUS
+
+Nothing interesting, read the log :)
+
+=head1 CONFIGURATION
+
+=head2 Language codes file
+
+The file name has the form C<LANGUAGE_CODE.language_codes.txt>, where
+LANGUAGE_CODE is the code of the wiki which is being processed. It includes
+a list of the language codes which are searched, the full name of each
+language and the group to which the language belongs.
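+
+A minimal illustrative example (columns separated by a single tab; these
+rows are made up, not a required set):
+
+    en	English	Germanic
+    ru	Russian	Slavic
+
+Only the code and the name are currently used by the loader; a missing
+name falls back to the code itself.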
+
+=head2 Localized strings file
+
+The file name has the form C<LANGUAGE_CODE.strings.txt>, where LANGUAGE_CODE
+is the code of the wiki which is being processed. This file includes pairs
+of strings - the English name of a feature and its localized counterpart.
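+
+A minimal illustrative example (tab-separated English/localized pairs; the
+Russian values here are only for demonstration):
+
+    REDIRECT	перенаправление
+    Category	Категория
+
+A line with no localized value falls back to the English string.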
+
+=head2 Log
+
+The log is called outtest.log.
+
+=head1 DEPENDENCIES
+
+This module requires these CPAN modules:
+
+=over
+
+=item * C<Parse::MediaWikiDump>
+
+This module is used for reading pages from the XML dump.
+
+=item * C<Regexp::Common>
+
+This module is used for searching for templates which use balanced
+parentheses, but pretty badly. :)
+
+=item * C<Log::Log4perl>
+
+This module provides easy global logging.
+
+=item * C<Lingua::Translit>
+
+This module is used for transliterating filenames to ASCII.
+
+=item * C<Readonly>
+
+To make Perl::Critic happy :)
+
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but I
+love style guides.
+
+=back
+
+=head1 HACKING
+
+=head2 Perl 5.10
+
+This program requires Perl 5.10. It has new clean and useful syntax, which
+makes the programs easier to hack, maintain and debug. It is useless to try
+to run it on an older version, unless you want to waste your time
+backporting. Please upgrade your Perl installation if you still have 5.8 or
+(horrors!) something older.
+
+=head2 Perl Best Practices, Perl::Critic and perltidy
+
+Great effort has been put into making this source code pass the
+Perl::Critic tests in 'brutal' mode as cleanly as possible. It also uses
+perltidy for automatic code formatting. If you modify it, do yourself a
+favor: install Perl::Critic and regularly test the code using this command:
+
+    ./tidy.sh prepare_noiw_list.pl
+
+It checks the syntax, runs perltidy on the code and runs Perl::Critic.
+
+All the places where P::C has been disabled using "no critic" are explained.
+
+The time invested in making the code P::C-friendly will be returned as time
+saved on debugging. Also consider reading the book "Perl Best Practices" by
+Damian Conway if you have not already.
+
+=head1 INCOMPATIBILITIES
+
+=head2 Unicode issues
+
+This program works best on GNU/Linux, where Perl and the filesystem are
+Unicode-friendly.
+
+This program was also tested on Windows XP and Vista with ActivePerl and
+Cygwin. In these environments Unicode-related issues caused
+filenames and clipboard text to become jumbled. You have been warned.
+
+=head1 BUGS AND LIMITATIONS
+
+Please report all bugs, feature requests and other comments to
+Amir E. Aharoni (ami...@ma...).
+
+=head2 There is no equality between languages
+
+Currently this program actually only lists pages which do not have
+an interwiki link to the English Wikipedia. This is obviously not useful on
+the English Wikipedia and is conceptually problematic on other Wikipedias,
+too. This is being fixed, but it is not simple to do it Right.
+
+Goal: v0.4 Reut
+
+=head2 Internationalization is far from perfect
+
+Date handling and string localization are very primitive. There are plans
+to upgrade them to smarter modules such as Locale::Maketext.
+
+Goal: v0.6 Itay
+
+=head2 MediaWiki parsing is ad hoc
+
+This program only does very rudimentary and ad hoc MediaWiki syntax parsing.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Templates are removed semi-automatically
+
+Templates on pages which already have the needed links are not removed
+automatically. A list of them is created, and a bot can run on it and
+remove the outdated templates. This could be done automatically in the
+future.
+
+Goal: None at the moment, it works well enough.
+
+=head2 Cooling date
+
+The implementation of the cooling date is very rudimentary.
+
+Goal: v0.4 Reut
+
+=head2 Major refactoring is needed
+
+=over
+
+=item * The main code is on the brink of exceeding the complexity threshold
+that P::C accepts.
+
+=item * There is no separation of searching and formatting. There are two
+main functions here: C<process_dump()> and C<create_no_iw_pages()>. They
+are doing separate things and should run from different programs.
+
+=item * Statistics and multi links are just dumped into the log.
+
+=item * At least some of the code can be rewritten as classes that inherit
+from L<Parse::MediaWikiDump>.
+
+=back
+
+Goal: v0.8 Moshe
+
+=head2 There is no test suite
+
+That can be done after proper modularization. Also, a local test MediaWiki
+server would be needed.
+
+Goal: v1.0 Drora
+
+=head1 HISTORY
+
+=over
+
+=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty
+categories sorting. Memory usage optimization - accumulating information in
+files. More generic, but far-from-perfect handling of links to languages
+other than English. Transliteration with Lingua::Translit. Logging with
+Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not
+finished: complete statistics, removal of templates from pages which
+already have links.
+
+=item * B<0.1 - First and unnamed Amir E. Aharoni's version>: Types
+introduced. Conceptual l10n, but only tested on he.wiki. Still en links
+only. Informative, though low-tech, logging.
+
+=item * B<0.0 - Felagund's version>: no types, he.wiki only, en links only,
+but it uploaded the lists automatically...
+
+=back
+
+=head1 AUTHOR
+
+=over
+
+=item * Creator is Nadav Perez (Felagund)
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Felagund>
+
+=back
+
+=item * It is (roughly) based on another bot by Guy Shaked (Costello).
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Costello>
+
+=back
+
+=item * Then Amir E. Aharoni (Amire80) came and mutilated it beyond
+recognition.
+
+=over
+
+=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Amire80>
+
+=item * English: L<http://en.wikipedia.org/wiki/User:Amire80>
+
+=item * Russian: L<http://ru.wikipedia.org/wiki/User:Amire80>
+
+=back
+
+=back
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
+I<Visca la llibertat!>
+
+=cut
+
+1;

Property changes on: trunk/no-interwiki/Maintenance.pm
___________________________________________________________________
Added: svn:executable
   + *