Thread: [PerlWikiBot] SF.net SVN: perlwikibot:[38] trunk
Status: Pre-Alpha
From: <rot...@us...> - 2008-07-28 13:35:50
Revision: 38
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=38&view=rev
Author: rotemliss
Date: 2008-07-28 13:35:57 +0000 (Mon, 28 Jul 2008)
Log Message:
-----------
Make room for more modules.
Added Paths:
-----------
trunk/perlwikibot/
trunk/perlwikibot/README
trunk/perlwikibot/bot.pl
trunk/perlwikibot/config/
trunk/perlwikibot/dumps/
trunk/perlwikibot/includes/
Removed Paths:
-------------
trunk/README
trunk/bot.pl
trunk/config/
trunk/dumps/
trunk/includes/
Deleted: trunk/README
===================================================================
--- trunk/README 2007-03-24 15:51:07 UTC (rev 37)
+++ trunk/README 2008-07-28 13:35:57 UTC (rev 38)
@@ -1,13 +0,0 @@
-This is a bot written in Perl for use on MediaWiki sites, especially Wikimedia Foundation projects.
-
-This bot is licensed under the terms of the GNU GPL. It is based on the original work of Costello and Felagund (Nadav) from the Hebrew Wikipedia. Authors:
-* Rotem Liss
-
-The file includes/dump.pm was downloaded from:
-http://search.cpan.org/~triddle/Parse-MediaWikiDump-0.31/
-
-You should get the XML dump file that includes all the pages, but only their current revisions. It is called pages-meta-current.xml.bz2 on http://download.wikimedia.org/, and for your own installation you can create it with "maintenance/dumpBackup.php --current". Then extract it into "dumps".
-
-Next, copy the files "configure.sample" and "runtime.sample" in the "config" folder to "configure.pm" and "runtime.pm", and configure them. In the configure file, first define the servers, then the actions; then, in the runtime file, define which actions to execute and, most importantly, set "$sendPages" to 1 once you have finished testing.
-
-After that, you can run "bot.pl" with the site name as its first parameter, and with "sysop" as a second parameter if you want to use the sysop user (without it, actions that require sysop rights will not be executed). Later on, edit the configure file whenever you want to change the actions, and edit the run-time file to enable or disable page sends and to choose which actions to run.
Deleted: trunk/bot.pl
===================================================================
--- trunk/bot.pl 2007-03-24 15:51:07 UTC (rev 37)
+++ trunk/bot.pl 2008-07-28 13:35:57 UTC (rev 38)
@@ -1,59 +0,0 @@
-#!/usr/bin/perl
-
-# Code style
-use warnings;
-use strict;
-
-# Libraries
-use includes::actions;
-use includes::dump;
-use includes::functions;
-use includes::http;
-use config::configure;
-use config::runtime;
-
-# Do nothing if no server specified
-if ( !$ARGV[0] ) {
- die "No server name set - please set server name!\n";
-}
-
-my $sysop;
-if ( defined $ARGV[1] && $ARGV[1] eq "sysop" ) {
- $sysop = 1;
-} else {
- $sysop = 0;
-}
-
-# Show notes about the sending pages configuration
-if ( $configure::sendPages == 0 ) {
- print "Note: There will be no changes in the real server. Please set sendPages to 1 to do them.\n";
-} elsif ( $configure::sendPages == 2 ) {
- print "Note: A prefix will be used when editing pages. Please set sendPages to 1 to disable that.\n";
-}
-
-# Log in to the server only if changes will actually be made on it
-if ( $configure::sendPages == 1 || $configure::sendPages == 2 ) {
- functions::login( $sysop );
-}
-
-# All the matching pages in the XML file
-my @pages = functions::getMatchingPages( $sysop );
-
-# Execute actions of all the pages
-for ( my $i = 0; $i <= $#pages; $i += 2 ) {
- # Get title and action
- my $title = $pages[$i];
- my $action = $pages[$i + 1];
- my $actionName = $configure::actions{ $action };
-
- # Execute the action
- if ( $actionName eq "replace" ) {
- functions::replaceInPage( $title, $action );
- } elsif ( $actionName eq "refresh" ) {
- functions::refreshPage( $title, $action );
- } elsif ( $actionName eq "move" ) {
- functions::movePage( $title, $action );
- } elsif ( $actionName eq "delete" ) {
- functions::deletePage( $title, $action );
- }
-}
Copied: trunk/perlwikibot/README (from rev 37, trunk/README)
===================================================================
--- trunk/perlwikibot/README (rev 0)
+++ trunk/perlwikibot/README 2008-07-28 13:35:57 UTC (rev 38)
@@ -0,0 +1,13 @@
+This is a bot written in Perl for use on MediaWiki sites, especially Wikimedia Foundation projects.
+
+This bot is licensed under the terms of the GNU GPL. It is based on the original work of Costello and Felagund (Nadav) from the Hebrew Wikipedia. Authors:
+* Rotem Liss
+
+The file includes/dump.pm was downloaded from:
+http://search.cpan.org/~triddle/Parse-MediaWikiDump-0.31/
+
+You should get the XML dump file that includes all the pages, but only their current revisions. It is called pages-meta-current.xml.bz2 on http://download.wikimedia.org/, and for your own installation you can create it with "maintenance/dumpBackup.php --current". Then extract it into "dumps".
+
+Next, copy the files "configure.sample" and "runtime.sample" in the "config" folder to "configure.pm" and "runtime.pm", and configure them. In the configure file, first define the servers, then the actions; then, in the runtime file, define which actions to execute and, most importantly, set "$sendPages" to 1 once you have finished testing.
+
+After that, you can run "bot.pl" with the site name as its first parameter, and with "sysop" as a second parameter if you want to use the sysop user (without it, actions that require sysop rights will not be executed). Later on, edit the configure file whenever you want to change the actions, and edit the run-time file to enable or disable page sends and to choose which actions to run.
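
For orientation, here is a minimal sketch of what config/configure.pm might contain, inferred only from how bot.pl (below) reads it: $sendPages and %actions are the package variables it actually uses, while the action keys shown here are made-up placeholders. The authoritative format is whatever "configure.sample" defines.

    package configure;

    # 0 = dry run (nothing is sent), 1 = really send pages,
    # 2 = edit under a test prefix (see the notes printed by bot.pl)
    our $sendPages = 0;

    # Maps an action key to the operation bot.pl dispatches on:
    # "replace", "refresh", "move" or "delete".
    our %actions = (
        fix_typo    => 'replace',   # placeholder
        purge_cache => 'refresh',   # placeholder
    );

    1;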
Copied: trunk/perlwikibot/bot.pl (from rev 37, trunk/bot.pl)
===================================================================
--- trunk/perlwikibot/bot.pl (rev 0)
+++ trunk/perlwikibot/bot.pl 2008-07-28 13:35:57 UTC (rev 38)
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+
+# Code style
+use warnings;
+use strict;
+
+# Libraries
+use includes::actions;
+use includes::dump;
+use includes::functions;
+use includes::http;
+use config::configure;
+use config::runtime;
+
+# Do nothing if no server specified
+if ( !$ARGV[0] ) {
+ die "No server name set - please set server name!\n";
+}
+
+my $sysop;
+if ( defined $ARGV[1] && $ARGV[1] eq "sysop" ) {
+ $sysop = 1;
+} else {
+ $sysop = 0;
+}
+
+# Show notes about the sending pages configuration
+if ( $configure::sendPages == 0 ) {
+ print "Note: There will be no changes in the real server. Please set sendPages to 1 to do them.\n";
+} elsif ( $configure::sendPages == 2 ) {
+ print "Note: A prefix will be used when editing pages. Please set sendPages to 1 to disable that.\n";
+}
+
+# Log in to the server only if changes will actually be made on it
+if ( $configure::sendPages == 1 || $configure::sendPages == 2 ) {
+ functions::login( $sysop );
+}
+
+# All the matching pages in the XML file
+my @pages = functions::getMatchingPages( $sysop );
+
+# Execute actions of all the pages
+for ( my $i = 0; $i <= $#pages; $i += 2 ) {
+ # Get title and action
+ my $title = $pages[$i];
+ my $action = $pages[$i + 1];
+ my $actionName = $configure::actions{ $action };
+
+ # Execute the action
+ if ( $actionName eq "replace" ) {
+ functions::replaceInPage( $title, $action );
+ } elsif ( $actionName eq "refresh" ) {
+ functions::refreshPage( $title, $action );
+ } elsif ( $actionName eq "move" ) {
+ functions::movePage( $title, $action );
+ } elsif ( $actionName eq "delete" ) {
+ functions::deletePage( $title, $action );
+ }
+}
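
As a side note, the if/elsif chain above is the sort of code usually written as a dispatch table in Perl. A minimal sketch under the same assumptions (same functions module, same @pages layout); this is an illustration, not part of the commit:

    my %dispatch = (
        replace => \&functions::replaceInPage,
        refresh => \&functions::refreshPage,
        move    => \&functions::movePage,
        delete  => \&functions::deletePage,
    );

    for ( my $i = 0; $i <= $#pages; $i += 2 ) {
        my ( $title, $action ) = @pages[ $i, $i + 1 ];
        my $actionName = $configure::actions{ $action };
        my $handler = defined $actionName ? $dispatch{ $actionName } : undef;
        $handler->( $title, $action ) if $handler;
    }

Unknown action names fall through silently, exactly as in the original chain.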
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
From: <am...@us...> - 2008-07-28 15:28:06
Revision: 39
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=39&view=rev
Author: amire80
Date: 2008-07-28 15:28:06 +0000 (Mon, 28 Jul 2008)
Log Message:
-----------
Adding prepare_noiw_list.pl for the first time.
Added Paths:
-----------
trunk/no-interwiki/
trunk/no-interwiki/prepare_noiw_list.pl
Added: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl (rev 0)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2008-07-28 15:28:06 UTC (rev 39)
@@ -0,0 +1,1495 @@
+#!/usr/bin/perl
+
+# prepare_noiw_list.pl
+# version Noa - development
+
+# See the POD documentation at the end of the file or run
+# perldoc prepare_noiw_list.pl
+
+# "Had he been a French child,
+# he would have heard an infinite number of sentences" - Otto Jespersen
+
+# This program is Free Software; you can redistribute it and/or modify it
+# under the same terms as Perl itself.
+
+# Upgrade! This script uses new Perl 5.10 constructs, so you need Perl 5.10 or later
+use 5.010;
+
+# pragmata
+use strict;
+use warnings;
+use integer;
+use open ':utf8';
+use utf8;
+
+# Standard library
+# These modules should come installed with Perl
+use English qw(-no_match_vars);
+use Carp qw(croak cluck);
+use Time::Local;
+use List::Util qw(first);
+use Getopt::Long;
+use Data::Dumper;
+use File::Basename;
+use Readonly;
+
+# CPAN
+# You must install these modules from CPAN
+
+# Versions before 0.51 have a bug with parsing the namespace of a page
+use Parse::MediaWikiDump 0.51;
+use Regexp::Common;
+use Lingua::Translit;
+use Log::Log4perl qw(:easy);
+
+our $VERSION = '0.1.9.02';
+
+my %PATTERN;
+Readonly my $WIKITEXT_EXT => 'wiki.txt';
+Readonly my $UNSORTED_EXT => "unsorted.$WIKITEXT_EXT";
+Readonly my $ALT_SEP => q{|};
+Readonly my $FIELD_SEP => qq{\t};
+Readonly my $LINK_SEP => q{|};
+Readonly my $TYPE_SEP => qr/\s*;\s*/xms;
+
+Readonly my $DEFAULT_MAX_IW_PLACES => 20;
+Readonly my $DEFAULT_PAGE_FREQ => 1000;
+Readonly my $DEFAULT_MAX_LINKS_PER_SECTION => 100;
+Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
+
+# Initialize logger
+Log::Log4perl->easy_init(
+ {
+ level => $DEBUG, # print everything
+ file => ':utf8>outtest.log', # utf is important
+ layout => '%m%n', # No need to print the date
+ }
+);
+
+my %option = (
+ rtl => 0, # TODO Should identify default by lang code
+ stop_after => 0,
+ start_from => 0,
+ page_freq => $DEFAULT_PAGE_FREQ,
+ max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION,
+ max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE,
+ max_iw_places => $DEFAULT_MAX_IW_PLACES,
+);
+
+my $valid_options = GetOptions(
+    'rtl!' => \$option{rtl},
+    'stop_after=i' => \$option{stop_after},
+    'start_from=i' => \$option{start_from},
+    'page_freq=i' => \$option{page_freq},
+    'max_links_per_section=i' => \$option{max_links_per_section},
+    'max_sections_per_page=i' => \$option{max_sections_per_page},
+    'max_iw_places=i' => \$option{max_iw_places},
+);
+
+if (not $valid_options) {
+ croak('Invalid command line options.');
+}
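
Given the option spec above, a typical invocation looks like this (the dump file name is a hypothetical example; the only real requirement, checked just below, is that the basename start with a language code followed by "wiki"):

    perl prepare_noiw_list.pl --rtl --stop_after=5000 --page_freq=500 \
        hewiki-20080728-pages-meta-current.xml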
+
+$PATTERN{dump_fn} = qr{
+ \A # Begin string
+ (?<wiki_lang>\w+) # Lang code
+ wiki # Is supposed to be after the lang code
+}xms;
+
+$PATTERN{field_sep} = qr{\t}xms;
+
+$PATTERN{string_skip} = qr{
+ \A # Begin string
+ \s* # Zero or more spaces
+ (?:\#.*)? # Comment lines
+ \z # End string
+}xms;
+
+# pages-meta-current
+my $dump_fn = $ARGV[0]
+ or croak('Dump filename must be supplied as an argument.');
+my %STRING;
+my $WIKI_LANG;
+if ((basename $dump_fn) =~ $PATTERN{dump_fn}) {
+ $WIKI_LANG = $+{wiki_lang};
+ %STRING = get_strings($WIKI_LANG);
+}
+else {
+ croak("$dump_fn is a weird dump file name.");
+}
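
A quick illustration of that check (the filenames are hypothetical): $PATTERN{dump_fn} is anchored only at the start of the string, so it accepts anything whose basename begins with a language code followed by the literal "wiki", and captures the code:

    for my $fn (qw(hewiki-20080728-pages-meta-current.xml enwiki.xml pages.xml)) {
        if ($fn =~ $PATTERN{dump_fn}) {
            say "$fn: lang code $+{wiki_lang}";    # "he", then "en"
        }
        else {
            say "$fn: weird dump file name";       # pages.xml is rejected
        }
    }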
+
+# XXX - bad i18n
+# ISO 9 is mostly good for Russian, and even then its output is not pure ASCII
+my $TRANSLITERATOR = Lingua::Translit->new('ISO 9');
+
+Readonly my %MW_SYNTAX => (
+ 'start_tmpl' => '{{',
+ 'end_tmpl' => '}}',
+ 'start_link' => '[[',
+ 'end_link' => ']]',
+ 'param_sep' => q{|},
+ 'paragraph' => qq{\n\n},
+ 'start_wikitable' => '{|',
+ 'namespace_sep' => q{:},
+);
+
+# This monstrosity basically says: | and optional spaces
+$PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms;
+
+Readonly my @INCLUDE_NAMESPACES => (
+ q{}, # Empty is a specific case
+ 'category',
+);
+
+#
+# Constants for date processing
+#
+
+Readonly my @MONTHS => @STRING{
+ qw(
+ January February March
+ April May June
+ July August September
+ October November December
+ )
+ };
+my $ALT_MONTHS = join $ALT_SEP, @MONTHS;
+my %REV_MONTH;
+Readonly my $LAST_MONTH => 11;
+@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
+
+# XXX Internationalize
+$PATTERN{heb_date} = qr{
+ \A # begin
+ (?<hour>\d{1,2}) # hour
+ : # :
+ (?<min>\d{2}),\s # minute
+ (?<mday>\d{1,2})\s # day of month
+ $STRING{in}? # This preposition appears sometimes
+ (?<mon>$ALT_MONTHS)\s # A name of a month
+ (?<year>\d+?)\s # Year
+ \([A-Z]{3}\) # Three letters in brackets - timezone
+ \z # end
+}xms;
+
+$PATTERN{param} = qr{
+ \A # Beginning of a string
+ \s* # Zero or more space
+ (?: # No capture
+ (?<param_name>\w+) # Word chars - param name
+ \s* # Zero or more space
+ = # =
+ )? # Param name is optional
+ (?<value>.*) # value
+}xms;
+
+$PATTERN{simple_no_iw_check} = qr{
+ \Q$STRING{no_iw}\E # the string may have spaces
+}xmsi;
+
+# XXX HACK Until I get a better regex for matching balanced {{}} ...
+$PATTERN{template} = qr{
+ \A # beginning of string
+ \Q$MW_SYNTAX{start_tmpl}\E # {{
+ .+ # some chars
+ \Q$MW_SYNTAX{end_tmpl}\E # }}
+ \z # end of string
+}xms;
+
+$PATTERN{wikitable} = qr{
+ \A
+ \Q$MW_SYNTAX{start_wikitable}\E
+}xms;
+
+# Redirect in local language.
+# Regular expression mostly copied from
+# Parse::MediaWikiDump::page::redirect
+# TODO: Try to look for the local language redirect keyword in the dump.
+$PATTERN{local_redirect} = qr{
+ \A # Beginning of string (page)
+ \# # a # character
+ $STRING{REDIRECT} # Redirect keyword in local language
+ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
+}xmsi;
+
+$PATTERN{ltr_char} = qr/\P{IsLeftToRight}/xms;
+$PATTERN{true_template} = qr{$RE{balanced}{-parens=>'{}'}}xms; # XXX very bad
+$PATTERN{section_link} = qr{(?<!&)\#}xms;
+$PATTERN{lowercase_link} = qr{\A[[:lower:]]}xms;
+if ($STRING{exclude_lowercase}) {
+ $PATTERN{exclude_lowercase} = qr{\A[$STRING{exclude_lowercase}]}xms;
+}
+$PATTERN{numbered_file} = qr{
+ (?: _ \d*)?
+ \.$WIKITEXT_EXT
+}xms;
+$PATTERN{invalid_filename_char} = qr{[\\\n/:*?"<>|]}xms; # "
+
+# TODO: Check whether it's Neapolitan with its ''
+$PATTERN{character_code_in_link} = qr{
+ (?:
+ [%.] # There are both %C4%B0 and .AA.E0
+ [[:xdigit:]]{2} # 2 hex digits
+ )
+ |
+ (?:
+ &\#\d+; # stuff like Š
+ )
+}xms;
+
+my %LANG_CODE;
+Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt";
+open my $lang_code_file, '<', $LANG_CODE_FN
+ or croak(file_error('opening', $LANG_CODE_FN, 'reading'));
+while (my $line = <$lang_code_file>) {
+ chomp $line;
+ my ($code, $name) = split /\t/xms, $line;
+ $LANG_CODE{$code} = $name;
+}
+close $lang_code_file
+ or croak(file_error('closing', $LANG_CODE_FN, 'reading'));
+
+Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE;
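
The language codes file read above is a plain two-column list, one language per line, with a single tab between the code and the name; for a Hebrew dump it would be named he.language_codes.txt and contain lines shaped like these (hypothetical contents):

    en      English
    de      Deutsch
    fr      French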
+
+# XXX Should use ALT_LANGS, but an efficient way is needed to update
+# lang codes list, so in the meantime it's loose.
+$PATTERN{interwiki_link} = qr{
+ \Q$MW_SYNTAX{start_link}\E
+ (?<lang_code>
+# $ALT_LANGS
+ [a-zA-Z-]+
+ )
+ :
+ (?<foreign_article>
+ .+?
+ )
+ \Q$MW_SYNTAX{end_link}\E
+}xms;
+
+# Lojban allows lowercase articles
+Readonly my @LOWERCASE_LANGS => qw(jbo);
+Readonly my $COOLING_DAYS => 120;
+Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60;
+Readonly my $LATEST_COOLING => time - $COOLING_SECONDS;
+
+my %param_name;
+foreach my $english_param_name (qw(date type)) {
+ $param_name{ get_string($english_param_name) } = $english_param_name;
+}
+
+# TODO: Make smarter, configurable, whatever
+Readonly my $OUT_DIR => 'out';
+if (-d $OUT_DIR) {
+ unlink glob "$OUT_DIR/*";
+}
+else {
+ mkdir $OUT_DIR;
+}
+
+my $dump = Parse::MediaWikiDump::Pages->new($dump_fn);
+
+my $namespaces_alt = join $ALT_SEP,
+ grep { length > 0 } @{ $dump->namespaces_names() };
+$PATTERN{pure_title} = qr{
+ \A
+ (?:
+ (?:$namespaces_alt)
+ $MW_SYNTAX{namespace_sep}
+ )? # Namespace name is optional
+ (?<pure_title>.+)
+ \z
+}xms;
+
+# This is the only currently known value
+# but there could be more in the future
+if ($dump->case() ne 'first-letter') {
+ croak(q{unable to handle any case setting besides 'first-letter'});
+}
+
+my $page_counter;
+
+my %statistics;
+$statistics{count_iw} = [];
+my %namespace_count;
+my %type_count;
+my %found_links;
+
+my $begin_time = time;
+find_iwless();
+my $total_time = time - $begin_time;
+say "total time: $total_time";
+
+INFO("\nSUMMARY");
+foreach my $stat_type (sort keys %statistics) {
+ if (not ref $statistics{$stat_type}) {
+ INFO("$stat_type: $statistics{$stat_type}");
+ }
+}
+
+my $max_iw_index = $#{ $statistics{count_iw} };
+INFO("max_iw_index: $max_iw_index");
+for my $max_iw_place (0 .. $option{max_iw_places}) {
+    my @links =
+        map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] };
+    INFO("# $max_iw_index: " . join_links(\@links, 0));
+
+    # Count down to the next index with a defined list, stopping at the
+    # bottom of the array instead of wrapping into negative indexes
+    while ($max_iw_index > 0
+        and not defined $statistics{count_iw}->[ --$max_iw_index ])
+    {
+    }
+    last if $max_iw_index <= 0;
+}
+
+INFO("\nNAMESPACES");
+foreach my $namespace (sort keys %namespace_count) {
+ INFO("$namespace: $namespace_count{$namespace}");
+}
+INFO("\nTYPES");
+foreach my $type (sort keys %type_count) {
+ INFO("$type: $type_count{$type}");
+}
+
+create_no_iw_pages();
+
+INFO(q{});
+say 'looking for multi links';
+my $begin_multi_links_time = time;
+create_multi_links_pages();
+my $total_multi_links_time = time - $begin_multi_links_time;
+say "total multi links time: $total_multi_links_time";
+
+exit;
+
+sub find_iwless {
+ PAGE:
+ while (my $page = $dump->page()) {
+ $page_counter++;
+ if ($page_counter % $option{page_freq} == 0) {
+ say $page_counter;
+ }
+
+ last PAGE
+ if ($option{stop_after}
+ and $page_counter > $option{stop_after});
+
+ next PAGE
+ if ($page_counter < $option{start_from});
+
+ my $namespace = $page->namespace() || 'main';
+ $namespace_count{$namespace}++;
+
+ # Skipping cases:
+ next PAGE
+ if (is_redirect($page)
+ or not is_in_namespace($page, @INCLUDE_NAMESPACES));
+
+ INFO("\n* processing $page_counter - ", $page->title());
+
+ my $page_text_ref = $page->text();
+
+ # A simple sanity check: is the no_iw template anywhere around here?
+ my $has_tmpl_no_iw =
+ (${$page_text_ref} =~ $PATTERN{simple_no_iw_check});
+
+ # Does the page have interwiki links?
+ # XXX Actually checks only for English
+ my $has_iw = has_interwiki($page);
+
+ if ($has_iw) {
+ INFO("has link to $has_iw");
+ if ($has_tmpl_no_iw) {
+ INFO('has template no_iw. trying to remove ...');
+ remove_tmpl_no_iw($page_text_ref);
+ $statistics{'has both valid interwiki and template'}++;
+ }
+ }
+ else { # doesn't have iw
+ process_iwless_page($page, $has_tmpl_no_iw, $has_iw);
+ }
+ }
+
+ return;
+}
+
+sub process_iwless_page {
+ my (
+ $page, # object ref
+ $has_tmpl_no_iw, # scalar bool
+ $has_iw # scalar bool
+ ) = @_;
+
+ INFO(q(doesn't have iw link.)); # '
+ $statistics{'has no interwiki link'}++;
+
+ # Now we need to search for no_iw templates
+ # and parse their parameters - date and type
+
+ my @found_templates = ();
+
+ my $page_text_ref = $page->text();
+ my $page_title = $page->title();
+
+ # Optimized - doesn't start searching,
+ # if we already know that it's not there
+ if ($has_tmpl_no_iw) {
+ find_templates($page_text_ref, \@found_templates,
+ [ get_string('no_iw') ]);
+ }
+
+ my $found_templates_count = scalar @found_templates;
+ INFO("found templates: $found_templates_count");
+
+ my $template;
+ if ($found_templates_count) {
+
+ # Assume the first one is the right one
+ $template = $found_templates[0];
+ if ($found_templates_count > 1) {
+ WARN('many templates were found');
+ $statistics{'many templates'}++;
+ }
+ else {
+ INFO('good, found one template');
+ $statistics{'one template'}++;
+ }
+ }
+ else {
+ INFO('no templates found');
+ $statistics{'no templates found'}++;
+ }
+
+ if (defined $template) {
+ my $date_str = $template->{params}->{date};
+
+ INFO('has template no_iw. checking cooling date ... ');
+ if (not defined $date_str
+ or cooling_date_passed($date_str))
+ {
+ INFO('cooling date passed, updating to today ...');
+ update_cooling_date($page_text_ref);
+ $statistics{'cooling date passed'}++;
+ }
+ else {
+ INFO(q(cooling date didn't pass.)); # '
+ $statistics{q(cooling date didn't pass)}++; # '
+ }
+ }
+
+ my @all_types = get_all_types($template->{params}->{type}, $page);
+
+ foreach my $type (@all_types) {
+ INFO("adding $page_title to the list as type $type");
+ add_to_no_iw_list($page, $type);
+ $type_count{$type}++;
+ }
+
+ return 1;
+}
+
+sub get_all_types {
+ my ($type_param, $page) = @_;
+
+ $type_param //= q{};
+ strip_whitespace($type_param);
+
+ my @all_types = split $TYPE_SEP, $type_param;
+
+ my $page_title = $page->title();
+ if (is_category($page)) {
+ INFO("$page_title is a category");
+ push @all_types, get_string('category');
+ $statistics{'categories'}++;
+ }
+
+ if (is_disambig($page)) {
+ INFO("$page_title is a disambiguation");
+ push @all_types, get_string('disambig');
+ $statistics{'disambig'}++;
+ }
+
+ # Still nothing?
+ if (not scalar @all_types) {
+ my $other_type = get_string('other');
+ INFO("$page_title doesn't have any type, adding to $other_type");
+ @all_types = ($other_type);
+ $statistics{'automatically added to type other'}++;
+ }
+
+ return @all_types;
+}
+
+sub find_templates {
+ my (
+ $page_text_ref, # string ref
+ $found_templates_ref, # array ref
+ $filter # string array ref
+ ) = @_;
+
+ # A reference to an array with one empty string.
+ # Matching against an empty string will always succeed.
+ $filter //= [q{}];
+
+ # Get all highest-level matches
+ my @matches = (${$page_text_ref} =~ /$PATTERN{true_template}/xmsgo);
+
+ MATCH:
+ foreach my $next_match (@matches) {
+ if ($next_match !~ $PATTERN{template}) {
+            INFO(q(I thought it was a template, but it was:));
+ if ($next_match =~ $PATTERN{wikitable}) {
+ INFO('a wikitable');
+ }
+ else {
+ INFO("something else: $next_match");
+ }
+ INFO(q{});
+ next MATCH;
+ }
+
+ foreach my $next_filter (@{$filter}) {
+
+ # XXX Matches anywhere in the template.
+ # It probably should match the template name.
+ # Also - it's case-insensitive which is very wrong
+ # but kinda useful.
+ if ($next_match =~ /\Q$next_filter/xmsi) {
+
+ # N.B.: parse_template calls find_templates() recursively
+ my $parsed_template =
+ parse_template(\$next_match, [qw(date type)], $filter);
+ push @{$found_templates_ref}, $parsed_template;
+ }
+ }
+ }
+
+ return $found_templates_ref;
+}
+
+sub parse_template {
+ my (
+ $template, # string ref
+ $default_param_names, # string array ref
+ $subtmpl_filter, # string array ref
+ ) = @_;
+
+ # %parsed_template:
+ # {text} - string ref; might be big
+ # {name} - string
+ # {params} - hash ref
+ # {subtemplates} - array ref
+ my (%parsed_template, %parsed_params, @clauses);
+
+ $parsed_template{text} = strip_tmpl_curlies(${$template});
+
+ # First string of the split is the template name,
+ # the rest is the params
+ ($parsed_template{name}, @clauses) =
+ (split $PATTERN{param_sep}, ${ $parsed_template{text} });
+
+ my $param_counter = 0;
+ foreach my $clause (@clauses) {
+ if ($clause =~ $PATTERN{param}) {
+ #<<< no perltidy
+ my ($name, $value) = @+{ qw(param_name value) };
+ #>>>
+ if (defined $name) {
+ if (defined $param_name{$name}) {
+ $name = $param_name{$name};
+ }
+ }
+ else {
+
+ # Get next default param name.
+ # If a default name is not defined, just use the number
+ $name = $default_param_names->[$param_counter]
+ || $param_counter;
+ }
+ $parsed_params{$name} = $value;
+ }
+ else {
+            my $error_msg = "Weird - $clause doesn't look like a param";
+ INFO($error_msg);
+ cluck($error_msg);
+ $statistics{'weird param'}++;
+ }
+ $param_counter++;
+ }
+ $parsed_template{params} = \%parsed_params;
+
+    # Possible recursion. Initialize the subtemplates list first: if an
+    # undef value were passed in, find_templates() would autovivify only
+    # its own copy of the reference and the results would be lost.
+    $parsed_template{subtemplates} //= [];
+    find_templates($parsed_template{text}, $parsed_template{subtemplates},
+        $subtmpl_filter);
+
+ return \%parsed_template;
+}
+
+sub parse_date {
+ my ($date_str) = @_;
+
+ return if (not defined $date_str);
+
+ if ($date_str =~ $PATTERN{heb_date}) {
+ INFO("found a valid date: $date_str");
+ my %parsed_date = (
+ 'sec' => 0, # useful for timelocal
+ );
+ foreach my $date_part (qw(hour min mday mon year)) {
+ $parsed_date{$date_part} = $+{$date_part};
+ }
+ $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} };
+
+ # strip leading zeros
+ foreach (values %parsed_date) {
+ s{
+ \A
+ 0+
+ (?<number>\d+)
+ }
+ {$+{number}}xms;
+ }
+ return \%parsed_date;
+ }
+ else {
+ INFO("invalid date: $date_str");
+ }
+
+ # Returns undef for an invalid date
+ return;
+}
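
For example, with English month names loaded into the strings table, a standard MediaWiki signature timestamp parses as follows (a sketch; the real month names come from get_strings for the wiki's language):

    my $date_ref = parse_date('13:35, 28 July 2008 (UTC)');
    # $date_ref = { sec => 0, min => 35, hour => 13,
    #               mday => 28, mon => 6, year => 2008 }   # mon is 0-based
    parse_date('not a date');    # logs "invalid date" and returns undef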
+
+sub strip_tmpl_curlies {
+ my ($template) = @_;
+ for ($template) {
+ s{
+ \A
+ \Q$MW_SYNTAX{start_tmpl}\E
+ }{}xms;
+ s{
+ \Q$MW_SYNTAX{end_tmpl}\E
+ \z
+ }{}xms;
+ }
+ return \$template;
+}
+
+# no arg unpacking for simplicity and performance
+sub strip_whitespace { ## no critic Subroutines::RequireArgUnpacking
+ for (@_) {
+ s/\A\s*//xms;
+ s/\s*\z//xms;
+ }
+ return;
+}
+
+sub has_interwiki {
+ my ($page) = @_;
+
+ my $page_title = $page->title();
+ my $page_text = ${ $page->text() };
+ study $page_text;
+
+ my %iw_links;
+ my %special_cases;
+
+ while ($page_text =~ /$PATTERN{interwiki_link}/xmsgo) {
+ my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)};
+ if (defined $iw_links{$lang_code}) {
+ $special_cases{double_links}->{$lang_code} = q{};
+ }
+ else {
+ $iw_links{$lang_code} = $foreign_article;
+ }
+
+ # A # sign not after an &.
+ # After an & it's probably a character number.
+ if ($foreign_article =~ $PATTERN{section_link}) {
+ $special_cases{section_links}->{$lang_code} = q{};
+ }
+
+ # Char codes are common in section links, so there's no
+ # need to show them again
+ elsif ($foreign_article =~ $PATTERN{character_code_in_link}) {
+ $special_cases{charnumber_links}{$lang_code} = q{};
+ }
+
+ # Lowercase links
+ if ( (not $lang_code ~~ @LOWERCASE_LANGS)
+ and ($foreign_article =~ $PATTERN{lowercase_link}))
+ {
+ my $include_lowercase_link = 1;
+ if (defined $STRING{exclude_lowercase}
+ and $foreign_article =~ $PATTERN{exclude_lowercase})
+ {
+ $include_lowercase_link = 0;
+ }
+
+ if ($include_lowercase_link) {
+ $special_cases{lowercase_links}{$lang_code} = q{};
+ }
+ }
+
+ $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{};
+ }
+
+ my @all_langs = keys %iw_links;
+ my $count_iw = scalar @all_langs;
+ if ($count_iw) {
+ if (not defined $statistics{count_iw}->[$count_iw]) {
+ $statistics{count_iw}->[$count_iw] = [];
+ }
+ push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
+ }
+ INFO("iw link count for $page_title: $count_iw");
+
+ for my $special_case_name (keys %special_cases) {
+ if (scalar %{ $special_cases{$special_case_name} }) {
+ special_cases_file($special_case_name,
+ $special_cases{$special_case_name}, $page);
+ }
+ }
+
+ # XXX Still very stupid, but getting better
+ if (defined $iw_links{en}) {
+ return 'en';
+ }
+
+ return q{};
+}
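
To make the captures concrete, here is how $PATTERN{interwiki_link} behaves on a made-up snippet of wikitext:

    my $text = 'Intro. [[en:Example]] then [[de:Beispiel]] and [[Category:Foo]]';
    while ($text =~ /$PATTERN{interwiki_link}/xmsg) {
        say "$+{lang_code} -> $+{foreign_article}";
    }
    # Prints:
    #   en -> Example
    #   de -> Beispiel
    #   Category -> Foo
    # The last line shows how the loose [a-zA-Z-]+ fallback over-matches;
    # the commented-out $ALT_LANGS alternation would reject it.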
+
+sub special_cases_file {
+ my ($special_case_name, $special_cases_ref, $page) = @_;
+ my $special_case_langs = join q{, }, sort keys %{$special_cases_ref};
+ my $special_case_fn = make_type_fn($special_case_name, 1);
+ if (not -e $special_case_fn) {
+ append_to_file($special_case_fn, $special_case_name);
+ }
+ my $page_title = $page->title();
+ my $link = make_link($page_title);
+ my $line =
+ $link
+ . " ($special_case_langs)"
+ . $FIELD_SEP
+ . get_sort_title($page_title);
+
+ append_to_file($special_case_fn, $line);
+ return;
+}
+
+sub remove_tmpl_no_iw {
+ my ($params) = @_;
+ INFO( "Supposed to remove the no_iw template now, but ...\n"
+ . 'This sub is a stub. You can help Wikipedia by expanding it!');
+
+ return 0;
+}
+
+sub cooling_date_passed {
+ my ($date_string) = @_;
+
+ # $date is a hash ref
+ my $date_ref = parse_date($date_string);
+ if (not defined $date_ref) {
+ INFO('in cooling_date_passed invalid date');
+ return 1;
+ }
+
+ my @page_times = @{$date_ref}{qw(sec min hour mday mon year)};
+ INFO("page times: @page_times");
+
+ my $pageseconds = timelocal(@page_times);
+ return $pageseconds < $LATEST_COOLING;
+}
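
In other words, with $COOLING_DAYS set to 120 above, a date stamp older than about four months counts as passed, and so does anything parse_date rejects:

    cooling_date_passed('13:35, 28 July 2008 (UTC)');  # true once 120 days have gone by
    cooling_date_passed('garbage');                    # true: invalid dates always pass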
+
+sub update_cooling_date {
+ my ($params) = @_;
+
+ INFO( "Supposed to update cooling date now, but ...\n"
+ . "This sub is a stub. You can help Wikipedia by expanding it!\n"
+ );
+
+ return 0;
+}
+
+# Just a debugging thingie
+sub print_parsed_template {
+ my ($template_ref) = @_; # hash ref
+
+ INFO('text:');
+ INFO(${ $template_ref->{text} });
+ INFO('name:');
+ INFO($template_ref->{name});
+ INFO('params:');
+ foreach my $next_param (sort keys %{ $template_ref->{params} }) {
+ INFO("$next_param: $template_ref->{params}->{$next_param}");
+ }
+
+ if (defined $template_ref->{subtemplates}) {
+ INFO("subtemplates: $template_ref->{subtemplates}");
+ }
+
+ return;
+}
+
+sub get_sort_title {
+ my ($page_title) = @_;
+
+ my $sort_title;
+ if ($page_title =~ $PATTERN{pure_title}) {
+ $sort_title = $+{pure_title};
+ }
+ else {
+ INFO( 'Something badly weird happened - '
+ . "$page_title has no pure title");
+ croak('A page has no pure title');
+ }
+
+ return $sort_title;
+}
+
+sub add_to_no_iw_list {
+ my (
+ $page, # ref
+ $page_type, # string
+ ) = @_;
+
+ my $page_title = $page->title();
+ INFO("Adding $page_title to no_iw_list");
+
+ my $sort_title = get_sort_title($page_title);
+ INFO("full title: $page_title");
+ INFO("sort title: $sort_title");
+
+ my $unsorted_type_fn = make_type_fn($page_type, 1);
+ if (not -e $unsorted_type_fn) {
+ append_to_file($unsorted_type_fn, $page_type);
+ }
+ my $link = make_link($page_title);
+ INFO("link to page: $link");
+ my $line = $link . $FIELD_SEP . $sort_title;
+
+ append_to_file($unsorted_type_fn, $line);
+
+ return;
+}
+
+sub make_link {
+ my ($page_title) = @_;
+
+ my $link_to_page =
+ $MW_SYNTAX{start_link}
+ . $MW_SYNTAX{namespace_sep}
+ . $page_title
+ . $MW_SYNTAX{end_link};
+
+ if ($option{rtl}) {
+ if ($page_title =~ $PATTERN{ltr_char}) {
+ $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm};
+ }
+ }
+
+ return $link_to_page;
+}
+
+sub create_no_iw_pages {
+ my ($params) = @_;
+
+ INFO('creating no_iw pages');
+
+ # Run over page types
+ UNSORTED_TYPE_FN:
+ foreach my $unsorted_type_fn (glob "$OUT_DIR/*$UNSORTED_EXT") {
+ my %all_pages_in_type = ();
+ open my $unsorted_type_file, '<', $unsorted_type_fn
+ or croak(file_error('opening', $unsorted_type_fn, 'reading'));
+ my @lines = <$unsorted_type_file>;
+ close $unsorted_type_file
+ or croak(file_error('closing', $unsorted_type_fn, 'reading'));
+
+ my $type_name = shift @lines;
+ chomp $type_name;
+ foreach my $line (@lines) {
+ chomp $line;
+ my ($page_title, $sort_title) = split $PATTERN{field_sep}, $line;
+ my $sort_letter = get_sort_letter($sort_title);
+ $all_pages_in_type{$sort_letter} //= [];
+ push @{ $all_pages_in_type{$sort_letter} }, $page_title;
+ }
+ write_sorted_pages($type_name, \%all_pages_in_type);
+ }
+ return;
+}
+
+sub write_sorted_pages {
+ my ($type_name, $type_tree_ref) = @_;
+
+ my $type_fn = make_type_fn($type_name); # XXX
+
+ my $section_counter = 0;
+ my $page = q{};
+ my $file_number = 1;
+
+ LETTER:
+ foreach my $next_letter (sort keys %{$type_tree_ref}) {
+ my @all_links_in_letter =
+ sort @{ $type_tree_ref->{$next_letter} };
+
+ my $links_count = scalar @all_links_in_letter;
+ my $sections =
+ ($links_count - 1) / $option{max_links_per_section} + 1;
+ my $links_per_section = $links_count / $sections;
+
+ SECTION:
+ foreach my $section_num (1 .. $sections) {
+ if ($section_counter == $option{max_sections_per_page}) {
+ write_page(\$page, \$type_fn, $file_number++);
+ $section_counter = 0;
+ undef $page; # XXX Trying to free memory
+ $page = q{};
+ }
+ elsif ($section_counter) {
+ $page .= $MW_SYNTAX{paragraph};
+ }
+ $section_counter++;
+ my $title = $next_letter;
+ if ($sections > 1) {
+ $title .= " $section_num";
+ }
+ $page .= mw_heading($title);
+
+ my $first_link = ($section_num - 1) * $links_per_section;
+ my $last_link =
+ ($section_num == $sections)
+ ? $links_count - 1
+ : $first_link + $links_per_section - 1;
+ my $links = join_links(
+ [ @all_links_in_letter[ $first_link .. $last_link ] ]);
+ $page .= $links;
+ }
+ undef @all_links_in_letter; # XXX Trying to free memory
+ }
+
+ # The page may be empty at this point
+ if ($page) {
+ write_page(\$page, \$type_fn, $file_number++);
+ }
+ undef $page;
+
+ return;
+}
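
The sectioning arithmetic above leans on the "use integer" pragma from the top of the file; a worked example with the defaults (max_links_per_section = 100):

    # 250 links under one letter:
    #   $sections          = (250 - 1) / 100 + 1 = 3    (integer division)
    #   $links_per_section = 250 / 3              = 83
    # Sections 1 and 2 take links 0..82 and 83..165; the $last_link special
    # case hands the remainder, 166..249, to the final section.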
+
+sub write_page {
+ my ($page_ref, $type_fn_ref, $file_number) = @_;
+
+ my $pretty_file_number = sprintf '%03d', $file_number;
+ ${$type_fn_ref} =~ s{
+ $PATTERN{numbered_file}
+ }
+ {_$pretty_file_number.$WIKITEXT_EXT}xmso;
+ INFO("creating file ${$type_fn_ref}");
+ append_to_file(${$type_fn_ref}, ${$page_ref});
+
+ return;
+}
+
+sub mw_heading {
+ my (
+ $text, # string
+ $level # number
+ ) = @_;
+
+ $level //= 2;
+ my $level_marker = q{=} x $level;
+
+ # Line ending is mandatory
+ return "$level_marker $text $level_marker\n";
+}
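
A hypothetical call, for clarity:

    mw_heading('History');       # returns "== History ==\n"
    mw_heading('History', 3);    # returns "=== History ===\n"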
+
+# Custom Unicode character property for finding characters.
+# The custom is to give those subroutines CamelCase names.
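+# N.B.: despite its name, the set below actually lists the characters of a
+# right-to-left wiki (Hebrew letters, spaces and punctuation); the script
+# matches the complement, \P{IsLeftToRight} (see $PATTERN{ltr_char}), to
+# detect left-to-right characters in a title.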
+sub IsLeftToRight { ## no critic (NamingConventions::ProhibitMixedCaseSubs)
+ return <<'END';
++utf8::InHebrew
++utf8::IsSpace
++utf8::IsPunct
+END
+}
+
+sub is_redirect {
+ my ($page) = @_;
+ my $page_title = $page->title();
+ my $page_text_ref = $page->text();
+
+ if ($page->redirect()) {
+ INFO("\nEnglish redirect: $page_title");
+ return 1;
+ }
+ if (${$page_text_ref} =~ $PATTERN{local_redirect}) {
+ I...
[truncated message content] |