perlwikibot-svn Mailing List for Perl MediaWiki Robot
Status: Pre-Alpha
Brought to you by:
rotemliss
You can subscribe to this list here.
2006 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(19) |
Oct
|
Nov
|
Dec
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
2007 |
Jan
|
Feb
|
Mar
(4) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2008 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(20) |
Aug
(12) |
Sep
|
Oct
|
Nov
(3) |
Dec
|
2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(11) |
Oct
(1) |
Nov
|
Dec
|
2010 |
Jan
|
Feb
|
Mar
(6) |
Apr
|
May
(1) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
(2) |
Dec
|
From: <am...@us...> - 2010-11-06 15:20:27
|
Revision: 93 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=93&view=rev Author: amire80 Date: 2010-11-06 15:20:21 +0000 (Sat, 06 Nov 2010) Log Message: ----------- Import carp. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:08:56 UTC (rev 92) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:20:21 UTC (rev 93) @@ -38,7 +38,7 @@ # Standard library # These modules should come installed with Perl use English qw(-no_match_vars); -use Carp qw(croak cluck); +use Carp qw(carp croak cluck); use Time::Local; use Getopt::Long; use Data::Dumper; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-11-06 15:09:02
|
Revision: 92 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=92&view=rev Author: amire80 Date: 2010-11-06 15:08:56 +0000 (Sat, 06 Nov 2010) Log Message: ----------- Not forcing SVN version number. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-05-26 15:36:59 UTC (rev 91) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-11-06 15:08:56 UTC (rev 92) @@ -74,10 +74,13 @@ Date => '$Date$', ); #>>> -our $FULL_VERSION = - ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) - ? "$VERSION.$+{revision_num}" - : croak(q(Something is wrong with SVN revision number)); +our $FULL_VERSION = $VERSION; +if ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) { + $FULL_VERSION .= ".$+{revision_num}"; +} +else { + carp(q(Something is wrong with SVN revision number, but never mind)); +} my $WIKITEXT_EXT = 'wiki.txt'; my $OUT_DIR = 'out'; @@ -1195,10 +1198,10 @@ carp("$lang_code is $lang_reftype, not hashref!"); next LANG_CODE; } - my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; - say 'sort keys found_links lang_code'; # XXX + my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; + say 'sort keys found_links lang_code'; # XXX my @foreign_articles = sort keys %{ $found_links{$lang_code} }; - say 'format_link_table lang_code scalar foreign_articles'; # XXX + say 'format_link_table lang_code scalar foreign_articles'; # XXX say format_link_table($lang_code, scalar @foreign_articles); FOREIGN_ARTICLE: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-05-26 15:37:06
|
Revision: 91 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=91&view=rev Author: amire80 Date: 2010-05-26 15:36:59 +0000 (Wed, 26 May 2010) Log Message: ----------- Strings for Masry (arz). Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/upload_iw.pl Added Paths: ----------- trunk/no-interwiki/arz.language_codes.txt trunk/no-interwiki/arz.strings.txt Property Changed: ---------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/upload_iw.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-05-26 15:36:59 UTC (rev 91) @@ -16,15 +16,16 @@ GET_STRING => [qw( get_strings get_string )], FILE_UTIL => [qw( file_error append_to_file read_file )], NAMESPACE => [qw( init_namespaces namespace )], + DUMP => [qw( load_dump )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; our $VERSION = '0.01'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) - Revision => '$Revision: 0 $', - HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $', - Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', + Revision => '$Revision$', + HeadURL => '$HeadURL$', + Date => '$Date$', ); #>>> @@ -134,5 +135,13 @@ return $namespace || get_string('article space'); } +sub load_dump { + my ($dump_fn) = @_; + + my $dump = MediaWiki::DumpFile::Pages->new($dump_fn); + + return $dump; +} + 1; Property changes on: trunk/no-interwiki/MediaWiki/Toolkit.pm ___________________________________________________________________ Added: svn:keywords + Revision HeadURL Date Added: trunk/no-interwiki/arz.language_codes.txt =================================================================== --- trunk/no-interwiki/arz.language_codes.txt (rev 0) +++ trunk/no-interwiki/arz.language_codes.txt 2010-05-26 15:36:59 UTC (rev 91) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/arz.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/arz.strings.txt =================================================================== --- trunk/no-interwiki/arz.strings.txt (rev 0) +++ trunk/no-interwiki/arz.strings.txt 2010-05-26 15:36:59 UTC (rev 91) @@ -0,0 +1,37 @@ +# months +January يناير +February فبراير +March مارس +April ابريل +May مايو +June يونيه +July يوليه +August اغسطس +September سبتمبر +October اكتوبر +November نوفمبر +December ديسمبر + +no_iw no_iw +disambig توضيح +template قالب + +date تاريخ +type نوع + +# MW specials +REDIRECT تحويل + +# Namespaces +User مستخدم +User talk نقاش المستخدم +Image ملف +Portal بوابة +Category تصنيف +article space (رئيسى) + +# Other +other متفرقات +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-05-26 15:36:59 UTC (rev 91) @@ -62,6 +62,7 @@ :FIELD_SEP :GET_STRING :FILE_UTIL :NAMESPACE + :DUMP ) ); @@ -361,7 +362,7 @@ } } -my $the_dump = load_dump(); +my $the_dump = load_dump($dump_fn); my %NAMESPACES = $the_dump->namespaces(); my @NAMESPACES = values %NAMESPACES; @@ -417,14 +418,6 @@ exit; -sub load_dump { - my $dump; - - $dump = MediaWiki::DumpFile::Pages->new($dump_fn); - - return $dump; -} - sub next_page { my ($dump) = @_; my $page_ref = eval { $dump->next(); }; @@ -1203,11 +1196,14 @@ next LANG_CODE; } my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; + say 'sort keys found_links lang_code'; # XXX my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say 'format_link_table lang_code scalar foreign_articles'; # XXX say format_link_table($lang_code, scalar @foreign_articles); FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { + say 'local_articles = keys found_links lang_code'; my @local_articles = keys %{ $found_links{$lang_code}->{$foreign_article} }; @@ -1219,6 +1215,7 @@ } if (scalar @local_articles > 1) { + say 'links = join sort map make_link'; my $links = join q{ | }, sort map { make_link($_) } keys %{ $found_links{$lang_code}->{$foreign_article} }; @@ -1226,6 +1223,7 @@ make_link($lang_code . $MW_SYNTAX{namespace_sep} . $foreign_article); + say 'append_to_file filename'; append_to_file($filename, "* '''$foreign_title''' - $links"); } } Modified: trunk/no-interwiki/upload_iw.pl =================================================================== --- trunk/no-interwiki/upload_iw.pl 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/upload_iw.pl 2010-05-26 15:36:59 UTC (rev 91) @@ -26,9 +26,9 @@ our $VERSION = '0.01'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) - Revision => '$Revision: 0 $', - HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $', - Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', + Revision => '$Revision$', + HeadURL => '$HeadURL$', + Date => '$Date$', ); #>>> Property changes on: trunk/no-interwiki/upload_iw.pl ___________________________________________________________________ Added: svn:keywords + Revision HeadURL Date This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 18:42:19
|
Revision: 90 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=90&view=rev Author: amire80 Date: 2010-03-26 18:42:13 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Ported to MediaWiki::DumpFile, the new version of Parse::MediaWikiDump. Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:31:20 UTC (rev 89) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90) @@ -15,6 +15,7 @@ FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], GET_STRING => [qw( get_strings get_string )], FILE_UTIL => [qw( file_error append_to_file read_file )], + NAMESPACE => [qw( init_namespaces namespace )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; @@ -111,5 +112,27 @@ return $text; } +my @NAMESPACES; + +sub init_namespaces { + @NAMESPACES = @_; + return; +} + +my $NAMESPACE_RE = qr/\A (?<namespace>[^:]+) : .* /xms; + +sub namespace { + my ($title) = @_; + my $namespace = q(); + + if ($title =~ $NAMESPACE_RE) { + if ($+{namespace} ~~ @NAMESPACES) { + $namespace = $+{namespace}; + } + } + + return $namespace || get_string('article space'); +} + 1; Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 15:31:20 UTC (rev 89) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90) @@ -54,12 +54,15 @@ use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.91; # Earlier versions have a different API +use MediaWiki::DumpFile::Pages; + use MediaWiki::Toolkit ( qw( :FIELD_SEP - :GET_STRING :FILE_UTIL) + :GET_STRING :FILE_UTIL + :NAMESPACE + ) ); our $VERSION = '0.2.1'; @@ -94,7 +97,7 @@ Log::Log4perl->easy_init( { level => $INFO, # print everything - file => ":utf8>$LOG_FN", # utf is important + file => ":utf8>$LOG_FN", # utf8 is important layout => '%m%n', # No need to print the date } ); @@ -360,8 +363,11 @@ my $the_dump = load_dump(); -my $namespaces_alt = join $ALT_SEP, - grep { length > 0 } @{ $the_dump->namespaces_names() }; +my %NAMESPACES = $the_dump->namespaces(); +my @NAMESPACES = values %NAMESPACES; +init_namespaces(@NAMESPACES); +my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @NAMESPACES; + my $PURE_TITLE_RE = qr{ \A (?: @@ -414,16 +420,11 @@ sub load_dump { my $dump; - $dump = Parse::MediaWikiDump::Pages->new($dump_fn); + $dump = MediaWiki::DumpFile::Pages->new($dump_fn); return $dump; } -sub namespace { - my ($page) = @_; - return $page->namespace() || get_string('article space'); -} - sub next_page { my ($dump) = @_; my $page_ref = eval { $dump->next(); }; @@ -447,9 +448,9 @@ last PAGE; } - my $page_text_ref = $page_ref->text(); + my $page_text = $page_ref->revision()->text(); - if (${$page_text_ref} !~ /\S/xmsi + if ($page_text !~ /\S/xmsi and not is_in_namespace($page_ref, 'User', 'User talk')) { special_cases_file('empty_page', $page_ref); @@ -457,7 +458,7 @@ if ( $page_counter < $option{start_from} - or not defined ${$page_text_ref} # must be tested before redirect + or not defined $page_text # must be tested before redirect or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) or is_redirect($page_ref) ) @@ -465,11 +466,11 @@ next PAGE; } - my $page_namespace = namespace($page_ref); + my $page_title = $page_ref->title(); + + my $page_namespace = namespace($page_title); $namespace_count{$page_namespace}++; - my $page_title = $page_ref->title(); - INFO("\n* processing $page_counter - ", $page_title); # TODO: Be more precise here. @@ -482,8 +483,7 @@ } # A simple sanity check: is the no_iw template anywhere around here? - my $has_template_no_iw = - (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); + my $has_template_no_iw = ($page_text =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English @@ -493,7 +493,7 @@ INFO("has link to $has_iw"); if ($has_template_no_iw) { INFO('has template no_iw. trying to remove ...'); - remove_template_no_iw($page_text_ref); + remove_template_no_iw($page_text); $statistics{'has both valid interwiki and template'}++; special_cases_file('outdated_template', $page_ref); } @@ -513,20 +513,21 @@ $has_iw # scalar bool ) = @_; + my $page_title = $page_ref->title(); INFO(q(does not have iw link.)); - $statistics{'has no interwiki link'}->{ namespace($page_ref) }++; + $statistics{'has no interwiki link'}->{ namespace($page_title) }++; # Now we need to search for no_iw templates # and parse their parameters - date and type my @found_templates = (); - my $page_text_ref = $page_ref->text(); + my $page_text = $page_ref->revision()->text(); # Optimized - does not start searching, # if we already know that it is not there if ($has_template_no_iw) { - find_templates($page_text_ref, \@found_templates, + find_templates(\$page_text, \@found_templates, [ get_string('no_iw') ]); } @@ -566,7 +567,7 @@ } elsif (cooling_date_passed($date_ref)) { INFO('cooling date passed, updating to today ...'); - update_cooling_date($page_text_ref); + update_cooling_date($page_text); $statistics{'cooling date passed'}++; } else { @@ -583,7 +584,7 @@ my @all_types = get_all_types($template->{params}->{type}, $page_ref); foreach my $type (@all_types) { - INFO('adding ' . $page_ref->title() . " to the list as type $type"); + INFO('adding ' . $page_title . " to the list as type $type"); add_to_no_iw_list($page_ref, $type); $type_count{$type}++; } @@ -792,9 +793,9 @@ my ($page) = @_; my $page_title = $page->title(); - my $page_text = ${ $page->text() }; # XXX + my $page_text = $page->revision()->text(); - study $page_text; # XXX + study $page_text; # XXX my %iw_links; my %special_cases; @@ -1135,9 +1136,9 @@ return 'English'; } - my $page_text_ref = $page->text(); + my $page_text = $page->revision()->text(); - if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { + if ($page_text =~ $LOCAL_REDIRECT_RE) { return 'local'; } @@ -1147,7 +1148,8 @@ sub is_in_namespace { my ($page, @namespaces) = @_; - return namespace($page) ~~ [ map { get_string($_) } @namespaces ]; + return namespace($page->title()) ~~ + [ map { get_string($_) } @namespaces ]; } sub is_category { @@ -1158,8 +1160,8 @@ sub is_disambig { my ($page) = @_; - my $found_templates = - find_templates($page->text(), [], [ get_string('disambig') ]); + my $found_templates = find_templates(\$page->revision()->text(), + [], [ get_string('disambig') ]); return scalar @{$found_templates}; } @@ -1497,7 +1499,7 @@ =head2 unable to handle any case setting besides 'first-letter' Something is weird with the dump. See the documentation of -L<Parse::MediaWikiDump> and MediaWiki. +L<MediaWiki::DumpFile> and MediaWiki. =head2 A page has no pure title @@ -1551,7 +1553,7 @@ =over -=item * C<Parse::MediaWikiDump> +=item * C<MediaWiki::DumpFile> This module is used for reading pages from the XML dump. @@ -1671,7 +1673,7 @@ =item * Statistics and multi links are just slapped to the log. =item * At least some of the code can be rewritten as classes that inherit -from L<Parse::MediaWikiDump>. +from L<MediaWiki::DumpFile>. =back This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 15:31:29
|
Revision: 89 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=89&view=rev Author: amire80 Date: 2010-03-26 15:31:20 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Thanks to almut and ikegami on PerlMonks. Modified Paths: -------------- trunk/no-interwiki/upload_iw.pl Modified: trunk/no-interwiki/upload_iw.pl =================================================================== --- trunk/no-interwiki/upload_iw.pl 2010-03-26 15:22:02 UTC (rev 88) +++ trunk/no-interwiki/upload_iw.pl 2010-03-26 15:31:20 UTC (rev 89) @@ -80,7 +80,12 @@ say Dumper(\@filenames); foreach my $filename (@filenames) { - my $pagename = Encode::decode('UTF-8', "$filename"); + # This was tested on Ubuntu. Other systems may have other encodings + # for file names. Test in a sandbox before using it on a live Wikipedia, + # otherwise pages with gibberish titles may be created. + # Thanks to almut and ikegami on PerlMonks: + # http://www.perlmonks.org/?node_id=830948 + my $pagename = Encode::decode('UTF-8', $filename); for ($pagename) { s/\.$INPUT_EXTENSION\z//xms; s/\A$dirname//xms; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 15:22:09
|
Revision: 88 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=88&view=rev Author: amire80 Date: 2010-03-26 15:22:02 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Adding upload_iw.pl. Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/ru.strings.txt Added Paths: ----------- trunk/no-interwiki/upload_iw.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:22:02 UTC (rev 88) @@ -14,7 +14,7 @@ our %EXPORT_TAGS = ( FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], GET_STRING => [qw( get_strings get_string )], - FILE_UTIL => [qw( file_error append_to_file )], + FILE_UTIL => [qw( file_error append_to_file read_file )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; @@ -81,24 +81,35 @@ } sub file_error { - my ($operation, $fn, $access_type) = @_; - my $string = "error $operation $fn, $access_type: $OS_ERROR"; + my ($operation, $filename, $access_type) = @_; + my $string = "error $operation $filename, $access_type: $OS_ERROR"; return $string; } sub append_to_file { - my ($fn, $line) = @_; + my ($filename, $line) = @_; - open my $file, '>>', $fn - or croak(file_error('opening', $fn, 'appending')); - + open my $file, '>>', $filename + or croak(file_error('opening', $filename, 'appending')); say {$file} ($line // q{}); # / - close $file - or croak(file_error('closing', $fn, 'appeding')); + or croak(file_error('closing', $filename, 'appeding')); return; } +sub read_file { + my ($filename) = @_; + + local $INPUT_RECORD_SEPARATOR = undef; + open my $file, '<', $filename + or croak(file_error('opening', $filename, 'reader')); + my $text = <$file>; + close $file + or croak(file_error('closing', $filename, 'reading')); + + return $text; +} + 1; Modified: trunk/no-interwiki/ru.strings.txt =================================================================== --- trunk/no-interwiki/ru.strings.txt 2010-03-26 14:03:26 UTC (rev 87) +++ trunk/no-interwiki/ru.strings.txt 2010-03-26 15:22:02 UTC (rev 88) @@ -30,6 +30,12 @@ Category Категория article space (статьи) +# pages +project prefix Википедия:Проект:Интервики/Виды/Категории + +# summaries +updating list of pages without interwiki обновление списка страниц без интервики + # Other other другое rlm {{כ}} Added: trunk/no-interwiki/upload_iw.pl =================================================================== --- trunk/no-interwiki/upload_iw.pl (rev 0) +++ trunk/no-interwiki/upload_iw.pl 2010-03-26 15:22:02 UTC (rev 88) @@ -0,0 +1,114 @@ +#!/usr/bin/perl + +use 5.010; + +use strict; +use warnings; +use open ':encoding(utf8)'; +use utf8; + +use English qw(-no_match_vars); +use Carp qw(croak cluck); +use Getopt::Long; +use Data::Dumper; +use Encode; + +use Readonly; + +use MediaWiki::API; + +use MediaWiki::Toolkit ( + qw( + :GET_STRING + :FILE_UTIL) +); + +our $VERSION = '0.01'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) + Revision => '$Revision: 0 $', + HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $', + Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', +); +#>>> + +Readonly my $INPUT_EXTENSION => 'wiki.txt'; +Readonly my $INTERVAL_BETWEEN_EDITS => 5; + +my %option = ( + help => 0, + usage => 0, + version => 0, +); + +my $valid_options = GetOptions( + 'langcode=s' => \$option{langcode}, + 'username=s' => \$option{username}, + 'password=s' => \$option{password}, + 'help' => \$option{help}, + 'usage' => \$option{usage}, + 'version' => \$option{version}, +); + +if (not $valid_options) { + croak('Invalid command line options.'); +} + +for my $required_option (qw(langcode username password)) { + if (not $option{$required_option}) { + croak "option $required_option is required"; + } +} + +get_strings($option{langcode}); + +my $mw = MediaWiki::API->new(); +$mw->{config}->{api_url} = "http://$option{langcode}.wikipedia.org/w/api.php"; + +$mw->login( + { + lgname => $option{username}, + lgpassword => $option{password}, + } +) or croak $mw->{error}->{code} . ': ' . $mw->{error}->{details}; + +my $page_prefix = get_string('project prefix'); + +my $dirname = "./out.$option{langcode}/"; +my @filenames = glob $dirname . get_string('Category') . "*.$INPUT_EXTENSION"; + +say Dumper(\@filenames); + +foreach my $filename (@filenames) { + my $pagename = Encode::decode('UTF-8', "$filename"); + for ($pagename) { + s/\.$INPUT_EXTENSION\z//xms; + s/\A$dirname//xms; + } + $pagename = "$page_prefix/$pagename"; + + my $page = $mw->get_page({ title => $pagename }); + if ($page->{missing}) { + say "page $pagename is missing, trying to create"; + } + + say "uploading to $pagename"; + + $mw->edit( + { + action => 'edit', + title => $pagename, + summary => get_string('updating list of pages without interwiki'), + basetimestamp => $page->{timestamp}, + text => read_file($filename), + }, + { skip_encoding => 1, } + ) or croak $mw->{error}->{codie} . ': ' . $mw->{error}->{details}; + + sleep $INTERVAL_BETWEEN_EDITS; +} + +exit; + +__END__ + Property changes on: trunk/no-interwiki/upload_iw.pl ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 14:03:33
|
Revision: 87 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=87&view=rev Author: amire80 Date: 2010-03-26 14:03:26 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Starting very naive refactoring. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/MediaWiki/ trunk/no-interwiki/MediaWiki/Toolkit.pm Added: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm (rev 0) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87) @@ -0,0 +1,104 @@ +package MediaWiki::Toolkit; + +use 5.010; + +use strict; +use warnings; +use integer; +use utf8; +use open ':encoding(utf8)'; + +use English qw(-no_match_vars); + +use base 'Exporter'; +our %EXPORT_TAGS = ( + FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], + GET_STRING => [qw( get_strings get_string )], + FILE_UTIL => [qw( file_error append_to_file )], +); +our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; + +our $VERSION = '0.01'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) + Revision => '$Revision: 0 $', + HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $', + Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', +); +#>>> + +our $FIELD_SEP = qq{\t}; +our $FIELD_SEP_RE = qr{\Q$FIELD_SEP\E}xms; + +# +# This is poor man's gettext. +# TODO: Replace it with Locale::Maketext +# +{ + my $STRING_SKIP_RE = qr{ + \A # Begin string + \s* # Zero or more spaces + (?:\#.*)? # Comment lines + \z # End string +}xms; + + my %string; + + sub get_strings { + my ($lang) = @_; + + my $strings_fn = "$lang.strings.txt"; + + open my $strings_file, '<', $strings_fn + or croak(file_error('opening', $strings_fn, 'reading')); + my @strings_file_lines = <$strings_file>; + close $strings_file + or croak(file_error('closing', $strings_fn, 'reading')); + + STRING_LINE: + foreach my $next_string_line (@strings_file_lines) { + + # Skip blanks and comments + next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); + + chomp $next_string_line; + my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; + + # Fallback to English if no target language string was supplied + $string{$english} = $target // $english; # / + } + + return; + } + + sub get_string { + my ($english, $if_defined) = @_; + + return $if_defined + ? ($string{$english}) + : ($string{$english} //= $english); # / + } +} + +sub file_error { + my ($operation, $fn, $access_type) = @_; + my $string = "error $operation $fn, $access_type: $OS_ERROR"; + return $string; +} + +sub append_to_file { + my ($fn, $line) = @_; + + open my $file, '>>', $fn + or croak(file_error('opening', $fn, 'appending')); + + say {$file} ($line // q{}); # / + + close $file + or croak(file_error('closing', $fn, 'appeding')); + + return; +} + +1; + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-25 15:02:28 UTC (rev 86) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 14:03:26 UTC (rev 87) @@ -31,7 +31,7 @@ use strict; use warnings; use integer; # Improves performance -use open ':utf8'; +use open ':encoding(utf8)'; use utf8; use charnames ':full'; @@ -56,6 +56,12 @@ use Log::Log4perl qw(:easy); use Parse::MediaWikiDump 0.91; # Earlier versions have a different API +use MediaWiki::Toolkit ( + qw( + :FIELD_SEP + :GET_STRING :FILE_UTIL) +); + our $VERSION = '0.2.1'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) @@ -74,7 +80,6 @@ my $UNSORTED_DIR = "$OUT_DIR/unsorted"; my $MULTI_DIR = "$OUT_DIR/multilinks"; my $ALT_SEP = q{|}; -my $FIELD_SEP = qq{\t}; my $LINK_SEP = q{|}; my $TYPE_SEP = qr/\s*;\s*/xms; my $HTML_RLM = '‏'; @@ -158,28 +163,19 @@ wiki # Is supposed to be after the lang code }xms; -my $FIELD_SEP_RE = qr{\t}xms; - -my $STRING_SKIP_RE = qr{ - \A # Begin string - \s* # Zero or more spaces - (?:\#.*)? # Comment lines - \z # End string -}xms; - # pages-meta-current my $dump_fn = $ARGV[0] or croak('Dump filename must be supplied as an argument.'); -my %STRING; my $WIKI_LANG; if ((basename $dump_fn) =~ $DUMP_FN_RE) { $WIKI_LANG = $+{wiki_lang}; - get_strings($WIKI_LANG); } else { croak("$dump_fn is a weird dump file name."); } +get_strings($WIKI_LANG); + # XXX - bad i18n # ISO 9 is mostly good for Russian and it is still not perfect ASCII my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); @@ -204,14 +200,12 @@ # Constants for date processing # -my @MONTHS = @STRING{ - qw( - January February March - April May June - July August September - October November December - ) - }; +my @MONTHS = map { get_string($_) } qw( + January February March + April May June + July August September + October November December +); my $ALT_MONTHS = join $ALT_SEP, @MONTHS; my %REV_MONTH; Readonly my $LAST_MONTH => 11; @@ -1163,52 +1157,16 @@ sub is_disambig { my ($page) = @_; + my $found_templates = find_templates($page->text(), [], [ get_string('disambig') ]); + return scalar @{$found_templates}; } -# -# This is poor man's gettext. -# TODO: Replace it with Locale::Maketext -# -sub get_strings { - my ($lang) = @_; - - my $STRINGS_FN = "$lang.strings.txt"; - - open my $STRINGS_FILE, '<:utf8', $STRINGS_FN - or croak(file_error('opening', $STRINGS_FN, 'reading')); - my @strings_file_lines = <$STRINGS_FILE>; - close $STRINGS_FILE - or croak(file_error('closing', $STRINGS_FN, 'reading')); - - STRING_LINE: - foreach my $next_string_line (@strings_file_lines) { - - # Skip blanks and comments - next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); - - chomp $next_string_line; - my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; - - # Fallback to English if no target language string was supplied - $STRING{$english} = $target // $english; # / - } - - return; -} - -sub get_string { - my ($english, $if_defined) = @_; - return $if_defined - ? ($STRING{$english}) - : ($STRING{$english} //= $english); # / -} - sub make_type_fn { my ($type, $unsorted) = @_; - $unsorted //= 0; # / + $unsorted //= 0; # / #my $transliterated_type = $TRANSLITERATOR->translit($type); my $transliterated_type = $type; @@ -1222,20 +1180,6 @@ return $type_fn; } -sub append_to_file { - my ($fn, $line) = @_; - - open my $file, '>>:utf8', $fn - or croak(file_error('opening', $fn, 'appending')); - - say {$file} ($line // q{}); # / - - close $file - or croak(file_error('closing', $fn, 'appeding')); - - return; -} - # It appears simple, but in non-alphabetic languages such as Chinese # it may be different, so it will sit here ready for better i18n. sub get_sort_letter { @@ -1243,12 +1187,6 @@ return substr $string, 0, 1; } -sub file_error { - my ($operation, $fn, $access_type) = @_; - my $string = "error $operation $fn for $access_type: $OS_ERROR"; - return $string; -} - sub format_link_table { ## no critic (RequireArgUnpacking) return sprintf '%-15s %8d', @_; } @@ -1259,7 +1197,7 @@ say "processing $lang_code"; my $lang_reftype = ref $found_links{$lang_code}; if ($lang_reftype ne 'HASH') { - carp('$lang_code is $lang_reftype, not hashref!'); + carp("$lang_code is $lang_reftype, not hashref!"); next LANG_CODE; } my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-25 15:02:34
|
Revision: 86 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=86&view=rev Author: amire80 Date: 2010-03-25 15:02:28 +0000 (Thu, 25 Mar 2010) Log Message: ----------- Update of empty pages handling. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-09 12:44:41 UTC (rev 85) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-25 15:02:28 UTC (rev 86) @@ -280,9 +280,7 @@ my $SECTION_LINK_RE = qr{(?<!&)\#}xms; my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; -## no critic (RegularExpressions::ProhibitEscapedMetacharacters) my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; -## use critic (RegularExpressions::ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); @@ -420,23 +418,9 @@ exit; sub load_dump { - Readonly my $WORKING_PMWD_VER => 0.91; - Readonly my $FUTURE_PMWD_VER => 0.94; - my $dump; - if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) { - $dump = Parse::MediaWikiDump::Pages->new($dump_fn); - } - else { - if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) { - carp( 'You are running Parse::MediaWikiDump version ' - . $Parse::MediaWikiDump::VERSION - . ".\n Redirect handling may be broken\n"); - } - my $pmwd = Parse::MediaWikiDump->new(); - $dump = $pmwd->revisions($dump_fn); - } + $dump = Parse::MediaWikiDump::Pages->new($dump_fn); return $dump; } @@ -471,6 +455,12 @@ my $page_text_ref = $page_ref->text(); + if (${$page_text_ref} !~ /\S/xmsi + and not is_in_namespace($page_ref, 'User', 'User talk')) + { + special_cases_file('empty_page', $page_ref); + } + if ( $page_counter < $option{start_from} or not defined ${$page_text_ref} # must be tested before redirect @@ -1266,6 +1256,12 @@ sub print_multi_links_by_foreign { LANG_CODE: foreach my $lang_code (sort keys %found_links) { + say "processing $lang_code"; + my $lang_reftype = ref $found_links{$lang_code}; + if ($lang_reftype ne 'HASH') { + carp('$lang_code is $lang_reftype, not hashref!'); + next LANG_CODE; + } my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; say format_link_table($lang_code, scalar @foreign_articles); @@ -1274,6 +1270,14 @@ foreach my $foreign_article (@foreign_articles) { my @local_articles = keys %{ $found_links{$lang_code}->{$foreign_article} }; + + my $article_reftype = + ref $found_links{$lang_code}->{$foreign_article}; + if ($article_reftype ne 'HASH') { + carp("$foreign_article is $article_reftype, not hashref!"); + next FOREIGN_ARTICLE; + } + if (scalar @local_articles > 1) { my $links = join q{ | }, sort map { make_link($_) } keys %{ $found_links{$lang_code}->{$foreign_article} }; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-09 12:44:47
|
Revision: 85 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=85&view=rev Author: amire80 Date: 2010-03-09 12:44:41 +0000 (Tue, 09 Mar 2010) Log Message: ----------- Adding config files for ru.wikipedia. Added Paths: ----------- trunk/no-interwiki/ru.language_codes.txt trunk/no-interwiki/ru.strings.txt Added: trunk/no-interwiki/ru.language_codes.txt =================================================================== --- trunk/no-interwiki/ru.language_codes.txt (rev 0) +++ trunk/no-interwiki/ru.language_codes.txt 2010-03-09 12:44:41 UTC (rev 85) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/ru.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/ru.strings.txt =================================================================== --- trunk/no-interwiki/ru.strings.txt (rev 0) +++ trunk/no-interwiki/ru.strings.txt 2010-03-09 12:44:41 UTC (rev 85) @@ -0,0 +1,37 @@ +# months +January января +February февраля +March марта +April апреля +May мая +June июня +July июля +August августа +September сентября +October ортября +November ноября +December декабря + +no_iw Нет интервики +disambig неоднозначность +template шаблон + +date дата +type topic + +# MW specials +REDIRECT перенаправление + +# Namespaces +User Участник +User talk Обсуждение участника +Image Файл +Portal Портал +Category Категория +article space (статьи) + +# Other +other другое +rlm {{כ}} +exclude_lowercase ß + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-10-01 11:39:18
|
Revision: 84 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=84&view=rev Author: amire80 Date: 2009-10-01 11:39:00 +0000 (Thu, 01 Oct 2009) Log Message: ----------- Starting the big refactoring. Added Paths: ----------- trunk/no-interwiki/Maintenance.pm Added: trunk/no-interwiki/Maintenance.pm =================================================================== --- trunk/no-interwiki/Maintenance.pm (rev 0) +++ trunk/no-interwiki/Maintenance.pm 2009-10-01 11:39:00 UTC (rev 84) @@ -0,0 +1,1755 @@ +package Maintenance; + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 +# as published by the Free Software Foundation or under the terms of +# Artistic License version 2.0. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the licenses +# along with this program. If not, see +# * <http://www.gnu.org/licenses/gpl-3.0.html>. +# * <http://www.perlfoundation.org/artistic_license_2_0> + +# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it +use 5.010; + +# pragmata +use strict; +use warnings; +use integer; # Improves performance +use open ':utf8'; +use utf8; +use charnames ':full'; + +# Standard library +# These modules should come installed with Perl +use English qw(-no_match_vars); +use Carp qw(croak cluck); +# use Time::Local; +# use Data::Dumper; +# use File::Basename; + +# CPAN +# You must install these modules from CPAN + +# I only use Readonly where P::C complains about it. Using it with regular +# expressions which have non-ASCII chars produces heisenbugs. +use Readonly; +# use Pod::Usage; +# use Regexp::Common; +# use Lingua::Translit; +# use Log::Log4perl qw(:easy); +use Parse::MediaWikiDump 0.91; # Earlier versions have a different API + +our $VERSION = '0.0.1'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) + Revision => '$Revision: 83 $', + HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/prepare_noiw_list.pl $', + Date => '$Date: 2009-09-29 15:25:11 +0200 (Tue, 29 Sep 2009) $', +); +#>>> + +my $LANG_CODE; +my %STRINGS; +my %LANG_CODE; +my $WIKI_LANG; + +sub init { + my (%args) = @_; + + if ((basename $args{dump_fn}) =~ $DUMP_FN_RE) { + $WIKI_LANG = $+{wiki_lang}; + load_strings($WIKI_LANG); + } + else { + croak("$dump_fn is a weird dump file name."); + } + + my $LANG_CODE_FN = "$wiki_lang.language_codes.txt"; + open my $lang_code_file, '<', $LANG_CODE_FN + or croak(file_error('opening', $LANG_CODE_FN, 'reading')); + while (my $line = <$lang_code_file>) { + chomp $line; + my ($code, $name) = split /\t/xms, $line; + if (defined $code) { + $LANG_CODE{$code} = $name // $code; # / + } + } + close $lang_code_file + or croak(file_error('closing', $LANG_CODE_FN, 'reading')); + + init_logger('outtest.log'); + + # TODO: Make smarter, configurable, whatever + # $OUT_DIR must be first, because it's the parent + foreach my $out_dir ($OUT_DIR, $UNSORTED_DIR, $MULTI_DIR) { + if (-d $out_dir) { + unlink glob "$out_dir/*$WIKITEXT_EXT"; + } + else { + mkdir $out_dir; + } + } + + return; +} + +my $WIKITEXT_EXT = 'wiki.txt'; +my $OUT_DIR = 'out'; +my $UNSORTED_DIR = "$OUT_DIR/unsorted"; +my $ALT_SEP = q{|}; +my $FIELD_SEP = qq{\t}; +my $LINK_SEP = q{|}; +my $TYPE_SEP = qr/\s*;\s*/xms; +my $HTML_RLM = '‏'; + +Readonly our $DEFAULT_PAGE_FREQ => 1000; +Readonly our $DEFAULT_MAX_LINKS_PER_SECTION => 100; +Readonly our $DEFAULT_MAX_SECTIONS_PER_PAGE => 20; + +sub init_logger { + my ($log_fn) = @_; + Log::Log4perl->easy_init( + { + level => $INFO, # print everything + file => ":utf8>$log_fn", # utf is important + layout => '%m%n', # No need to print the date + } + ); + return; +} + +# XXX Too coupled to Wikipedia, won't work for other projects. +our $DUMP_FN_RE = qr{ + \A # Begin string + (?<wiki_lang>\w+) # Lang code + wiki # Is supposed to be after the lang code +}xms; + +our $FIELD_SEP_RE = qr{\t}xms; + +our $STRING_SKIP_RE = qr{ + \A # Begin string + \s* # Zero or more spaces + (?:\#.*)? # Comment lines + \z # End string +}xms; + +my %STRING; + +# XXX - bad i18n +# ISO 9 is mostly good for Russian and it is still not perfect ASCII +# my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); + +our %MW_SYNTAX = ( + 'start_template' => '{{', + 'end_template' => '}}', + 'start_link' => '[[', + 'end_link' => ']]', + 'param_sep' => q{|}, + 'paragraph' => qq{\n\n}, + 'start_wikitable' => '{|', + 'namespace_sep' => q{:}, +); + +# This monstrosity basically says: | and optional spaces +our $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; + +# +# Constants for date processing +# + +our @MONTHS = @STRING{ + qw( + January February March + April May June + July August September + October November December + ) + }; +my $ALT_MONTHS = join $ALT_SEP, @MONTHS; +my %REV_MONTH; +Readonly my $LAST_MONTH => 11; +@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); + +# XXX Internationalize +my $string_in = get_string('in'); +our $HEB_DATE_RE = qr{ + \A # begin + (?<hour>\d{1,2}) # hour + : # : + (?<min>\d{2}),\s # minute + (?<mday>\d{1,2})\s # day of month + $string_in? # This preposition appears sometimes + (?<mon>$ALT_MONTHS)\s # A name of a month + (?<year>\d+?)\s # Year + \([A-Z]{3}\) # Three letters in brackets - timezone + \z # end +}xms; + +our $PARAM_RE = qr{ + \A # Beginning of a string + \s* # Zero or more space + (?: # No capture + (?<param_name>\w+) # Word chars - param name + \s* # Zero or more space + = # = + )? # Param name is optional + (?<value>.*) # value +}xms; + +# A simplistic template just for testing. +# Quite possibly it is not needed anymore. +# Until i get a better regex for matching balancing {{}} ... +our $TEMPLATE_RE = qr{ + \A # beginning of string + \Q$MW_SYNTAX{start_template}\E # {{ + .+ # some chars + \Q$MW_SYNTAX{end_template}\E # }} + \z # end of string +}xms; + +our $WIKITABLE_RE = qr{ + \A + \Q$MW_SYNTAX{start_wikitable}\E +}xms; + +# Redirect in local language. +# Regular expression mostly copied from +# Parse::MediaWikiDump::page::redirect +# TODO: Try to look for the local language redirect keyword in the dump. +my $local_redirect = get_string('REDIRECT'); +my $LOCAL_REDIRECT_RE = qr{ + \A # Beginning of string (page) + \# # a # character + $local_redirect # Redirect keyword in local language + \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect +}xmsi; + +my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation +my $SECTION_LINK_RE = qr{(?<!&)\#}xms; +my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; + +## no critic (RegularExpressions::ProhibitEscapedMetacharacters) +my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; +## use critic (RegularExpressions::ProhibitEscapedMetacharacters) + +my $string_exclude_lowercase = get_string('exclude_lowercase'); +my $EXCLUDE_LOWERCASE_RE = qr{ + \A # Beginning of foreign article name + [$string_exclude_lowercase] # Character class of possibly lowercase chars +}xms; +my $NUMBERED_FILE_RE = qr{ + (?: _ \d*)? + \.$WIKITEXT_EXT +}xms; +my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # " + +my $TWO_DIGIT_CHARNUMBER_RE = qr{ + (?: + [%.] # There are both %C4%B0 and .AA.E0 + [[:xdigit:]]{2} # 2 hex digits + ) +}xms; + +my $HTML_CHARNUMBER_RE = qr{ + (?: + &\#\d+; # stuff like Š + ) +}xms; + +# TODO: Check whether it is Neapolitan with its '' +my $CHARACTER_CODE_IN_LINK_RE = qr{ + $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE +}xms; + +sub load_lang_codes { + my ($wiki_lang) = @_; + +} + +my $string_no_iw = get_string('no_iw'); +my $SIMPLE_NO_IW_CHECK_RE = qr{ + \Q$string_no_iw\E # The string may have spaces +}xmsi; + +my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE; + +my $INTERWIKI_LINK_RE = qr{ + \Q$MW_SYNTAX{start_link}\E + (?<lang_code> + $ALT_LANGS + ) + : + (?<foreign_article> + .+? + ) + \Q$MW_SYNTAX{end_link}\E +}xms; + +# Lojban allows lowercase articles +my @LOWERCASE_LANGS = qw(jbo); + +our $the_dump = load_dump(); + +my $namespaces_alt = join $ALT_SEP, + grep { length > 0 } @{ $the_dump->namespaces_names() }; +my $PURE_TITLE_RE = qr{ + \A + (?: + (?:$namespaces_alt) + $MW_SYNTAX{namespace_sep} + )? # Namespace name is optional + (?<pure_title>.+) + \z +}xms; + +our %statistics = (count_iw => []); + +my $begin_time = time; +process_dump(); +my $total_time = time - $begin_time; + +print_stats(); + +create_no_iw_pages(); + +INFO(q{}); + +# my @found_lang_codes = sort keys %found_links; +# INFO("found lang_codes: @found_lang_codes"); + +my $begin_multi_links_time = time; + +say "\nlisting multi links by language"; +print_multi_links_by_foreign(); + +say "\nlisting multi links by local articles"; +print_multi_links_by_local(); + +my $total_multi_links_time = time - $begin_multi_links_time; +INFO("total multi links time: $total_multi_links_time"); + +exit; + +sub load_dump { + Readonly my $WORKING_PMWD_VER => 0.91; + Readonly my $FUTURE_PMWD_VER => 0.94; + + my $dump; + + if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) { + $dump = Parse::MediaWikiDump::Pages->new($dump_fn); + } + else { + if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) { + carp( 'You are running Parse::MediaWikiDump version ' + . $Parse::MediaWikiDump::VERSION + . ".\n Redirect handling may be broken\n"); + } + my $pmwd = Parse::MediaWikiDump->new(); + $dump = $pmwd->revisions($dump_fn); + } + + # This is the only currently known value + # but there could be more in the future + if ($dump->case() ne 'first-letter') { + croak(q{unable to handle any case setting besides 'first-letter'}); + } + + return $dump; +} + +sub namespace { + my ($page) = @_; + return $page->namespace() || get_string('article space'); +} + +sub next_page { + my ($dump) = @_; + my $page_ref = eval { $dump->next(); }; + if ($EVAL_ERROR) { + confess("Failed reading a page: $EVAL_ERROR"); + } + return $page_ref; +} + +sub process_dump { + PAGE: + while (my $page_ref = next_page($the_dump)) { + $page_counter++; + if ($page_counter % $option{page_freq} == 0) { + say $page_counter; + } + + if ( $option{stop_after} + and $page_counter > $option{stop_after}) + { + last PAGE; + } + + my $page_text_ref = $page_ref->text(); + + if ( + $page_counter < $option{start_from} + or not defined ${$page_text_ref} # must be tested before redirect + or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) + or is_redirect($page_ref) + ) + { + next PAGE; + } + + my $page_namespace = namespace($page_ref); + $namespace_count{$page_namespace}++; + + my $page_title = $page_ref->title(); + + INFO("\n* processing $page_counter - ", $page_title); + + # TODO: Be more precise here. + # Portal pages which have a '/' in their name are probably + # internal and do not need interwiki links. + if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + INFO('internal portal, skipping'); + $statistics{'internal portal'}++; + next PAGE; + } + + # A simple sanity check: is the no_iw template anywhere around here? + my $has_template_no_iw = + (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); + + # Does the page have interwiki links? + # BIG XXX Actually checks only for English + my $has_iw = has_interwiki($page_ref); + + if ($has_iw) { + INFO("has link to $has_iw"); + if ($has_template_no_iw) { + INFO('has template no_iw. trying to remove ...'); + remove_template_no_iw($page_text_ref); + $statistics{'has both valid interwiki and template'}++; + special_cases_file('outdated_template', $page_ref); + } + } + else { # does not have iw + process_iwless_page($page_ref, $has_template_no_iw, $has_iw); + } + } + + return; +} + +sub process_iwless_page { + my ( + $page_ref, # object ref + $has_template_no_iw, # scalar bool + $has_iw # scalar bool + ) = @_; + + INFO(q(does not have iw link.)); + $statistics{'has no interwiki link'}->{ namespace($page_ref) }++; + + # Now we need to search for no_iw templates + # and parse their parameters - date and type + + my @found_templates = (); + + my $page_text_ref = $page_ref->text(); + + # Optimized - does not start searching, + # if we already know that it is not there + if ($has_template_no_iw) { + find_templates($page_text_ref, \@found_templates, + [ get_string('no_iw') ]); + } + + my $found_templates_count = scalar @found_templates; + INFO("found templates: $found_templates_count"); + + my $template; + if ($found_templates_count) { + + # Assume the first one is the right one + $template = $found_templates[0]; + if ($found_templates_count > 1) { + WARN('many templates were found'); + $statistics{'many templates'}++; + special_cases_file('many_templates', $page_ref); + } + else { + INFO('good, found one template'); + $statistics{'one template'}++; + } + } + else { + INFO('no templates found'); + $statistics{'no templates found'}++; + } + + if (defined $template) { + INFO('has template no_iw'); + my $date_str = $template->{params}->{date}; + if (defined $date_str) { + INFO('checking cooling date'); + my $date_ref = parse_date($date_str); + if (not defined $date_ref) { + INFO("invalid date: '$date_str'"); + $statistics{'invalid date'}++; + special_cases_file('invalid_date', $page_ref); + } + elsif (cooling_date_passed($date_ref)) { + INFO('cooling date passed, updating to today ...'); + update_cooling_date($page_text_ref); + $statistics{'cooling date passed'}++; + } + else { + INFO(q(cooling date did not pass.)); + $statistics{q(cooling date did not pass)}++; + } + } + else { + INFO('date not defined'); + } + + } + + my @all_types = get_all_types($template->{params}->{type}, $page_ref); + + foreach my $type (@all_types) { + INFO('adding ' . $page_ref->title() . " to the list as type $type"); + add_to_no_iw_list($page_ref, $type); + $type_count{$type}++; + } + + return 1; +} + +sub get_all_types { + my ($type_param, $page) = @_; + + $type_param //= q{}; # / + strip_whitespace($type_param); + + my @all_types = split $TYPE_SEP, $type_param; + + my $page_title = $page->title(); + if (is_category($page)) { + INFO("$page_title is a category"); + push @all_types, get_string('Category'); + $statistics{'categories'}++; + } + + if (is_in_namespace($page, 'Portal')) { + INFO("$page_title is a portal"); + push @all_types, get_string('Portal'); + $statistics{'portal'}++; + } + + if (is_disambig($page)) { + INFO("$page_title is a disambiguation"); + push @all_types, get_string('disambig'); + $statistics{'disambig'}++; + } + + # Still nothing? + if (not scalar @all_types) { + my $other_type = get_string('other'); + INFO("$page_title does not have any type, adding to $other_type"); + @all_types = ($other_type); + $statistics{'automatically added to type other'}++; + } + + return @all_types; +} + +sub find_templates { + my ( + $text_ref, # string ref + $found_templates_ref, # array ref + $filter # string array ref + ) = @_; + + # A reference to an array with one empty string. + # Matching against an empty string will always succeed. + $filter //= [q{}]; # / + + # Get all highest-level matches + my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo); + + MATCH: + foreach my $next_match (@matches) { + if ($next_match !~ $TEMPLATE_RE) { + INFO(q(i thought that it is a template, but it was:)); + if ($next_match =~ $WIKITABLE_RE) { + INFO('a wikitable'); + } + else { + INFO("something else: $next_match"); + } + INFO(q{}); + next MATCH; + } + + foreach my $next_filter (@{$filter}) { + + # N.B. - case-insensitive. Wrong, but kinda useful. + if ($next_match =~ + /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi) + { + + # N.B.: parse_template calls find_templates() recursively + my $parsed_template = + parse_template(\$next_match, [qw(date type)], $filter); + push @{$found_templates_ref}, $parsed_template; + } + } + } + + return $found_templates_ref; +} + +sub parse_template { + my ( + $template, # string ref + $default_param_names, # string array ref + $subtemplate_filter, # string array ref + ) = @_; + + # %parsed_template: + # {text} - string ref; might be big + # {name} - string + # {params} - hash ref + # {subtemplates} - array ref + my (%parsed_template, %parsed_params, @clauses); + + $parsed_template{text} = strip_template_curlies(${$template}); + + # First string of the split is the template name, + # the rest is the params + ($parsed_template{name}, @clauses) = + (split $PARAM_SEP_RE, ${ $parsed_template{text} }); + + my $param_counter = 0; + foreach my $clause (@clauses) { + if ($clause =~ $PARAM_RE) { + #<<< no perltidy + my ($name, $value) = @+{ qw(param_name value) }; + #>>> + if (defined $name) { + if (defined $param_name{$name}) { + $name = $param_name{$name}; + } + } + else { + + # Get next default param name. + # If a default name is not defined, just use the number + $name = $default_param_names->[$param_counter] + || $param_counter; + } + $parsed_params{$name} = $value; + } + else { + my $error_msg = "Weird - $clause does not look like a param"; + INFO($error_msg); + cluck($error_msg); + $statistics{'weird param'}++; + } + $param_counter++; + } + $parsed_template{params} = \%parsed_params; + + # Possible recursion + find_templates($parsed_template{text}, $parsed_template{subtemplates}, + $subtemplate_filter); + + return \%parsed_template; +} + +sub parse_date { + my ($date_str) = @_; + + return if (not defined $date_str); + + if ($date_str =~ $HEB_DATE_RE) { + INFO("found a valid date: $date_str"); + my %parsed_date = ( + 'sec' => 0, # useful for timelocal + ); + foreach my $date_part (qw(hour min mday mon year)) { + $parsed_date{$date_part} = $+{$date_part}; + } + $parsed_date{mon} = $REV_MONTH{ $parsed_date{mon} }; + + # strip leading zeros + foreach (values %parsed_date) { + s{ + \A + 0+ + (?<number>\d+) + } + {$+{number}}xms; + } + return \%parsed_date; + } + + # Returns undef for an invalid date + return; +} + +sub strip_template_curlies { + my ($template) = @_; + for ($template) { + s{ + \A + \Q$MW_SYNTAX{start_template}\E + }{}xms; + s{ + \Q$MW_SYNTAX{end_template}\E + \z + }{}xms; + } + return \$template; +} + +# no arg unpacking for simplicity and performance +sub strip_whitespace { ## no critic (RequireArgUnpacking) + for (@_) { + s/\A\s*//xms; + s/\s*\z//xms; + } + return; +} + +sub has_interwiki { + my ($page) = @_; + + my $page_title = $page->title(); + my $page_text = ${ $page->text() }; # XXX + + study $page_text; # XXX + + my %iw_links; + my %special_cases; + + while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX + my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)}; + if (defined $iw_links{$lang_code}) { + $special_cases{double_links}->{$lang_code} = q{}; + } + else { + $iw_links{$lang_code} = $foreign_article; + } + + # A # sign not after an &. + # After an & it is probably a character number. + if ($foreign_article =~ $SECTION_LINK_RE) { + $special_cases{section_links}->{$lang_code} = q{}; + } + + # Char codes are common in section links, so there is no + # need to show them again + elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) { + $special_cases{charnumber_links}->{$lang_code} = q{}; + } + + # Lowercase links + if ( (not $lang_code ~~ @LOWERCASE_LANGS) + and ($foreign_article =~ $LOWERCASE_LINK_RE)) + { + my $include_lowercase_link = 1; + + if (defined get_string('exclude_lowercase', 'if defined') + and $foreign_article =~ $EXCLUDE_LOWERCASE_RE) + { + $include_lowercase_link = 0; + } + + if ($include_lowercase_link) { + $special_cases{lowercase_links}->{$lang_code} = q{}; + } + } + + $found_links{$lang_code}->{$foreign_article}->{$page_title} = q{}; + } + + my @all_langs = keys %iw_links; + my $count_iw = scalar @all_langs; + if ($count_iw) { + if (not defined $statistics{count_iw}->[$count_iw]) { + $statistics{count_iw}->[$count_iw] = []; + } + push @{ $statistics{count_iw}->[$count_iw] }, $page_title; + } + INFO("iw link count for $page_title is: $count_iw"); + + for my $special_case_name (keys %special_cases) { + if (scalar %{ $special_cases{$special_case_name} }) { + special_cases_file($special_case_name, $page, + $special_cases{$special_case_name}); + } + } + + # BIG XXX Still very stupid, but getting better + if (defined $iw_links{en}) { + return 'en'; + } + + return q{}; +} + +sub special_cases_file { + my ($special_case_name, $page, $special_cases_ref) = @_; + $special_cases_ref //= {}; # / + + my $special_case_langs = join q{ }, sort keys %{$special_cases_ref}; + + if ($special_case_langs) { + $special_case_langs = " ($special_case_langs)"; + } + + my $special_case_fn = make_type_fn($special_case_name, 1); + + if (not -e $special_case_fn) { + append_to_file($special_case_fn, $special_case_name); + } + + my $page_title = $page->title(); + my $link = make_link($page_title); + my $line = + $link + . $special_case_langs + . $FIELD_SEP + . get_sort_title($page_title); + + append_to_file($special_case_fn, $line); + + return; +} + +sub remove_template_no_iw { + my ($params) = @_; + INFO( "Supposed to remove the no_iw template now, but ...\n" + . 'This sub is a stub. You can help Wikipedia by expanding it!'); + + return 0; +} + +sub cooling_date_passed { + my ($date_ref) = @_; + + my @page_times = @{$date_ref}{qw(sec min hour mday mon year)}; + INFO("page times: @page_times"); + + my $pageseconds = timelocal(@page_times); + return $pageseconds < $LATEST_COOLING; +} + +sub update_cooling_date { + my ($params) = @_; + + INFO( "Supposed to update cooling date now, but ...\n" + . "This sub is a stub. You can help Wikipedia by expanding it!\n" + ); + + return 0; +} + +# Just a debugging thingie +sub print_parsed_template { + my ($template_ref) = @_; # hash ref + + INFO('text:'); + INFO(${ $template_ref->{text} }); + INFO('name:'); + INFO($template_ref->{name}); + INFO('params:'); + foreach my $next_param (sort keys %{ $template_ref->{params} }) { + INFO("$next_param: $template_ref->{params}->{$next_param}"); + } + + if (defined $template_ref->{subtemplates}) { + INFO("subtemplates: $template_ref->{subtemplates}"); + } + + return; +} + +sub get_sort_title { + my ($page_title) = @_; + + my $sort_title; + if ($page_title =~ $PURE_TITLE_RE) { + $sort_title = $+{pure_title}; + } + else { + INFO( 'Something badly weird happened - ' + . "$page_title has no pure title"); + croak('A page has no pure title'); + } + + return $sort_title; +} + +sub add_to_no_iw_list { + my ( + $page, # ref + $page_type, # string + ) = @_; + + my $page_title = $page->title(); + INFO("Adding $page_title to no_iw_list"); + + my $sort_title = get_sort_title($page_title); + INFO("full title: $page_title"); + INFO("sort title: $sort_title"); + + my $unsorted_type_fn = make_type_fn($page_type, 1); + if (not -e $unsorted_type_fn) { + append_to_file($unsorted_type_fn, $page_type); + } + my $link = make_link($page_title); + INFO("link to page: $link"); + my $line = $link . $FIELD_SEP . $sort_title; + + append_to_file($unsorted_type_fn, $line); + + return; +} + +sub make_link { + my ($page_title) = @_; + + my $link_to_page = + $MW_SYNTAX{start_link} + . $MW_SYNTAX{namespace_sep} + . $page_title + . $MW_SYNTAX{end_link}; + + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; + } + + return $link_to_page; +} + +sub create_no_iw_pages { + my ($params) = @_; + + INFO("\ncreating no_iw pages"); + + # Run over page types + UNSORTED_TYPE_FN: + foreach my $unsorted_type_fn (glob "$UNSORTED_DIR/*") { + my %all_pages_in_type = (); + open my $unsorted_type_file, '<', $unsorted_type_fn + or croak(file_error('opening', $unsorted_type_fn, 'reading')); + my @lines = <$unsorted_type_file>; + close $unsorted_type_file + or croak(file_error('closing', $unsorted_type_fn, 'reading')); + + my $type_name = shift @lines; + chomp $type_name; + foreach my $line (@lines) { + chomp $line; + my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line; + my $sort_letter = get_sort_letter($sort_title); + $all_pages_in_type{$sort_letter} //= []; # / + push @{ $all_pages_in_type{$sort_letter} }, $page_title; + } + write_sorted_pages($type_name, \%all_pages_in_type); + } + return; +} + +sub write_sorted_pages { + my ($type_name, $type_tree_ref) = @_; + + my $type_fn = make_type_fn($type_name); + + my $section_counter = 0; + my $page = q{}; + my $file_number = 1; + + LETTER: + foreach my $next_letter (sort keys %{$type_tree_ref}) { + my @all_links_in_letter = + sort @{ $type_tree_ref->{$next_letter} }; + + my $links_count = scalar @all_links_in_letter; + my $sections = + ($links_count - 1) / $option{max_links_per_section} + 1; + my $links_per_section = $links_count / $sections; + + SECTION: + foreach my $section_num (1 .. $sections) { + if ($section_counter == $option{max_sections_per_page}) { + write_page(\$page, \$type_fn, $file_number++); + $section_counter = 0; + + $page = q{}; + } + elsif ($section_counter) { + $page .= $MW_SYNTAX{paragraph}; + } + $section_counter++; + my $title = $next_letter; + if ($sections > 1) { + $title .= " $section_num"; + } + $page .= mw_heading($title); + + my $first_link = ($section_num - 1) * $links_per_section; + my $last_link = + ($section_num == $sections) + ? $links_count - 1 + : $first_link + $links_per_section - 1; + my $links = join_links( + [ @all_links_in_letter[ $first_link .. $last_link ] ]); + $page .= $links; + } + } + + # The page may be empty at this point + if ($page) { + write_page(\$page, \$type_fn, $file_number++); + } + + return; +} + +sub write_page { + my ($page_ref, $type_fn_ref, $file_number) = @_; + + my $pretty_file_number = sprintf '%03d', $file_number; + ${$type_fn_ref} =~ s{ + $NUMBERED_FILE_RE + } + {_$pretty_file_number.$WIKITEXT_EXT}xmso; + INFO("creating file ${$type_fn_ref}"); + append_to_file(${$type_fn_ref}, ${$page_ref}); + + return; +} + +sub mw_heading { + my ( + $text, # string + $level # number + ) = @_; + + $level //= 2; # / + my $level_marker = q{=} x $level; + + # Line ending is mandatory + return "$level_marker $text $level_marker\n"; +} + +sub mw_bold { + my ($text) = @_; + return "'''$text'''"; +} + +# Custom Unicode character property, which is like \w, but for Hebrew. +# The custom is to give custom Unicode character classes CamelCase names. +# P::C policy ProhibitMixedCaseSubs is deprecated. +sub IsRightToLeft { ## no critic (Capitalization) + return <<'END'; ++utf8::InHebrew ++utf8::IsSpace ++utf8::IsPunct +END +} + +sub is_redirect { + my ($page) = @_; + + if ($page->redirect()) { + return 'English'; + } + + my $page_text_ref = $page->text(); + + if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { + return 'local'; + } + + return q(); +} + +sub is_in_namespace { + my ($page, @namespaces) = @_; + + return namespace($page) ~~ [ map { get_string($_) } @namespaces ]; +} + +sub is_category { + my ($page) = @_; + return is_in_namespace($page, 'Category'); +} + +sub is_disambig { + my ($page) = @_; + my $found_templates = + find_templates($page->text(), [], [ get_string('disambig') ]); + return scalar @{$found_templates}; +} + +# +# This is poor man's gettext. +# TODO: Replace it with Locale::Maketext +# +sub get_strings { + my ($lang) = @_; + + my $STRINGS_FN = "$lang.strings.txt"; + + open my $STRINGS_FILE, '<:utf8', $STRINGS_FN + or croak(file_error('opening', $STRINGS_FN, 'reading')); + my @strings_file_lines = <$STRINGS_FILE>; + close $STRINGS_FILE + or croak(file_error('closing', $STRINGS_FN, 'reading')); + + STRING_LINE: + foreach my $next_string_line (@strings_file_lines) { + + # Skip blanks and comments + next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); + + chomp $next_string_line; + my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; + + # Fallback to English if no target language string was supplied + $STRING{$english} = $target // $english; # / + } + + return; +} + +sub get_string { + my ($english, $if_defined) = @_; + return $if_defined + ? ($STRING{$english}) + : ($STRING{$english} //= $english); # / +} + +sub make_type_fn { + my ($type, $unsorted) = @_; + $unsorted //= 0; # / + + #my $transliterated_type = $TRANSLITERATOR->translit($type); + my $transliterated_type = $type; + + my $type_fn = "$transliterated_type.$WIKITEXT_EXT"; + + $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo; + my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR; + $type_fn = "$dir/$type_fn"; + + return $type_fn; +} + +sub append_to_file { + my ($fn, $line) = @_; + + open my $file, '>>:utf8', $fn + or croak(file_error('opening', $fn, 'appending')); + + say {$file} ($line // q{}); # / + + close $file + or croak(file_error('closing', $fn, 'appeding')); + + return; +} + +# It appears simple, but in non-alphabetic languages such as Chinese +# it may be different, so it will sit here ready for better i18n. +sub get_sort_letter { + my ($string) = @_; + return substr $string, 0, 1; +} + +sub file_error { + my ($operation, $fn, $access_type) = @_; + my $string = "error $operation $fn for $access_type: $OS_ERROR"; + return $string; +} + +sub format_link_table { ## no critic (RequireArgUnpacking) + return sprintf '%-15s %8d', @_; +} + +sub print_multi_links_by_foreign { + LANG_CODE: + foreach my $lang_code (sort keys %found_links) { + my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; + my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say format_link_table($lang_code, scalar @foreign_articles); + + FOREIGN_ARTICLE: + foreach my $foreign_article (@foreign_articles) { + my @local_articles = + keys %{ $found_links{$lang_code}->{$foreign_article} }; + if (scalar @local_articles > 1) { + my $links = join q{ | }, sort map { make_link($_) } + keys %{ $found_links{$lang_code}->{$foreign_article} }; + + my $foreign_title = + make_link($lang_code + . $MW_SYNTAX{namespace_sep} + . $foreign_article); + append_to_file($filename, "* '''$foreign_title''' - $links"); + } + } + } + + return; +} + +sub print_multi_links_by_local { + my %local_multi_links; + LANG_CODE: + foreach my $lang_code (sort keys %found_links) { + my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say format_link_table($lang_code, scalar @foreign_articles); + + FOREIGN_ARTICLE: + foreach my $foreign_article (@foreign_articles) { + my @local_articles = + keys %{ $found_links{$lang_code}->{$foreign_article} }; + + if (scalar @local_articles > 1) { + add_local_multi( + \%local_multi_links, + make_link( + $lang_code + . $MW_SYNTAX{namespace_sep} + . $foreign_article + ), + @local_articles + ); + } + } + } + + say 'writing local multilinks file'; + my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT"; + foreach my $local_multi_article (sort keys %local_multi_links) { + append_to_file($filename, + '* ' . mw_bold(make_link($local_multi_article))); + + foreach my $other_local_article ( + sort keys %{ $local_multi_links{$local_multi_article} }) + { + append_to_file($filename, + '** ' . make_link($other_local_article)); + my $foreign_articles = join_links( + $local_multi_links{$local_multi_article} + ->{$other_local_article}, + 0 + ); + append_to_file($filename, "*** $foreign_articles"); + } + append_to_file($filename); + } + + return; +} + +sub add_local_multi { + my ( + $local_multi_links_ref, $foreign_link, + $first_local_article, @other_local_articles + ) = @_; + + $local_multi_links_ref->{$first_local_article} //= {}; # / + + foreach my $other_local_article (@other_local_articles) { + $local_multi_links_ref->{$first_local_article} + ->{$other_local_article} //= []; # / + push @{ $local_multi_links_ref->{$first_local_article} + ->{$other_local_article} }, $foreign_link; + } + + return; +} + +sub join_links { + my ($links_ref, $line_end) = @_; + $line_end //= 1; # / + + my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ }); + return join $link_sep, @{$links_ref}; +} + +sub print_stats { + INFO("\nSUMMARY"); + INFO("total time: $total_time"); + foreach my $stat_type (sort keys %statistics) { + if (not ref $statistics{$stat_type}) { + INFO("$stat_type: $statistics{$stat_type}"); + } + } + + my $max_iw_index = $#{ $statistics{count_iw} }; + INFO("max_iw_index: $max_iw_index"); + MAX_IW: + for my $max_iw_place (0 .. $option{max_iw_places}) { + my @links = + map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] } + or last MAX_IW; + INFO("# $max_iw_index: " . join_links(\@links, 0)); + + # Do nothing, just count down to the next index with a defined list. + # $max_iw_index needs to be checked for nonzero-ness for the rare case + # of a very low page count. + while ($max_iw_index + and not defined $statistics{count_iw}->[ --$max_iw_index ]) + { + } + } + + INFO('pages without interwiki links per namespace'); + foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) { + my $iwless_in_namespace = + $statistics{'has no interwiki link'}->{$namespace}; + + no integer; + ## no critic (ProhibitMagicNumbers) + my $percentage = sprintf '%.2f', + 100 * $iwless_in_namespace / $namespace_count{$namespace}; + ## use critic (ValuesAndExpressions::ProhibitMagicNumbers) + use integer; + + INFO("$namespace: $iwless_in_namespace, $percentage%"); + } + + INFO("\nNAMESPACES"); + foreach my $namespace (sort keys %namespace_count) { + INFO("$namespace: $namespace_count{$namespace}"); + } + + INFO("\nTYPES"); + foreach my $type (sort keys %type_count) { + INFO("$type: $type_count{$type}"); + } + + my $dump_size = -s $dump_fn; + my $log_size = -s $LOG_FN; + INFO("dump size: $dump_fn"); + INFO("log size: $LOG_FN"); + if ($log_size > 0) { + my $dump_log_ratio = $dump_size / $log_size; + INFO("dump/log ratio: $dump_log_ratio"); + } + else { + WARN("weird: log file size $LOG_FN is 0"); + } + + return; +} + +__END__ + +=head1 NAME + +C<prepare_noiw_list.pl> - Searches a MediaWiki dump for pages without +interlanguage links and prepares categorized lists of those pages. + +=head1 VERSION + +This documentation refers to + +prepare_noiw_list.pl + +version 0.2.1 - Noa. + +=head1 USAGE + +=over + +=item * C<prepare_noiw_list.pl ./big-files/ruwiki-20080420-pages-meta-current.xml> + +=item * C<prepare_noiw_list.pl --rtl ./big-files/hewiki-20080420-pages-meta-current.xml> + +=item * C<prepare_noiw_list.pl --stop_after=20000 ./big-files/hewiki-20080420-pages-meta-current.xml> + +=back + +=head1 REQUIRED ARGUMENTS + +=over + +=item * MediaWiki dump file name is required + +=back + +=head1 OPTIONS + +=over + +=item * --rtl makes special fixes for right-to-left languages. + +=item * --stop_after=NUMBER Stops processing after page with the given +NUMBER. + +=item * --start_from=NUMBER Begins processing after page with the given +NUMBER. + +=item * --page_freq=NUMBER Print the page counter every NUMBER of +pages. 1000 by default. + +=item * --max_links_per_section Maximum number of links per section in +the output page. Default is 100. + +=item * --max_sections_per_page Maximum number of sections per output +page. Default is 20. + +=item * --max_iw_places Number of places to print in the statistics of +pages with the most interlanguage links. + +=item * --version Print the version number and exit. + +=item * --usage Print basic usage and exit. + +=item * --help Print full help and exit. + +=back + +=head1 DESCRIPTION + +The main goal of this program is to find pages which do not have +interwiki (interlanguage) links to certain languages. + +This program scans a MediaWiki XML dump file. It searches every page for +interwiki links and for a special hidden template, which defines to which +types of interwiki-less pages it belongs. + +=over + +=item * If the page contains links to the defined languages and does not +contain the "no interwiki" template, its processing stops. + +=item * If the page contains links to the defined languages and contains this +template, it is logged, so the template can be removed. (It is planned that +it will be removed automatically in the future.) + +=item * If the page contains no links to the defined languages and does not +contain the template, it is automatically added to type "other". + +=item * If the page contains no links to the defined languages and +a template with types, it is added to the defined types. + +=back + +Pages without links are added to nicely formatted lists according +to their type. + +This program also collects some information on the way about problematic +cases - more than one "no interwiki" template, invalid templates, invalid +or redundant interwiki links etc. + +You can get a dump for using this program at L<http://download.wikimedia.org/>. + +=head1 DIAGNOSTICS + +=head2 Invalid command line options. + +See L</"REQUIRED ARGUMENTS"> and L</"OPTIONS">. + +=head2 FILENAME is a weird dump file name. + +The dump file does not appear to have a standard name that appears +at L<http://download.wikimedia.org/>. + +=head2 error opening FILENAME ... + +=head2 error closing FILENAME ... + +Check the corresponding error. + +=head2 unable to handle any case setting besides 'first-letter' + +Something is weird with the dump. See the documentation of +L<Parse::MediaWikiDump> and MediaWiki. + +=head2 A page has no pure title + +Something is particularly weird with the name of a page. The program cannot +separate its name from its namespace. It can also be a bug in this program. + +=head2 Some weirdness happened - STRING does not look a param + +STRING is supposed to be a parameter in a template, but it does not look like +one. It could be an error in the template, and also a bug in this program +(the parser that this program employs is rather stupid). + +=head2 Unicode character 0xNUMBER is illegal + +This is a standard Perl warning. It may appear if a page or its title have +funky Unicode characters which should not be there according to the Unicode +standard (to be more precise, according to the implementation of this +standard in your version of perl). Most probably these characters are not +supposed to be in the page and should be fixed, but otherwise this issue +is not supposed to affect the functionality of this program significantly. + +This was reported as a MediaWiki bug: +L<https://bugzilla.wikimedia.org/show_bug.cgi?id=14600> + +=head1 EXIT STATUS + +Nothing interesting, read the log :) + +=head1 CONFIGURATION + +=head2 Language codes file + +The file name has the form C<LANGUAGE_CODE.language_codes.txt>, where +LANGUAGE_CODE is the code of the wiki which is being processed. It includes +a list of language codes which are searched, the full name of the language +and the group to which this language belongs. + +=head2 Localized strings file + +The file name has the form C<LANGUAGE_CODE.strings.txt>, where LANGUAGE_CODE +is the code of the wiki which is being processed. This file includes pairs +of strings - the English name of a feature and its localized counterpart. + +=head2 Log + +The log is called outtest.log. + +=head1 DEPENDENCIES + +This module requires these CPAN modules: + +=over + +=item * C<Parse::MediaWikiDump> + +This module is used for reading pages from the XML dump. + +=item * C<Regexp::Common> + +This module is used for searching templates which use balanced parenthesis, +but pretty badly. :) + +=item * C<Log::Log4perl> + +This module provides easy global logging. + +=item * C<Lingua::Translit> + +This module is used for transliterating filenames to ASCII. + +=item * C<Readonly> + +To make Perl::Critic happy :) + +=item * C<Pod::Usage> + +Some style guide recommended it. I don't even remember which one, but i love style guides. + +=back + +=head1 HACKING + +=head2 Perl 5.10 + +This program requires Perl 5.10. It has new clean and useful syntax, which +makes the programs easier to hack, maintain and debug. It is useless to try +and run it on an older version, unless you want to waste your time +backporting. Please upgrade your Perl installation if you still have 5.8 or +(horrors!) something older. + +=head2 Perl Best Practices, Perl::Critic and perltidy + +Great effort has been put into making this source code pass as cleanly as +possible the Perl::Critic tests in the 'brutal' mode. It also uses perltidy +for automatic code formatting. If you modify it, do yourself a favor, install +Perl::Critic and regularly test it using this command: + +./tidy.sh prepare_noiw_list.pl + +It checks the syntax, runs perltidy on the code and runs Perl::Critic. + +All the places where P::C has been disabled using "no critic" are explained. + +The time invested in making the code P::C-friendly will be returned as time +saved on debugging. Also consider reading the book "Perl Best Practices" by +Damian Conway if you have not already. + +=head1 INCOMPATIBILITIES + +=head2 Unicode issues + +This program works best on GNU/Linux, where Perl and the filesystem are +Unicode-friendly. + +This program was also tested on Windows XP and Vista with ActivePerl and +Cygwin. In these environments Unicode-related issues caused +filenames and clipboard text to become jumbled. You have been warned. + +=head1 BUGS AND LIMITATIONS + +Please report all bugs, features requests and other comments to +Amir E. Aharoni (ami...@ma...). + +=head2 There is no equality between languages + +Currently this program actually only lists pages which do not have +an interwiki link to the English Wikipedia. This is obviously not useful on +the English Wikipedia and is conceptually problematic on other Wikipedias, +too. This is being fixed, but it is not simple to do it Right. + +Goal: 0.4 Reut + +=head2 Internationalization is far from perfect + +Date handling and strings localization is very primitive. There are plans +to upgrade it to smarter modules such as Locale::Maketext. + +Goal: 0.6 Itay + +=head2 MediaWiki parsing is ad hoc + +This program only does very rudimentary and ad hoc MediaWiki syntax parsing. + +Goal: None at the moment, it works well enough. + +=head2 Templates are removed semi-automatically + +Templates on pages which already have needed links are not removed +automatically. A list of them is created and a bot can run on it and remove +the outdated templates. This can be done automatically. + +Goal: None at the moment, it works well enough. + +=head2 Cooling date + +The implementation of the cooling date is very rudimentary. + +Goal version: v0.4 Reut + +=head2 Major refactoring is needed + +=over + +=item * The main code is on the brink of passing the threshold for complexity that +P::C accepts. + +=item * There is no separation of searching and formatting. There are two main +function here: C<process_dump()> and C<create_no_iw_pages()>. They are doing +separate things and should run from different programs. + +=item * Statistics and multi links are just slapped to the log. + +=item * At least some of the code can be rewritten as classes that inherit +from L<Parse::MediaWikiDump>. + +=back + +Goal: v0.8 Moshe + +=head2 There is no test suite + +That can be done after proper modularization. Also, a local test MediaWiki +server would be needed. + +Goal: v1.0 Drora + +=head1 HISTORY + +=over + +=item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty +categories sorting. Memory usage optimization - accumulating information in +files. More generic, but far-from-perfect handling of links to languages +other than English. Transliteration with Lingua::Translit. Logging with +Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not finished: +complete statistics, removal of templates from pages which already have links. + +=item * B<0.1 - First and unnamed Amir E. Aharoni's version>: Types +introduced. Conceptual l10n, but only tested on he.wiki. Still en links +only. Informative, though lo-tec logging. + +=item * B<0.0 - Felagund's version>: no types, he.wiki only, en links only, +but it uploaded the lists automatically... + +=back + +=head1 AUTHOR + +=over + +=item * Creator is Nadav Perez (Felagund) + +=over + +=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Felagund> + +=back + +=item * It is (roughly) based on another bot by Guy Shaked (Costello). + +=over + +=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Costello> + +=back + +=item * Then Amir E. Aharoni (Amire80) came and mutilated it beyond +recognition. + +=over + +=item * Hebrew: L<http://he.wikipedia.org/wiki/User:Amire80> + +=item * English: L<http://en.wikipedia.org/wiki/User:Amire80> + +=item * Russian: L<http://ru.wikipedia.org/wiki/User:Amire80> + +=back + +=back + +=head1 LICENSE AND COPYRIGHT + +Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License version 3 +as published by the Free Software Foundation or under the terms of +Artistic License version 2.0. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the licenses +along with this program. If not, see + +=over + +=item * L<http://www.gnu.org/licenses/gpl-3.0.html>. + +=item * L<http://www.perlfoundation.org/artistic_license_2_0> + +=back + +I<Visca la llibertat!> + +=cut + +1; + Property changes on: trunk/no-interwiki/Maintenance.pm ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-29 13:25:18
|
Revision: 83 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=83&view=rev Author: amire80 Date: 2009-09-29 13:25:11 +0000 (Tue, 29 Sep 2009) Log Message: ----------- Refactoring empty pages handling. Modified Paths: -------------- trunk/no-interwiki/cv.strings.txt trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/cv.strings.txt =================================================================== --- trunk/no-interwiki/cv.strings.txt 2009-09-27 12:41:34 UTC (rev 82) +++ trunk/no-interwiki/cv.strings.txt 2009-09-29 13:25:11 UTC (rev 83) @@ -23,7 +23,8 @@ type тĕс # MW specials -REDIRECT куçару +#REDIRECT куçару +REDIRECT перенаправление # Namespaces User Хутшăнакан Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 12:41:34 UTC (rev 82) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-29 13:25:11 UTC (rev 83) @@ -463,55 +463,40 @@ say $page_counter; } - last PAGE - if ($option{stop_after} - and $page_counter > $option{stop_after}); - - next PAGE - if ($page_counter < $option{start_from}); - - my $page_namespace = namespace($page_ref); - my $page_title = $page_ref->title(); - my $page_text_ref = $page_ref->text(); - - if (not defined $page_text_ref) { - WARN('ref to page text undefined, skipping'); - $statistics{'ref to page text undefined'}++; - next PAGE; + if ( $option{stop_after} + and $page_counter > $option{stop_after}) + { + last PAGE; } - if (not defined ${$page_text_ref}) { - WARN('page text undefined, skipping'); - $statistics{'page text undefined'}++; - next PAGE; - } + my $page_text_ref = $page_ref->text(); - if (${$page_text_ref} eq q()) { - WARN('page text empty, skipping'); - $statistics{'page text empty'}++; + if ( + $page_counter < $option{start_from} + or not defined ${$page_text_ref} # must be tested before redirect + or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) + or is_redirect($page_ref) + ) + { next PAGE; } - if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) { - next PAGE; - } + my $page_namespace = namespace($page_ref); + $namespace_count{$page_namespace}++; - if (my $redirect_type = is_redirect($page_ref)) { - $statistics{"redirect - $redirect_type"}++; - DEBUG("\n$page_title - $redirect_type redirect"); - next PAGE; - } + my $page_title = $page_ref->title(); + INFO("\n* processing $page_counter - ", $page_title); + # TODO: Be more precise here. # Portal pages which have a '/' in their name are probably # internal and do not need interwiki links. if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + INFO('internal portal, skipping'); + $statistics{'internal portal'}++; next PAGE; } - $namespace_count{$page_namespace}++; - INFO("\n* processing $page_counter - ", $page_title); - # A simple sanity check: is the no_iw template anywhere around here? my $has_template_no_iw = (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-27 12:41:46
|
Revision: 82 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=82&view=rev Author: amire80 Date: 2009-09-27 12:41:34 +0000 (Sun, 27 Sep 2009) Log Message: ----------- Refactoring, improving performance. Gmar khatima tova. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 00:44:08 UTC (rev 81) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 12:41:34 UTC (rev 82) @@ -30,7 +30,7 @@ # pragmata use strict; use warnings; -use integer; +use integer; # Improves performance use open ':utf8'; use utf8; use charnames ':full'; @@ -54,7 +54,7 @@ use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.92; # Earlier versions have a different API +use Parse::MediaWikiDump 0.91; # Earlier versions have a different API our $VERSION = '0.2.1'; #<<< no perltidy @@ -366,11 +366,10 @@ } } -my $pmwd = Parse::MediaWikiDump->new(); -my $dump = $pmwd->revisions($dump_fn); +my $the_dump = load_dump(); my $namespaces_alt = join $ALT_SEP, - grep { length > 0 } @{ $dump->namespaces_names() }; + grep { length > 0 } @{ $the_dump->namespaces_names() }; my $PURE_TITLE_RE = qr{ \A (?: @@ -383,7 +382,7 @@ # This is the only currently known value # but there could be more in the future -if ($dump->case() ne 'first-letter') { +if ($the_dump->case() ne 'first-letter') { croak(q{unable to handle any case setting besides 'first-letter'}); } @@ -420,12 +419,35 @@ exit; +sub load_dump { + Readonly my $WORKING_PMWD_VER => 0.91; + Readonly my $FUTURE_PMWD_VER => 0.94; + + my $dump; + + if ($Parse::MediaWikiDump::VERSION == $WORKING_PMWD_VER) { + $dump = Parse::MediaWikiDump::Pages->new($dump_fn); + } + else { + if ($Parse::MediaWikiDump::VERSION < $FUTURE_PMWD_VER) { + carp( 'You are running Parse::MediaWikiDump version ' + . $Parse::MediaWikiDump::VERSION + . ".\n Redirect handling may be broken\n"); + } + my $pmwd = Parse::MediaWikiDump->new(); + $dump = $pmwd->revisions($dump_fn); + } + + return $dump; +} + sub namespace { my ($page) = @_; return $page->namespace() || get_string('article space'); } sub next_page { + my ($dump) = @_; my $page_ref = eval { $dump->next(); }; if ($EVAL_ERROR) { confess("Failed reading a page: $EVAL_ERROR"); @@ -435,7 +457,7 @@ sub find_iwless { PAGE: - while (my $page_ref = next_page()) { + while (my $page_ref = next_page($the_dump)) { $page_counter++; if ($page_counter % $option{page_freq} == 0) { say $page_counter; @@ -1390,12 +1412,14 @@ foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) { my $iwless_in_namespace = $statistics{'has no interwiki link'}->{$namespace}; + no integer; ## no critic (ProhibitMagicNumbers) my $percentage = sprintf '%.2f', 100 * $iwless_in_namespace / $namespace_count{$namespace}; ## use critic (ValuesAndExpressions::ProhibitMagicNumbers) use integer; + INFO("$namespace: $iwless_in_namespace, $percentage%"); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-27 00:44:16
|
Revision: 81 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=81&view=rev Author: amire80 Date: 2009-09-27 00:44:08 +0000 (Sun, 27 Sep 2009) Log Message: ----------- Multilinks cleanup. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 00:44:08 UTC (rev 81) @@ -409,10 +409,10 @@ my $begin_multi_links_time = time; -say 'listing multi links by language'; +say "\nlisting multi links by language"; print_multi_links_by_foreign(); -say 'listing multi links by local articles'; +say "\nlisting multi links by local articles"; print_multi_links_by_local(); my $total_multi_links_time = time - $begin_multi_links_time; @@ -1252,12 +1252,16 @@ return $string; } +sub format_link_table { ## no critic (RequireArgUnpacking) + return sprintf '%-15s %8d', @_; +} + sub print_multi_links_by_foreign { LANG_CODE: foreach my $lang_code (sort keys %found_links) { my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; - say "$lang_code ", scalar @foreign_articles; + say format_link_table($lang_code, scalar @foreign_articles); FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { @@ -1284,7 +1288,7 @@ LANG_CODE: foreach my $lang_code (sort keys %found_links) { my @foreign_articles = sort keys %{ $found_links{$lang_code} }; - say "$lang_code ", scalar @foreign_articles; + say format_link_table($lang_code, scalar @foreign_articles); FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-26 23:43:19
|
Revision: 80 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=80&view=rev Author: amire80 Date: 2009-09-26 23:43:11 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Copyright, MediaWikiDump 0.92 requirement, version number cleanup, RTL cleanup, logging cleanup, POD cleanup, get_string cleanup, simple dump ratio, P::C cleanup, adding pod spelling. Modified Paths: -------------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/stopwords.txt Modified: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/.perlcriticrc 2009-09-26 23:43:11 UTC (rev 80) @@ -2,6 +2,10 @@ [Variables::ProhibitPunctuationVars] allow = %+ $+ @+ -# This is probably useful, but appears buggy, so it should remain manual -[-Documentation::PodSpelling] +[Documentation::PodSpelling] +stop_words_file = stopwords.txt +# say is for writing to the terminal, no need to check it +[InputOutput::RequireCheckedSyscalls] +exclude_functions = say + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80) @@ -1,7 +1,7 @@ #!/usr/bin/perl # prepare_noiw_list.pl -# version 0.2 Noa - development +# version 0.2.1 Noa - development # See the POD documentation at the end of the file or run # perldoc prepare_noiw_list.pl @@ -9,9 +9,21 @@ # "Had he been a French child, # he would have heard an infinite number of sentences" - Otto Jespersen -# This program is Free Software; you can redistribute it and/or modify it -# under the same terms as Perl itself. +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 +# as published by the Free Software Foundation or under the terms of +# Artistic License version 2.0. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the licenses +# along with this program. If not, see +# * <http://www.gnu.org/licenses/gpl-3.0.html>. +# * <http://www.perlfoundation.org/artistic_license_2_0> + # Upgrade! This program actually uses new Perl 5.10 constructs, so you need it use 5.010; @@ -38,11 +50,13 @@ # I only use Readonly where P::C complains about it. Using it with regular # expressions which have non-ASCII chars produces heisenbugs. use Readonly; +use Pod::Usage; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces +use Parse::MediaWikiDump 0.92; # Earlier versions have a different API +our $VERSION = '0.2.1'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -50,8 +64,9 @@ Date => '$Date$', ); #>>> -our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) - ? "0.1.9.$+{revision_num}" +our $FULL_VERSION = + ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) + ? "$VERSION.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); my $WIKITEXT_EXT = 'wiki.txt'; @@ -62,6 +77,7 @@ my $FIELD_SEP = qq{\t}; my $LINK_SEP = q{|}; my $TYPE_SEP = qr/\s*;\s*/xms; +my $HTML_RLM = '‏'; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -69,11 +85,12 @@ Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20; # Initialize logger +my $LOG_FN = 'outtest.log'; Log::Log4perl->easy_init( { - level => $INFO, # print everything - file => ':utf8>outtest.log', # utf is important - layout => '%m%n', # No need to print the date + level => $INFO, # print everything + file => ":utf8>$LOG_FN", # utf is important + layout => '%m%n', # No need to print the date } ); @@ -85,6 +102,9 @@ max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION, max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE, max_iw_places => $DEFAULT_MAX_IW_PLACES, + help => 0, + usage => 0, + version => 0, ); my $valid_options = GetOptions( @@ -95,12 +115,42 @@ 'max_links_per_section' => \$option{max_links_per_section}, 'max_sections_per_page' => \$option{max_sections_per_page}, 'max_iw_places' => \$option{max_iw_places}, + 'help' => \$option{help}, + 'usage' => \$option{usage}, + 'version' => \$option{version}, ); if (not $valid_options) { croak('Invalid command line options.'); } +if ($option{'version'}) { + my $program_name = __FILE__; + say <<"END_VERSION"; +$program_name version $FULL_VERSION +This program searches for pages without interlanguage links (interwiki). + +Copyright 2005-2009 Guy Shaked, Nadav Perez and Amir E. Aharoni. + +This program is Free Software; you can redistribute it and/or modify it +under the terms of Artistic License version 2.0 or the +GNU General Public license version 3.0. + +Run `perldoc $program_name' for full terms. +END_VERSION + exit; +} + +if ($option{'help'} or $option{'usage'}) { + my $verbose_level = $option{'usage'} ? 1 : 2; + pod2usage( + { + -exitval => 0, + -verbose => $verbose_level, + } + ); +} + # XXX Too coupled to Wikipedia, won't work for other projects. my $DUMP_FN_RE = qr{ \A # Begin string @@ -168,14 +218,14 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize +my $string_in = get_string('in'); my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : (?<min>\d{2}),\s # minute (?<mday>\d{1,2})\s # day of month - $STRING{in}? # This preposition appears sometimes - # It should have been get_string() + $string_in? # This preposition appears sometimes (?<mon>$ALT_MONTHS)\s # A name of a month (?<year>\d+?)\s # Year \([A-Z]{3}\) # Three letters in brackets - timezone @@ -193,9 +243,9 @@ (?<value>.*) # value }xms; -# XXX It should use get_string() +my $string_no_iw = get_string('no_iw'); my $SIMPLE_NO_IW_CHECK_RE = qr{ - \Q$STRING{no_iw}\E # The string may have spaces + \Q$string_no_iw\E # The string may have spaces }xmsi; # A simplistic template just for testing. @@ -218,21 +268,21 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. +my $local_redirect = get_string('REDIRECT'); my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character - $STRING{REDIRECT} # Redirect keyword in local language - # XXX It should use get_string() + $local_redirect # Redirect keyword in local language \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation my $SECTION_LINK_RE = qr{(?<!&)\#}xms; my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; -## no critic (ProhibitEscapedMetacharacters) +## no critic (RegularExpressions::ProhibitEscapedMetacharacters) my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; -## use critic (ProhibitEscapedMetacharacters) +## use critic (RegularExpressions::ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); @@ -357,15 +407,16 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; +say 'listing multi links by language'; print_multi_links_by_foreign(); +say 'listing multi links by local articles'; print_multi_links_by_local(); my $total_multi_links_time = time - $begin_multi_links_time; -say "total multi links time: $total_multi_links_time"; +INFO("total multi links time: $total_multi_links_time"); exit; @@ -784,8 +835,7 @@ { my $include_lowercase_link = 1; - # XXX get_string() cannot be used here - if (defined $STRING{exclude_lowercase} + if (defined get_string('exclude_lowercase', 'if defined') and $foreign_article =~ $EXCLUDE_LOWERCASE_RE) { $include_lowercase_link = 0; @@ -807,15 +857,7 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO( - "iw link count for $page_title" - . ( - $option{'rlm'} - ? "\N{LEFT-TO-RIGHT MARK}" - : q() - ) - . " is: $count_iw" - ); + INFO("iw link count for $page_title is: $count_iw"); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -835,14 +877,19 @@ sub special_cases_file { my ($special_case_name, $page, $special_cases_ref) = @_; $special_cases_ref //= {}; # / - my $special_case_langs = join q{, }, sort keys %{$special_cases_ref}; + + my $special_case_langs = join q{ }, sort keys %{$special_cases_ref}; + if ($special_case_langs) { $special_case_langs = " ($special_case_langs)"; } + my $special_case_fn = make_type_fn($special_case_name, 1); + if (not -e $special_case_fn) { append_to_file($special_case_fn, $special_case_name); } + my $page_title = $page->title(); my $link = make_link($page_title); my $line = @@ -852,6 +899,7 @@ . get_sort_title($page_title); append_to_file($special_case_fn, $line); + return; } @@ -954,12 +1002,8 @@ . $page_title . $MW_SYNTAX{end_link}; - if ($option{rtl}) { - if ($page_title =~ $LTR_CHAR_RE) { - - # XXX get_string() cannot be used here - $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm}; - } + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; } return $link_to_page; @@ -968,7 +1012,7 @@ sub create_no_iw_pages { my ($params) = @_; - INFO('creating no_iw pages'); + INFO("\ncreating no_iw pages"); # Run over page types UNSORTED_TYPE_FN: @@ -1082,9 +1126,10 @@ return "'''$text'''"; } -# Custom Unicode character property for finding characters. -# The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) +# Custom Unicode character property, which is like \w, but for Hebrew. +# The custom is to give custom Unicode character classes CamelCase names. +# P::C policy ProhibitMixedCaseSubs is deprecated. +sub IsRightToLeft { ## no critic (Capitalization) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1158,8 +1203,10 @@ } sub get_string { - my ($english) = @_; - return $STRING{$english} //= $english; # / + my ($english, $if_defined) = @_; + return $if_defined + ? ($STRING{$english}) + : ($STRING{$english} //= $english); # / } sub make_type_fn { @@ -1210,6 +1257,8 @@ foreach my $lang_code (sort keys %found_links) { my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1235,6 +1284,8 @@ LANG_CODE: foreach my $lang_code (sort keys %found_links) { my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1254,10 +1305,12 @@ } } + say 'writing local multilinks file'; my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT"; foreach my $local_multi_article (sort keys %local_multi_links) { append_to_file($filename, '* ' . mw_bold(make_link($local_multi_article))); + foreach my $other_local_article ( sort keys %{ $local_multi_links{$local_multi_article} }) { @@ -1304,7 +1357,7 @@ sub print_stats { INFO("\nSUMMARY"); - say "total time: $total_time"; + INFO("total time: $total_time"); foreach my $stat_type (sort keys %statistics) { if (not ref $statistics{$stat_type}) { INFO("$stat_type: $statistics{$stat_type}"); @@ -1346,11 +1399,24 @@ foreach my $namespace (sort keys %namespace_count) { INFO("$namespace: $namespace_count{$namespace}"); } + INFO("\nTYPES"); foreach my $type (sort keys %type_count) { INFO("$type: $type_count{$type}"); } + my $dump_size = -s $dump_fn; + my $log_size = -s $LOG_FN; + INFO("dump size: $dump_fn"); + INFO("log size: $LOG_FN"); + if ($log_size > 0) { + my $dump_log_ratio = $dump_size / $log_size; + INFO("dump/log ratio: $dump_log_ratio"); + } + else { + WARN("weird: log file size $LOG_FN is 0"); + } + return; } @@ -1413,6 +1479,12 @@ =item * --max_iw_places Number of places to print in the statistics of pages with the most interlanguage links. +=item * --version Print the version number and exit. + +=item * --usage Print basic usage and exit. + +=item * --help Print full help and exit. + =back =head1 DESCRIPTION @@ -1545,6 +1617,10 @@ To make Perl::Critic happy :) +=item * C<Pod::Usage> + +Some style guide recommended it. I don't even remember which one, but i love style guides. + =back =head1 HACKING @@ -1564,7 +1640,7 @@ for automatic code formatting. If you modify it, do yourself a favor, install Perl::Critic and regularly test it using this command: -./tidy.sh +./tidy.sh prepare_noiw_list.pl It checks the syntax, runs perltidy on the code and runs Perl::Critic. @@ -1710,9 +1786,29 @@ =head1 LICENSE AND COPYRIGHT -This program is Free Software; you can redistribute it and/or modify it -under the same terms as Perl itself. +Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License version 3 +as published by the Free Software Foundation or under the terms of +Artistic License version 2.0. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the licenses +along with this program. If not, see + +=over + +=item * L<http://www.gnu.org/licenses/gpl-3.0.html>. + +=item * L<http://www.perlfoundation.org/artistic_license_2_0> + +=back + I<Visca la llibertat!> =cut Added: trunk/no-interwiki/stopwords.txt =================================================================== --- trunk/no-interwiki/stopwords.txt (rev 0) +++ trunk/no-interwiki/stopwords.txt 2009-09-26 23:43:11 UTC (rev 80) @@ -0,0 +1,40 @@ +ActivePerl +Aharoni +Aharoni's +Amir +Amire +backporting +CPAN +Cygwin +Drora +Felagund +Felagund's +FILENAME +filenames +filesystem +hoc +interlanguage +interwiki +Itay +llibertat +MediaWiki +multi +Nadav +namespace +Noa +outtest +param +perl +perltidy +Readonly +refactoring +Reut +rtl +Shaked +tec +Visca +wiki +Wikipedia +Wikipedias +xNUMBER +XP This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-26 10:51:46
|
Revision: 79 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=79&view=rev Author: amire80 Date: 2009-09-26 10:51:33 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Better diff. Modified Paths: -------------- trunk/no-interwiki/tidy.sh Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2009-09-26 10:42:51 UTC (rev 78) +++ trunk/no-interwiki/tidy.sh 2009-09-26 10:51:33 UTC (rev 79) @@ -1,26 +1,33 @@ #!/bin/bash -FN=$1 +FILENAME=$1 echo checking syntax -perl -c $FN +perl -c $FILENAME if [ $? -ne 0 ]; then exit 1 fi echo tidying -perltidy $FN +perltidy $FILENAME if [ $? -ne 0 ]; then exit 1 fi -diff $FN.bak ${FN} -if [ $? -eq 2 ]; then +diff $FILENAME.bak $FILENAME +diff_exit_code=$? + +case $diff_exit_code in +0) + rm $FILENAME.bak +;; +2) + echo trouble in diff after perltidy, exiting. exit 1 -fi +;; +esac echo criticizing -perlcritic -brutal $FN +perlcritic -brutal $FILENAME exit $? - This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-26 10:43:01
|
Revision: 78 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=78&view=rev Author: amire80 Date: 2009-09-26 10:42:51 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Less logging, redirects are supposed to be stable. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:41:09 UTC (rev 77) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:42:51 UTC (rev 78) @@ -71,7 +71,7 @@ # Initialize logger Log::Log4perl->easy_init( { - level => $DEBUG, # print everything + level => $INFO, # print everything file => ':utf8>outtest.log', # utf is important layout => '%m%n', # No need to print the date } @@ -425,7 +425,7 @@ if (my $redirect_type = is_redirect($page_ref)) { $statistics{"redirect - $redirect_type"}++; - INFO("\n$page_title - $redirect_type redirect"); + DEBUG("\n$page_title - $redirect_type redirect"); next PAGE; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-26 10:41:19
|
Revision: 77 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=77&view=rev Author: amire80 Date: 2009-09-26 10:41:09 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Better diff handling. Modified Paths: -------------- trunk/no-interwiki/.perltidyrc Modified: trunk/no-interwiki/.perltidyrc =================================================================== --- trunk/no-interwiki/.perltidyrc 2009-09-25 10:43:03 UTC (rev 76) +++ trunk/no-interwiki/.perltidyrc 2009-09-26 10:41:09 UTC (rev 77) @@ -1,3 +1,4 @@ +--backup-and-modify-in-place --maximum-line-length=78 --continuation-indentation=4 --nooutdent-long-lines This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-25 10:43:13
|
Revision: 76 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=76&view=rev Author: amire80 Date: 2009-09-25 10:43:03 +0000 (Fri, 25 Sep 2009) Log Message: ----------- Adding el, fixing tidy. Modified Paths: -------------- trunk/no-interwiki/.perltidyrc Added Paths: ----------- trunk/no-interwiki/el.language_codes.txt trunk/no-interwiki/el.strings.txt Modified: trunk/no-interwiki/.perltidyrc =================================================================== --- trunk/no-interwiki/.perltidyrc 2009-09-24 22:40:56 UTC (rev 75) +++ trunk/no-interwiki/.perltidyrc 2009-09-25 10:43:03 UTC (rev 76) @@ -1,4 +1,3 @@ ---backup-and-modify-in-place --maximum-line-length=78 --continuation-indentation=4 --nooutdent-long-lines Added: trunk/no-interwiki/el.language_codes.txt =================================================================== --- trunk/no-interwiki/el.language_codes.txt (rev 0) +++ trunk/no-interwiki/el.language_codes.txt 2009-09-25 10:43:03 UTC (rev 76) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/el.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/el.strings.txt =================================================================== --- trunk/no-interwiki/el.strings.txt (rev 0) +++ trunk/no-interwiki/el.strings.txt 2009-09-25 10:43:03 UTC (rev 76) @@ -0,0 +1,36 @@ +# months +January Januaro +February Februaro +March Marto +April Aprilo +May Majo +June Junio +July Julio +August Auxgusto +September Septembro +October Oktobro +November Novembro +December Decembro + +in + +no_iw iw +disambig Αποσαφήνιση +template Πρότυπο + +date dato +type tipo + +# MW specials +REDIRECT REDIRECT + +# Namespaces +User Χρήστης +User talk Συζήτηση χρήστη +Image Αρχείο +Portal Πύλη +Category Κατηγορία +article space (Κύριος χώρος) + +# Other +other other This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-24 22:41:06
|
Revision: 75 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=75&view=rev Author: amire80 Date: 2009-09-24 22:40:56 +0000 (Thu, 24 Sep 2009) Log Message: ----------- Removed a little too much. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:40:56 UTC (rev 75) @@ -357,6 +357,7 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); +say 'looking for multi links'; my $begin_multi_links_time = time; print_multi_links_by_foreign(); @@ -1540,6 +1541,10 @@ This module is used for transliterating filenames to ASCII. +=item * C<Readonly> + +To make Perl::Critic happy :) + =back =head1 HACKING This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-24 22:34:08
|
Revision: 74 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=74&view=rev Author: amire80 Date: 2009-09-24 22:33:58 +0000 (Thu, 24 Sep 2009) Log Message: ----------- Fixing a lot of bitrot. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/tidy.sh Added Paths: ----------- trunk/no-interwiki/cv.language_codes.txt trunk/no-interwiki/cv.strings.txt Added: trunk/no-interwiki/cv.language_codes.txt =================================================================== --- trunk/no-interwiki/cv.language_codes.txt (rev 0) +++ trunk/no-interwiki/cv.language_codes.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/cv.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/cv.strings.txt =================================================================== --- trunk/no-interwiki/cv.strings.txt (rev 0) +++ trunk/no-interwiki/cv.strings.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1,40 @@ +# months +January кăрлач +February нарăс +March пуш +April ака +May çу +June çĕртме +July утă +August çурла +September авăн +October юпа +November чӳк +December раштав + +in + +no_iw Интервики лартман +category Категори +disambig тĕрлĕ пĕлтерĕш +template шаблон + +date дата +type тĕс + +# MW specials +REDIRECT куçару + +# Namespaces +User Хутшăнакан +User talk Хутшăнаканăн канашлу страници +Image Ӳкерчĕк +Portal Portal +Category Категори +article space article space + +# Other +other Ытти +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74) @@ -21,6 +21,7 @@ use integer; use open ':utf8'; use utf8; +use charnames ':full'; # Standard library # These modules should come installed with Perl @@ -34,17 +35,14 @@ # CPAN # You must install these modules from CPAN -# Versions before 0.51 have a bug with parsing the namespace of a page +# I only use Readonly where P::C complains about it. Using it with regular +# expressions which have non-ASCII chars produces heisenbugs. use Readonly; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces -# XXX -use Devel::Leak; -use Devel::Size qw(size total_size); - #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -56,14 +54,14 @@ ? "0.1.9.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); -Readonly my $WIKITEXT_EXT => 'wiki.txt'; -Readonly my $OUT_DIR => 'out'; -Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted"; -Readonly my $MULTI_DIR => "$OUT_DIR/multilinks"; -Readonly my $ALT_SEP => q{|}; -Readonly my $FIELD_SEP => qq{\t}; -Readonly my $LINK_SEP => q{|}; -Readonly my $TYPE_SEP => qr/\s*;\s*/xms; +my $WIKITEXT_EXT = 'wiki.txt'; +my $OUT_DIR = 'out'; +my $UNSORTED_DIR = "$OUT_DIR/unsorted"; +my $MULTI_DIR = "$OUT_DIR/multilinks"; +my $ALT_SEP = q{|}; +my $FIELD_SEP = qq{\t}; +my $LINK_SEP = q{|}; +my $TYPE_SEP = qr/\s*;\s*/xms; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -104,15 +102,15 @@ } # XXX Too coupled to Wikipedia, won't work for other projects. -Readonly my $DUMP_FN_RE => qr{ +my $DUMP_FN_RE = qr{ \A # Begin string (?<wiki_lang>\w+) # Lang code wiki # Is supposed to be after the lang code }xms; -Readonly my $FIELD_SEP_RE => qr{\t}xms; +my $FIELD_SEP_RE = qr{\t}xms; -Readonly my $STRING_SKIP_RE => qr{ +my $STRING_SKIP_RE = qr{ \A # Begin string \s* # Zero or more spaces (?:\#.*)? # Comment lines @@ -136,7 +134,7 @@ # ISO 9 is mostly good for Russian and it is still not perfect ASCII my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); -Readonly my %MW_SYNTAX => ( +my %MW_SYNTAX = ( 'start_template' => '{{', 'end_template' => '}}', 'start_link' => '[[', @@ -148,15 +146,15 @@ ); # This monstrosity basically says: | and optional spaces -Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; +my $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; -Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal'); +my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal'); # # Constants for date processing # -Readonly my @MONTHS => @STRING{ +my @MONTHS = @STRING{ qw( January February March April May June @@ -170,7 +168,7 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize -Readonly my $HEB_DATE_RE => qr{ +my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : @@ -184,7 +182,7 @@ \z # end }xms; -Readonly my $PARAM_RE => qr{ +my $PARAM_RE = qr{ \A # Beginning of a string \s* # Zero or more space (?: # No capture @@ -196,14 +194,14 @@ }xms; # XXX It should use get_string() -Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{ +my $SIMPLE_NO_IW_CHECK_RE = qr{ \Q$STRING{no_iw}\E # The string may have spaces }xmsi; # A simplistic template just for testing. # Quite possibly it is not needed anymore. # Until i get a better regex for matching balancing {{}} ... -Readonly my $TEMPLATE_RE => qr{ +my $TEMPLATE_RE = qr{ \A # beginning of string \Q$MW_SYNTAX{start_template}\E # {{ .+ # some chars @@ -211,7 +209,7 @@ \z # end of string }xms; -Readonly my $WIKITABLE_RE => qr{ +my $WIKITABLE_RE = qr{ \A \Q$MW_SYNTAX{start_wikitable}\E }xms; @@ -220,7 +218,7 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. -Readonly my $LOCAL_REDIRECT_RE => qr{ +my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character $STRING{REDIRECT} # Redirect keyword in local language @@ -228,60 +226,62 @@ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms; -Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms; -Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms; +my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $SECTION_LINK_RE = qr{(?<!&)\#}xms; +my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; ## no critic (ProhibitEscapedMetacharacters) -Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; +my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; ## use critic (ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); -Readonly my $EXCLUDE_LOWERCASE_RE => qr{ +my $EXCLUDE_LOWERCASE_RE = qr{ \A # Beginning of foreign article name [$string_exclude_lowercase] # Character class of possibly lowercase chars }xms; -Readonly my $NUMBERED_FILE_RE => qr{ +my $NUMBERED_FILE_RE = qr{ (?: _ \d*)? \.$WIKITEXT_EXT }xms; -Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # " +my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # " -my $two_digit_charnumber_re = qr{ +my $TWO_DIGIT_CHARNUMBER_RE = qr{ (?: [%.] # There are both %C4%B0 and .AA.E0 [[:xdigit:]]{2} # 2 hex digits ) }xms; -my $html_charnumber_re = qr{ +my $HTML_CHARNUMBER_RE = qr{ (?: &\#\d+; # stuff like Š ) }xms; # TODO: Check whether it is Neapolitan with its '' -Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{ - $two_digit_charnumber_re | $html_charnumber_re +my $CHARACTER_CODE_IN_LINK_RE = qr{ + $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE }xms; my %LANG_CODE; -Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt"; -open my $lang_code_file, '<', $LANG_CODE_FN +my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt"; +open my $lang_code_file, '<', $LANG_CODE_FN ## no critic (RequireBriefOpen) or croak(file_error('opening', $LANG_CODE_FN, 'reading')); while (my $line = <$lang_code_file>) { chomp $line; my ($code, $name) = split /\t/xms, $line; - $LANG_CODE{$code} = $name // $code; # / + if (defined $code) { + $LANG_CODE{$code} = $name // $code; # / + } } close $lang_code_file or croak(file_error('closing', $LANG_CODE_FN, 'reading')); -Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE; +my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE; -Readonly my $INTERWIKI_LINK_RE => qr{ +my $INTERWIKI_LINK_RE = qr{ \Q$MW_SYNTAX{start_link}\E (?<lang_code> $ALT_LANGS @@ -294,7 +294,7 @@ }xms; # Lojban allows lowercase articles -Readonly my @LOWERCASE_LANGS => qw(jbo); +my @LOWERCASE_LANGS = qw(jbo); Readonly my $COOLING_DAYS => 120; Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60; @@ -321,7 +321,7 @@ my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @{ $dump->namespaces_names() }; -Readonly my $PURE_TITLE_RE => qr{ +my $PURE_TITLE_RE = qr{ \A (?: (?:$namespaces_alt) @@ -357,7 +357,6 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; print_multi_links_by_foreign(); @@ -383,18 +382,11 @@ } sub find_iwless { - my $leak_handle; - my $leak_count; - PAGE: while (my $page_ref = next_page()) { $page_counter++; if ($page_counter % $option{page_freq} == 0) { say $page_counter; - # my $lead_count = Devel::Leak::NoteSV($leak_handle); - # say "leak count: $lead_count"; - # say 'Devel::Size size of buck dump: ', total_size($dump); - # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref); } last PAGE @@ -406,27 +398,49 @@ my $page_namespace = namespace($page_ref); my $page_title = $page_ref->title(); + my $page_text_ref = $page_ref->text(); - # Skipping cases: - next PAGE - if ( - not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) - or is_redirect($page_ref) + if (not defined $page_text_ref) { + WARN('ref to page text undefined, skipping'); + $statistics{'ref to page text undefined'}++; + next PAGE; + } - # TODO: Be more precise here. - # Portal pages which have a '/' in their name are probably - # internal and do not need interwiki links. - or (is_in_namespace($page_ref, 'Portal') - and $page_title =~ m{/}xms) - ); + if (not defined ${$page_text_ref}) { + WARN('page text undefined, skipping'); + $statistics{'page text undefined'}++; + next PAGE; + } + if (${$page_text_ref} eq q()) { + WARN('page text empty, skipping'); + $statistics{'page text empty'}++; + next PAGE; + } + + if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) { + next PAGE; + } + + if (my $redirect_type = is_redirect($page_ref)) { + $statistics{"redirect - $redirect_type"}++; + INFO("\n$page_title - $redirect_type redirect"); + next PAGE; + } + + # TODO: Be more precise here. + # Portal pages which have a '/' in their name are probably + # internal and do not need interwiki links. + if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + next PAGE; + } + $namespace_count{$page_namespace}++; INFO("\n* processing $page_counter - ", $page_title); - my $page_text_ref = $page_ref->text(); - # A simple sanity check: is the no_iw template anywhere around here? - my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE); + my $has_template_no_iw = + (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English @@ -792,7 +806,15 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO("iw link count for $page_title: $count_iw"); + INFO( + "iw link count for $page_title" + . ( + $option{'rlm'} + ? "\N{LEFT-TO-RIGHT MARK}" + : q() + ) + . " is: $count_iw" + ); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -1061,7 +1083,7 @@ # Custom Unicode character property for finding characters. # The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (Capitalization) +sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1070,19 +1092,19 @@ } sub is_redirect { - my ($page) = @_; - my $page_title = $page->title(); - my $page_text_ref = $page->text(); + my ($page) = @_; if ($page->redirect()) { - INFO("\nEnglish redirect: $page_title"); - return 1; + return 'English'; } + + my $page_text_ref = $page->text(); + if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { - INFO("\nLocal redirect: $page_title"); - return 1; + return 'local'; } - return 0; + + return q(); } sub is_in_namespace { @@ -1290,13 +1312,20 @@ my $max_iw_index = $#{ $statistics{count_iw} }; INFO("max_iw_index: $max_iw_index"); + MAX_IW: for my $max_iw_place (0 .. $option{max_iw_places}) { my @links = - map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }; + map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] } + or last MAX_IW; INFO("# $max_iw_index: " . join_links(\@links, 0)); - # Do nothing, just count down to the next index with a defined list - while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { } + # Do nothing, just count down to the next index with a defined list. + # $max_iw_index needs to be checked for nonzero-ness for the rare case + # of a very low page count. + while ($max_iw_index + and not defined $statistics{count_iw}->[ --$max_iw_index ]) + { + } } INFO('pages without interwiki links per namespace'); @@ -1337,7 +1366,7 @@ prepare_noiw_list.pl -version 0.2 - Noa. +version 0.2.1 - Noa. =head1 USAGE @@ -1511,10 +1540,6 @@ This module is used for transliterating filenames to ASCII. -=item * C<Readonly> - -To make Perl::Critic happy :) - =back =head1 HACKING Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/tidy.sh 2009-09-24 22:33:58 UTC (rev 74) @@ -1,6 +1,6 @@ #!/bin/bash -FN=prepare_noiw_list.pl +FN=$1 echo checking syntax perl -c $FN This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-08 19:20:25
|
Revision: 73 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=73&view=rev Author: amire80 Date: 2009-09-08 19:20:10 +0000 (Tue, 08 Sep 2009) Log Message: ----------- cleanup before upgrading to Parse::MediaWikiDump 0.92 Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 23:07:37 UTC (rev 72) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73) @@ -12,7 +12,7 @@ # This program is Free Software; you can redistribute it and/or modify it # under the same terms as Perl itself. -# Upgrade! This script actually uses new Perl 5.10 constructs, so you need it +# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it use 5.010; # pragmata @@ -27,7 +27,6 @@ use English qw(-no_match_vars); use Carp qw(croak cluck); use Time::Local; -use List::Util qw(first); use Getopt::Long; use Data::Dumper; use File::Basename; @@ -36,14 +35,18 @@ # You must install these modules from CPAN # Versions before 0.51 have a bug with parsing the namespace of a page -use Parse::MediaWikiDump 0.51; +use Readonly; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Readonly; +use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces +# XXX +use Devel::Leak; +use Devel::Size qw(size total_size); + #<<< no perltidy -my %SVN_PROPS = ( ## no critic ValuesAndExpressions::RequireInterpolationOfMetachars +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', HeadURL => '$HeadURL$', Date => '$Date$', @@ -53,7 +56,6 @@ ? "0.1.9.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); -my %PATTERN; Readonly my $WIKITEXT_EXT => 'wiki.txt'; Readonly my $OUT_DIR => 'out'; Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted"; @@ -102,15 +104,15 @@ } # XXX Too coupled to Wikipedia, won't work for other projects. -$PATTERN{dump_fn} = qr{ +Readonly my $DUMP_FN_RE => qr{ \A # Begin string (?<wiki_lang>\w+) # Lang code wiki # Is supposed to be after the lang code }xms; -$PATTERN{field_sep} = qr{\t}xms; +Readonly my $FIELD_SEP_RE => qr{\t}xms; -$PATTERN{string_skip} = qr{ +Readonly my $STRING_SKIP_RE => qr{ \A # Begin string \s* # Zero or more spaces (?:\#.*)? # Comment lines @@ -122,9 +124,9 @@ or croak('Dump filename must be supplied as an argument.'); my %STRING; my $WIKI_LANG; -if ((basename $dump_fn) =~ $PATTERN{dump_fn}) { +if ((basename $dump_fn) =~ $DUMP_FN_RE) { $WIKI_LANG = $+{wiki_lang}; - %STRING = get_strings($WIKI_LANG); + get_strings($WIKI_LANG); } else { croak("$dump_fn is a weird dump file name."); @@ -135,8 +137,8 @@ my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); Readonly my %MW_SYNTAX => ( - 'start_tmpl' => '{{', - 'end_tmpl' => '}}', + 'start_template' => '{{', + 'end_template' => '}}', 'start_link' => '[[', 'end_link' => ']]', 'param_sep' => q{|}, @@ -146,7 +148,7 @@ ); # This monstrosity basically says: | and optional spaces -$PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; +Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal'); @@ -168,7 +170,7 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize -$PATTERN{heb_date} = qr{ +Readonly my $HEB_DATE_RE => qr{ \A # begin (?<hour>\d{1,2}) # hour : # : @@ -182,7 +184,7 @@ \z # end }xms; -$PATTERN{param} = qr{ +Readonly my $PARAM_RE => qr{ \A # Beginning of a string \s* # Zero or more space (?: # No capture @@ -194,22 +196,22 @@ }xms; # XXX It should use get_string() -$PATTERN{simple_no_iw_check} = qr{ +Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{ \Q$STRING{no_iw}\E # The string may have spaces }xmsi; # A simplistic template just for testing. # Quite possibly it is not needed anymore. # Until i get a better regex for matching balancing {{}} ... -$PATTERN{template} = qr{ +Readonly my $TEMPLATE_RE => qr{ \A # beginning of string - \Q$MW_SYNTAX{start_tmpl}\E # {{ + \Q$MW_SYNTAX{start_template}\E # {{ .+ # some chars - \Q$MW_SYNTAX{end_tmpl}\E # }} + \Q$MW_SYNTAX{end_template}\E # }} \z # end of string }xms; -$PATTERN{wikitable} = qr{ +Readonly my $WIKITABLE_RE => qr{ \A \Q$MW_SYNTAX{start_wikitable}\E }xms; @@ -218,7 +220,7 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. -$PATTERN{local_redirect} = qr{ +Readonly my $LOCAL_REDIRECT_RE => qr{ \A # Beginning of string (page) \# # a # character $STRING{REDIRECT} # Redirect keyword in local language @@ -226,33 +228,44 @@ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -$PATTERN{ltr_char} = qr/\P{IsLeftToRight}/xms; -$PATTERN{true_template} = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; -$PATTERN{section_link} = qr{(?<!&)\#}xms; -$PATTERN{lowercase_link} = qr{\A[[:lower:]]}xms; +Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms; +Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms; +Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms; -# XXX get_string() cannot be used here -if ($STRING{exclude_lowercase}) { - $PATTERN{exclude_lowercase} = qr{\A[$STRING{exclude_lowercase}]}xms; -} -$PATTERN{numbered_file} = qr{ +## no critic (ProhibitEscapedMetacharacters) +Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; +## use critic (ProhibitEscapedMetacharacters) + +# get_string() cannot be used in re +my $string_exclude_lowercase = get_string('exclude_lowercase'); +Readonly my $EXCLUDE_LOWERCASE_RE => qr{ + \A # Beginning of foreign article name + [$string_exclude_lowercase] # Character class of possibly lowercase chars +}xms; +Readonly my $NUMBERED_FILE_RE => qr{ (?: _ \d*)? \.$WIKITEXT_EXT }xms; -$PATTERN{invalid_filename_char} = qr{[\\\n/:*?"<>|]}xms; # " +Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # " -# TODO: Check whether it is Neapolitan with its '' -$PATTERN{character_code_in_link} = qr{ +my $two_digit_charnumber_re = qr{ (?: [%.] # There are both %C4%B0 and .AA.E0 [[:xdigit:]]{2} # 2 hex digits ) - | +}xms; + +my $html_charnumber_re = qr{ (?: &\#\d+; # stuff like Š ) }xms; +# TODO: Check whether it is Neapolitan with its '' +Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{ + $two_digit_charnumber_re | $html_charnumber_re +}xms; + my %LANG_CODE; Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt"; open my $lang_code_file, '<', $LANG_CODE_FN @@ -260,7 +273,7 @@ while (my $line = <$lang_code_file>) { chomp $line; my ($code, $name) = split /\t/xms, $line; - $LANG_CODE{$code} = $name // $code; + $LANG_CODE{$code} = $name // $code; # / } close $lang_code_file @@ -268,7 +281,7 @@ Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE; -$PATTERN{interwiki_link} = qr{ +Readonly my $INTERWIKI_LINK_RE => qr{ \Q$MW_SYNTAX{start_link}\E (?<lang_code> $ALT_LANGS @@ -282,6 +295,7 @@ # Lojban allows lowercase articles Readonly my @LOWERCASE_LANGS => qw(jbo); + Readonly my $COOLING_DAYS => 120; Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60; Readonly my $LATEST_COOLING => time - $COOLING_SECONDS; @@ -302,11 +316,12 @@ } } -my $dump = Parse::MediaWikiDump::Pages->new($dump_fn); +my $pmwd = Parse::MediaWikiDump->new(); +my $dump = $pmwd->revisions($dump_fn); my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @{ $dump->namespaces_names() }; -$PATTERN{pure_title} = qr{ +Readonly my $PURE_TITLE_RE => qr{ \A (?: (?:$namespaces_alt) @@ -359,12 +374,27 @@ return $page->namespace() || get_string('article space'); } +sub next_page { + my $page_ref = eval { $dump->next(); }; + if ($EVAL_ERROR) { + confess("Failed reading a page: $EVAL_ERROR"); + } + return $page_ref; +} + sub find_iwless { + my $leak_handle; + my $leak_count; + PAGE: - while (my $page = $dump->page()) { + while (my $page_ref = next_page()) { $page_counter++; if ($page_counter % $option{page_freq} == 0) { say $page_counter; + # my $lead_count = Devel::Leak::NoteSV($leak_handle); + # say "leak count: $lead_count"; + # say 'Devel::Size size of buck dump: ', total_size($dump); + # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref); } last PAGE @@ -374,45 +404,45 @@ next PAGE if ($page_counter < $option{start_from}); - my $namespace = namespace($page); - my $page_title = $page->title(); + my $page_namespace = namespace($page_ref); + my $page_title = $page_ref->title(); # Skipping cases: next PAGE if ( - not is_in_namespace($page, @INCLUDE_NAMESPACES) - or is_redirect($page) + not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) + or is_redirect($page_ref) # TODO: Be more precise here. # Portal pages which have a '/' in their name are probably # internal and do not need interwiki links. - or (is_in_namespace($page, 'Portal') and $page_title =~ m{/}xms) + or (is_in_namespace($page_ref, 'Portal') + and $page_title =~ m{/}xms) ); - $namespace_count{$namespace}++; + $namespace_count{$page_namespace}++; INFO("\n* processing $page_counter - ", $page_title); - my $page_text_ref = $page->text(); + my $page_text_ref = $page_ref->text(); # A simple sanity check: is the no_iw template anywhere around here? - my $has_tmpl_no_iw = - (${$page_text_ref} =~ $PATTERN{simple_no_iw_check}); + my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English - my $has_iw = has_interwiki($page); + my $has_iw = has_interwiki($page_ref); if ($has_iw) { INFO("has link to $has_iw"); - if ($has_tmpl_no_iw) { + if ($has_template_no_iw) { INFO('has template no_iw. trying to remove ...'); - remove_tmpl_no_iw($page_text_ref); + remove_template_no_iw($page_text_ref); $statistics{'has both valid interwiki and template'}++; - special_cases_file('outdated_template', $page); + special_cases_file('outdated_template', $page_ref); } } else { # does not have iw - process_iwless_page($page, $has_tmpl_no_iw, $has_iw); + process_iwless_page($page_ref, $has_template_no_iw, $has_iw); } } @@ -421,25 +451,24 @@ sub process_iwless_page { my ( - $page, # object ref - $has_tmpl_no_iw, # scalar bool - $has_iw # scalar bool + $page_ref, # object ref + $has_template_no_iw, # scalar bool + $has_iw # scalar bool ) = @_; INFO(q(does not have iw link.)); - $statistics{'has no interwiki link'}->{ namespace($page) }++; + $statistics{'has no interwiki link'}->{ namespace($page_ref) }++; # Now we need to search for no_iw templates # and parse their parameters - date and type my @found_templates = (); - my $page_text_ref = $page->text(); - my $page_title = $page->title(); + my $page_text_ref = $page_ref->text(); # Optimized - does not start searching, # if we already know that it is not there - if ($has_tmpl_no_iw) { + if ($has_template_no_iw) { find_templates($page_text_ref, \@found_templates, [ get_string('no_iw') ]); } @@ -455,7 +484,7 @@ if ($found_templates_count > 1) { WARN('many templates were found'); $statistics{'many templates'}++; - special_cases_file('many_templates', $page); + special_cases_file('many_templates', $page_ref); } else { INFO('good, found one template'); @@ -476,7 +505,7 @@ if (not defined $date_ref) { INFO("invalid date: '$date_str'"); $statistics{'invalid date'}++; - special_cases_file('invalid_date', $page); + special_cases_file('invalid_date', $page_ref); } elsif (cooling_date_passed($date_ref)) { INFO('cooling date passed, updating to today ...'); @@ -494,11 +523,11 @@ } - my @all_types = get_all_types($template->{params}->{type}, $page); + my @all_types = get_all_types($template->{params}->{type}, $page_ref); foreach my $type (@all_types) { - INFO("adding $page_title to the list as type $type"); - add_to_no_iw_list($page, $type); + INFO('adding ' . $page_ref->title() . " to the list as type $type"); + add_to_no_iw_list($page_ref, $type); $type_count{$type}++; } @@ -508,7 +537,7 @@ sub get_all_types { my ($type_param, $page) = @_; - $type_param //= q{}; + $type_param //= q{}; # / strip_whitespace($type_param); my @all_types = split $TYPE_SEP, $type_param; @@ -545,23 +574,23 @@ sub find_templates { my ( - $page_text_ref, # string ref + $text_ref, # string ref $found_templates_ref, # array ref $filter # string array ref ) = @_; # A reference to an array with one empty string. # Matching against an empty string will always succeed. - $filter //= [q{}]; + $filter //= [q{}]; # / # Get all highest-level matches - my @matches = (${$page_text_ref} =~ /$PATTERN{true_template}/xmsgo); + my @matches = (${$text_ref} =~ /$TRUE_TEMPLATE_RE/xmsgo); MATCH: foreach my $next_match (@matches) { - if ($next_match !~ $PATTERN{template}) { + if ($next_match !~ $TEMPLATE_RE) { INFO(q(i thought that it is a template, but it was:)); - if ($next_match =~ $PATTERN{wikitable}) { + if ($next_match =~ $WIKITABLE_RE) { INFO('a wikitable'); } else { @@ -574,7 +603,8 @@ foreach my $next_filter (@{$filter}) { # N.B. - case-insensitive. Wrong, but kinda useful. - if ($next_match =~ /\A\Q$MW_SYNTAX{'start_tmpl'}$next_filter/xmsi) + if ($next_match =~ + /\A\Q$MW_SYNTAX{'start_template'}$next_filter/xmsi) { # N.B.: parse_template calls find_templates() recursively @@ -592,7 +622,7 @@ my ( $template, # string ref $default_param_names, # string array ref - $subtmpl_filter, # string array ref + $subtemplate_filter, # string array ref ) = @_; # %parsed_template: @@ -602,16 +632,16 @@ # {subtemplates} - array ref my (%parsed_template, %parsed_params, @clauses); - $parsed_template{text} = strip_tmpl_curlies(${$template}); + $parsed_template{text} = strip_template_curlies(${$template}); # First string of the split is the template name, # the rest is the params ($parsed_template{name}, @clauses) = - (split $PATTERN{param_sep}, ${ $parsed_template{text} }); + (split $PARAM_SEP_RE, ${ $parsed_template{text} }); my $param_counter = 0; foreach my $clause (@clauses) { - if ($clause =~ $PATTERN{param}) { + if ($clause =~ $PARAM_RE) { #<<< no perltidy my ($name, $value) = @+{ qw(param_name value) }; #>>> @@ -630,7 +660,7 @@ $parsed_params{$name} = $value; } else { - my $error_msg = "Weird - $clause does not look a param"; + my $error_msg = "Weird - $clause does not look like a param"; INFO($error_msg); cluck($error_msg); $statistics{'weird param'}++; @@ -641,7 +671,7 @@ # Possible recursion find_templates($parsed_template{text}, $parsed_template{subtemplates}, - $subtmpl_filter); + $subtemplate_filter); return \%parsed_template; } @@ -651,7 +681,7 @@ return if (not defined $date_str); - if ($date_str =~ $PATTERN{heb_date}) { + if ($date_str =~ $HEB_DATE_RE) { INFO("found a valid date: $date_str"); my %parsed_date = ( 'sec' => 0, # useful for timelocal @@ -677,15 +707,15 @@ return; } -sub strip_tmpl_curlies { +sub strip_template_curlies { my ($template) = @_; for ($template) { s{ \A - \Q$MW_SYNTAX{start_tmpl}\E + \Q$MW_SYNTAX{start_template}\E }{}xms; s{ - \Q$MW_SYNTAX{end_tmpl}\E + \Q$MW_SYNTAX{end_template}\E \z }{}xms; } @@ -693,7 +723,7 @@ } # no arg unpacking for simplicity and performance -sub strip_whitespace { ## no critic Subroutines::RequireArgUnpacking +sub strip_whitespace { ## no critic (RequireArgUnpacking) for (@_) { s/\A\s*//xms; s/\s*\z//xms; @@ -705,13 +735,14 @@ my ($page) = @_; my $page_title = $page->title(); - my $page_text = ${ $page->text() }; - study $page_text; + my $page_text = ${ $page->text() }; # XXX + study $page_text; # XXX + my %iw_links; my %special_cases; - while ($page_text =~ /$PATTERN{interwiki_link}/xmsgo) { + while ($page_text =~ /$INTERWIKI_LINK_RE/xmsgo) { # XXX my ($lang_code, $foreign_article) = @+{qw(lang_code foreign_article)}; if (defined $iw_links{$lang_code}) { $special_cases{double_links}->{$lang_code} = q{}; @@ -722,31 +753,31 @@ # A # sign not after an &. # After an & it is probably a character number. - if ($foreign_article =~ $PATTERN{section_link}) { + if ($foreign_article =~ $SECTION_LINK_RE) { $special_cases{section_links}->{$lang_code} = q{}; } # Char codes are common in section links, so there is no # need to show them again - elsif ($foreign_article =~ $PATTERN{character_code_in_link}) { - $special_cases{charnumber_links}{$lang_code} = q{}; + elsif ($foreign_article =~ $CHARACTER_CODE_IN_LINK_RE) { + $special_cases{charnumber_links}->{$lang_code} = q{}; } # Lowercase links if ( (not $lang_code ~~ @LOWERCASE_LANGS) - and ($foreign_article =~ $PATTERN{lowercase_link})) + and ($foreign_article =~ $LOWERCASE_LINK_RE)) { my $include_lowercase_link = 1; # XXX get_string() cannot be used here if (defined $STRING{exclude_lowercase} - and $foreign_article =~ $PATTERN{exclude_lowercase}) + and $foreign_article =~ $EXCLUDE_LOWERCASE_RE) { $include_lowercase_link = 0; } if ($include_lowercase_link) { - $special_cases{lowercase_links}{$lang_code} = q{}; + $special_cases{lowercase_links}->{$lang_code} = q{}; } } @@ -801,7 +832,7 @@ return; } -sub remove_tmpl_no_iw { +sub remove_template_no_iw { my ($params) = @_; INFO( "Supposed to remove the no_iw template now, but ...\n" . 'This sub is a stub. You can help Wikipedia by expanding it!'); @@ -853,7 +884,7 @@ my ($page_title) = @_; my $sort_title; - if ($page_title =~ $PATTERN{pure_title}) { + if ($page_title =~ $PURE_TITLE_RE) { $sort_title = $+{pure_title}; } else { @@ -901,7 +932,7 @@ . $MW_SYNTAX{end_link}; if ($option{rtl}) { - if ($page_title =~ $PATTERN{ltr_char}) { + if ($page_title =~ $LTR_CHAR_RE) { # XXX get_string() cannot be used here $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm}; @@ -930,9 +961,9 @@ chomp $type_name; foreach my $line (@lines) { chomp $line; - my ($page_title, $sort_title) = split $PATTERN{field_sep}, $line; + my ($page_title, $sort_title) = split $FIELD_SEP_RE, $line; my $sort_letter = get_sort_letter($sort_title); - $all_pages_in_type{$sort_letter} //= []; + $all_pages_in_type{$sort_letter} //= []; # / push @{ $all_pages_in_type{$sort_letter} }, $page_title; } write_sorted_pages($type_name, \%all_pages_in_type); @@ -965,8 +996,6 @@ write_page(\$page, \$type_fn, $file_number++); $section_counter = 0; - # N.B. Trying to free memory, not guaranteed - undef $page; $page = q{}; } elsif ($section_counter) { @@ -988,16 +1017,12 @@ [ @all_links_in_letter[ $first_link .. $last_link ] ]); $page .= $links; } - - # N.B. Trying to free memory, not guaranteed - undef @all_links_in_letter; } # The page may be empty at this point if ($page) { write_page(\$page, \$type_fn, $file_number++); } - undef $page; return; } @@ -1007,7 +1032,7 @@ my $pretty_file_number = sprintf '%03d', $file_number; ${$type_fn_ref} =~ s{ - $PATTERN{numbered_file} + $NUMBERED_FILE_RE } {_$pretty_file_number.$WIKITEXT_EXT}xmso; INFO("creating file ${$type_fn_ref}"); @@ -1036,7 +1061,7 @@ # Custom Unicode character property for finding characters. # The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic NamingConventions::ProhibitMixedCaseSubs +sub IsLeftToRight { ## no critic (Capitalization) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1053,7 +1078,7 @@ INFO("\nEnglish redirect: $page_title"); return 1; } - if (${$page_text_ref} =~ $PATTERN{local_redirect}) { + if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { INFO("\nLocal redirect: $page_title"); return 1; } @@ -1093,22 +1118,20 @@ close $STRINGS_FILE or croak(file_error('closing', $STRINGS_FN, 'reading')); - my %STRING; - STRING_LINE: foreach my $next_string_line (@strings_file_lines) { # Skip blanks and comments - next STRING_LINE if ($next_string_line =~ $PATTERN{string_skip}); + next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); chomp $next_string_line; - my ($english, $target) = split $PATTERN{field_sep}, $next_string_line; + my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; # Fallback to English if no target language string was supplied $STRING{$english} = $target // $english; # / } - return %STRING; + return; } sub get_string { @@ -1125,7 +1148,7 @@ my $type_fn = "$transliterated_type.$WIKITEXT_EXT"; - $type_fn =~ s{$PATTERN{invalid_filename_char}}{-}xmsgo; + $type_fn =~ s{$INVALID_FILENAME_CHAR_RE}{-}xmsgo; my $dir = $unsorted ? $UNSORTED_DIR : $OUT_DIR; $type_fn = "$dir/$type_fn"; @@ -1138,7 +1161,7 @@ open my $file, '>>:utf8', $fn or croak(file_error('opening', $fn, 'appending')); - say {$file} ($line // q{}); + say {$file} ($line // q{}); # / close $file or croak(file_error('closing', $fn, 'appeding')); @@ -1236,11 +1259,11 @@ $first_local_article, @other_local_articles ) = @_; - $local_multi_links_ref->{$first_local_article} //= {}; + $local_multi_links_ref->{$first_local_article} //= {}; # / foreach my $other_local_article (@other_local_articles) { $local_multi_links_ref->{$first_local_article} - ->{$other_local_article} //= []; + ->{$other_local_article} //= []; # / push @{ $local_multi_links_ref->{$first_local_article} ->{$other_local_article} }, $foreign_link; } @@ -1250,7 +1273,7 @@ sub join_links { my ($links_ref, $line_end) = @_; - $line_end //= 1; # / + $line_end //= 1; # / my $link_sep = q{ } . $LINK_SEP . ($line_end ? "\n" : q{ }); return join $link_sep, @{$links_ref}; @@ -1280,10 +1303,11 @@ foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) { my $iwless_in_namespace = $statistics{'has no interwiki link'}->{$namespace}; - ## no critic ValuesAndExpressions::ProhibitMagicNumbers no integer; + ## no critic (ProhibitMagicNumbers) my $percentage = sprintf '%.2f', 100 * $iwless_in_namespace / $namespace_count{$namespace}; + ## use critic (ValuesAndExpressions::ProhibitMagicNumbers) use integer; INFO("$namespace: $iwless_in_namespace, $percentage%"); } @@ -1344,7 +1368,7 @@ =item * --stop_after=NUMBER Stops processing after page with the given NUMBER. -=item * --stop_after=NUMBER Begins processing after page with the given +=item * --start_from=NUMBER Begins processing after page with the given NUMBER. =item * --page_freq=NUMBER Print the page counter every NUMBER of @@ -1514,7 +1538,7 @@ It checks the syntax, runs perltidy on the code and runs Perl::Critic. -All the places where P::C has been disabled using "# no critic" are explained. +All the places where P::C has been disabled using "no critic" are explained. The time invested in making the code P::C-friendly will be returned as time saved on debugging. Also consider reading the book "Perl Best Practices" by @@ -1527,14 +1551,14 @@ This program works best on GNU/Linux, where Perl and the filesystem are Unicode-friendly. -This program was also tested on Windows XP and Vista with ActivePerl 5.10 -and Cygwin Perl 5.10. In these environments Unicode-related issues caused +This program was also tested on Windows XP and Vista with ActivePerl and +Cygwin. In these environments Unicode-related issues caused filenames and clipboard text to become jumbled. You have been warned. =head1 BUGS AND LIMITATIONS Please report all bugs, features requests and other comments to -Amir E. Aharoni (ami...@gm...). +Amir E. Aharoni (ami...@ma...). =head2 There is no equality between languages @@ -1662,4 +1686,3 @@ I<Visca la llibertat!> =cut - This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-11-21 23:07:41
|
Revision: 72 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=72&view=rev Author: amire80 Date: 2008-11-21 23:07:37 +0000 (Fri, 21 Nov 2008) Log Message: ----------- Update language codes. Modified Paths: -------------- trunk/no-interwiki/language_codes.txt Modified: trunk/no-interwiki/language_codes.txt =================================================================== --- trunk/no-interwiki/language_codes.txt 2008-11-21 22:39:40 UTC (rev 71) +++ trunk/no-interwiki/language_codes.txt 2008-11-21 23:07:37 UTC (rev 72) @@ -251,3 +251,12 @@ rn Kirundi chy Cheyenne ng Ndonga +cho Choctaw +chy Cheyenne +mh Marshallese +aa Afar +kj Kuanyama +ho Hiri Motu +mus Muscogee +kr Kanuri + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-11-21 22:39:42
|
Revision: 71 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=71&view=rev Author: amire80 Date: 2008-11-21 22:39:40 +0000 (Fri, 21 Nov 2008) Log Message: ----------- POD spelling, critic update. Modified Paths: -------------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc 2008-11-21 20:57:24 UTC (rev 70) +++ trunk/no-interwiki/.perlcriticrc 2008-11-21 22:39:40 UTC (rev 71) @@ -1,6 +1,7 @@ -#[-CodeLayout::RequireTidyCode] -#[-Miscellanea::RequireRcsKeywords] - # English.pm doesn't support named capture variables (yet) [Variables::ProhibitPunctuationVars] allow = %+ $+ @+ + +# This is probably useful, but appears buggy, so it should remain manual +[-Documentation::PodSpelling] + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 20:57:24 UTC (rev 70) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 22:39:40 UTC (rev 71) @@ -1380,7 +1380,7 @@ it will be removed automatically in the future.) =item * If the page contains no links to the defined languages and does not -comtain the template, it is automatically added to type "other". +contain the template, it is automatically added to type "other". =item * If the page contains no links to the defined languages and a template with types, it is added to the defined types. @@ -1606,7 +1606,7 @@ =item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty categories sorting. Memory usage optimization - accumulating information in files. More generic, but far-from-perfect handling of links to languages -other than English. Translitetaion with Lingua::Translit. Logging with +other than English. Transliteration with Lingua::Translit. Logging with Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. Not finished: complete statistics, removal of templates from pages which already have links. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-11-21 20:57:27
|
Revision: 70 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=70&view=rev Author: amire80 Date: 2008-11-21 20:57:24 +0000 (Fri, 21 Nov 2008) Log Message: ----------- generic perlcritic location. Modified Paths: -------------- trunk/no-interwiki/tidy.sh Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2008-08-13 10:30:54 UTC (rev 69) +++ trunk/no-interwiki/tidy.sh 2008-11-21 20:57:24 UTC (rev 70) @@ -20,7 +20,7 @@ fi echo criticizing -/usr/local/bin/perlcritic -brutal $FN +perlcritic -brutal $FN exit $? This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-08-13 10:30:44
|
Revision: 69 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=69&view=rev Author: amire80 Date: 2008-08-13 10:30:54 +0000 (Wed, 13 Aug 2008) Log Message: ----------- Exclude namespace before redirect check, makes log shorter. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/tidy.sh Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:54:04 UTC (rev 68) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-13 10:30:54 UTC (rev 69) @@ -380,8 +380,8 @@ # Skipping cases: next PAGE if ( - is_redirect($page) - or not is_in_namespace($page, @INCLUDE_NAMESPACES) + not is_in_namespace($page, @INCLUDE_NAMESPACES) + or is_redirect($page) # TODO: Be more precise here. # Portal pages which have a '/' in their name are probably Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2008-08-12 07:54:04 UTC (rev 68) +++ trunk/no-interwiki/tidy.sh 2008-08-13 10:30:54 UTC (rev 69) @@ -14,7 +14,7 @@ exit 1 fi -diff $FN ${FN}.bak +diff $FN.bak ${FN} if [ $? -eq 2 ]; then exit 1 fi This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |