[PerlWikiBot] SF.net SVN: perlwikibot:[80] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
|
From: <am...@us...> - 2009-09-26 23:43:19
|
Revision: 80
http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=80&view=rev
Author: amire80
Date: 2009-09-26 23:43:11 +0000 (Sat, 26 Sep 2009)
Log Message:
-----------
Copyright, MediaWikiDump 0.92 requirement, version number cleanup, RTL cleanup, logging cleanup, POD cleanup, get_string cleanup, simple dump ratio, P::C cleanup, adding pod spelling.
Modified Paths:
--------------
trunk/no-interwiki/.perlcriticrc
trunk/no-interwiki/prepare_noiw_list.pl
Added Paths:
-----------
trunk/no-interwiki/stopwords.txt
Modified: trunk/no-interwiki/.perlcriticrc
===================================================================
--- trunk/no-interwiki/.perlcriticrc 2009-09-26 10:51:33 UTC (rev 79)
+++ trunk/no-interwiki/.perlcriticrc 2009-09-26 23:43:11 UTC (rev 80)
@@ -2,6 +2,10 @@
[Variables::ProhibitPunctuationVars]
allow = %+ $+ @+
-# This is probably useful, but appears buggy, so it should remain manual
-[-Documentation::PodSpelling]
+[Documentation::PodSpelling]
+stop_words_file = stopwords.txt
+# say is for writing to the terminal, no need to check it
+[InputOutput::RequireCheckedSyscalls]
+exclude_functions = say
+
Modified: trunk/no-interwiki/prepare_noiw_list.pl
===================================================================
--- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:51:33 UTC (rev 79)
+++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80)
@@ -1,7 +1,7 @@
#!/usr/bin/perl
# prepare_noiw_list.pl
-# version 0.2 Noa - development
+# version 0.2.1 Noa - development
# See the POD documentation at the end of the file or run
# perldoc prepare_noiw_list.pl
@@ -9,9 +9,21 @@
# "Had he been a French child,
# he would have heard an infinite number of sentences" - Otto Jespersen
-# This program is Free Software; you can redistribute it and/or modify it
-# under the same terms as Perl itself.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3
+# as published by the Free Software Foundation or under the terms of
+# Artistic License version 2.0.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the licenses
+# along with this program. If not, see
+# * <http://www.gnu.org/licenses/gpl-3.0.html>.
+# * <http://www.perlfoundation.org/artistic_license_2_0>
+
# Upgrade! This program actually uses new Perl 5.10 constructs, so you need it
use 5.010;
@@ -38,11 +50,13 @@
# I only use Readonly where P::C complains about it. Using it with regular
# expressions which have non-ASCII chars produces heisenbugs.
use Readonly;
+use Pod::Usage;
use Regexp::Common;
use Lingua::Translit;
use Log::Log4perl qw(:easy);
-use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces
+use Parse::MediaWikiDump 0.92; # Earlier versions have a different API
+our $VERSION = '0.2.1';
#<<< no perltidy
my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars)
Revision => '$Revision$',
@@ -50,8 +64,9 @@
Date => '$Date$',
);
#>>>
-our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms)
- ? "0.1.9.$+{revision_num}"
+our $FULL_VERSION =
+ ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms)
+ ? "$VERSION.$+{revision_num}"
: croak(q(Something is wrong with SVN revision number));
my $WIKITEXT_EXT = 'wiki.txt';
@@ -62,6 +77,7 @@
my $FIELD_SEP = qq{\t};
my $LINK_SEP = q{|};
my $TYPE_SEP = qr/\s*;\s*/xms;
+my $HTML_RLM = '‏';
Readonly my $DEFAULT_MAX_IW_PLACES => 20;
Readonly my $DEFAULT_PAGE_FREQ => 1000;
@@ -69,11 +85,12 @@
Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20;
# Initialize logger
+my $LOG_FN = 'outtest.log';
Log::Log4perl->easy_init(
{
- level => $INFO, # print everything
- file => ':utf8>outtest.log', # utf is important
- layout => '%m%n', # No need to print the date
+ level => $INFO, # print everything
+ file => ":utf8>$LOG_FN", # utf is important
+ layout => '%m%n', # No need to print the date
}
);
@@ -85,6 +102,9 @@
max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION,
max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE,
max_iw_places => $DEFAULT_MAX_IW_PLACES,
+ help => 0,
+ usage => 0,
+ version => 0,
);
my $valid_options = GetOptions(
@@ -95,12 +115,42 @@
'max_links_per_section' => \$option{max_links_per_section},
'max_sections_per_page' => \$option{max_sections_per_page},
'max_iw_places' => \$option{max_iw_places},
+ 'help' => \$option{help},
+ 'usage' => \$option{usage},
+ 'version' => \$option{version},
);
if (not $valid_options) {
croak('Invalid command line options.');
}
+if ($option{'version'}) {
+ my $program_name = __FILE__;
+ say <<"END_VERSION";
+$program_name version $FULL_VERSION
+This program searches for pages without interlanguage links (interwiki).
+
+Copyright 2005-2009 Guy Shaked, Nadav Perez and Amir E. Aharoni.
+
+This program is Free Software; you can redistribute it and/or modify it
+under the terms of Artistic License version 2.0 or the
+GNU General Public license version 3.0.
+
+Run `perldoc $program_name' for full terms.
+END_VERSION
+ exit;
+}
+
+if ($option{'help'} or $option{'usage'}) {
+ my $verbose_level = $option{'usage'} ? 1 : 2;
+ pod2usage(
+ {
+ -exitval => 0,
+ -verbose => $verbose_level,
+ }
+ );
+}
+
# XXX Too coupled to Wikipedia, won't work for other projects.
my $DUMP_FN_RE = qr{
\A # Begin string
@@ -168,14 +218,14 @@
@REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH);
# XXX Internationalize
+my $string_in = get_string('in');
my $HEB_DATE_RE = qr{
\A # begin
(?<hour>\d{1,2}) # hour
: # :
(?<min>\d{2}),\s # minute
(?<mday>\d{1,2})\s # day of month
- $STRING{in}? # This preposition appears sometimes
- # It should have been get_string()
+ $string_in? # This preposition appears sometimes
(?<mon>$ALT_MONTHS)\s # A name of a month
(?<year>\d+?)\s # Year
\([A-Z]{3}\) # Three letters in brackets - timezone
@@ -193,9 +243,9 @@
(?<value>.*) # value
}xms;
-# XXX It should use get_string()
+my $string_no_iw = get_string('no_iw');
my $SIMPLE_NO_IW_CHECK_RE = qr{
- \Q$STRING{no_iw}\E # The string may have spaces
+ \Q$string_no_iw\E # The string may have spaces
}xmsi;
# A simplistic template just for testing.
@@ -218,21 +268,21 @@
# Regular expression mostly copied from
# Parse::MediaWikiDump::page::redirect
# TODO: Try to look for the local language redirect keyword in the dump.
+my $local_redirect = get_string('REDIRECT');
my $LOCAL_REDIRECT_RE = qr{
\A # Beginning of string (page)
\# # a # character
- $STRING{REDIRECT} # Redirect keyword in local language
- # XXX It should use get_string()
+ $local_redirect # Redirect keyword in local language
\s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect
}xmsi;
-my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms;
+my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation
my $SECTION_LINK_RE = qr{(?<!&)\#}xms;
my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms;
-## no critic (ProhibitEscapedMetacharacters)
+## no critic (RegularExpressions::ProhibitEscapedMetacharacters)
my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms;
-## use critic (ProhibitEscapedMetacharacters)
+## use critic (RegularExpressions::ProhibitEscapedMetacharacters)
# get_string() cannot be used in re
my $string_exclude_lowercase = get_string('exclude_lowercase');
@@ -357,15 +407,16 @@
# my @found_lang_codes = sort keys %found_links;
# INFO("found lang_codes: @found_lang_codes");
-say 'looking for multi links';
my $begin_multi_links_time = time;
+say 'listing multi links by language';
print_multi_links_by_foreign();
+say 'listing multi links by local articles';
print_multi_links_by_local();
my $total_multi_links_time = time - $begin_multi_links_time;
-say "total multi links time: $total_multi_links_time";
+INFO("total multi links time: $total_multi_links_time");
exit;
@@ -784,8 +835,7 @@
{
my $include_lowercase_link = 1;
- # XXX get_string() cannot be used here
- if (defined $STRING{exclude_lowercase}
+ if (defined get_string('exclude_lowercase', 'if defined')
and $foreign_article =~ $EXCLUDE_LOWERCASE_RE)
{
$include_lowercase_link = 0;
@@ -807,15 +857,7 @@
}
push @{ $statistics{count_iw}->[$count_iw] }, $page_title;
}
- INFO(
- "iw link count for $page_title"
- . (
- $option{'rlm'}
- ? "\N{LEFT-TO-RIGHT MARK}"
- : q()
- )
- . " is: $count_iw"
- );
+ INFO("iw link count for $page_title is: $count_iw");
for my $special_case_name (keys %special_cases) {
if (scalar %{ $special_cases{$special_case_name} }) {
@@ -835,14 +877,19 @@
sub special_cases_file {
my ($special_case_name, $page, $special_cases_ref) = @_;
$special_cases_ref //= {}; # /
- my $special_case_langs = join q{, }, sort keys %{$special_cases_ref};
+
+ my $special_case_langs = join q{ }, sort keys %{$special_cases_ref};
+
if ($special_case_langs) {
$special_case_langs = " ($special_case_langs)";
}
+
my $special_case_fn = make_type_fn($special_case_name, 1);
+
if (not -e $special_case_fn) {
append_to_file($special_case_fn, $special_case_name);
}
+
my $page_title = $page->title();
my $link = make_link($page_title);
my $line =
@@ -852,6 +899,7 @@
. get_sort_title($page_title);
append_to_file($special_case_fn, $line);
+
return;
}
@@ -954,12 +1002,8 @@
. $page_title
. $MW_SYNTAX{end_link};
- if ($option{rtl}) {
- if ($page_title =~ $LTR_CHAR_RE) {
-
- # XXX get_string() cannot be used here
- $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm};
- }
+ if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) {
+ $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM;
}
return $link_to_page;
@@ -968,7 +1012,7 @@
sub create_no_iw_pages {
my ($params) = @_;
- INFO('creating no_iw pages');
+ INFO("\ncreating no_iw pages");
# Run over page types
UNSORTED_TYPE_FN:
@@ -1082,9 +1126,10 @@
return "'''$text'''";
}
-# Custom Unicode character property for finding characters.
-# The custom is to give those subroutines CamelCase names.
-sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs)
+# Custom Unicode character property, which is like \w, but for Hebrew.
+# The custom is to give custom Unicode character classes CamelCase names.
+# P::C policy ProhibitMixedCaseSubs is deprecated.
+sub IsRightToLeft { ## no critic (Capitalization)
return <<'END';
+utf8::InHebrew
+utf8::IsSpace
@@ -1158,8 +1203,10 @@
}
sub get_string {
- my ($english) = @_;
- return $STRING{$english} //= $english; # /
+ my ($english, $if_defined) = @_;
+ return $if_defined
+ ? ($STRING{$english})
+ : ($STRING{$english} //= $english); # /
}
sub make_type_fn {
@@ -1210,6 +1257,8 @@
foreach my $lang_code (sort keys %found_links) {
my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT";
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say "$lang_code ", scalar @foreign_articles;
+
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
my @local_articles =
@@ -1235,6 +1284,8 @@
LANG_CODE:
foreach my $lang_code (sort keys %found_links) {
my @foreign_articles = sort keys %{ $found_links{$lang_code} };
+ say "$lang_code ", scalar @foreign_articles;
+
FOREIGN_ARTICLE:
foreach my $foreign_article (@foreign_articles) {
my @local_articles =
@@ -1254,10 +1305,12 @@
}
}
+ say 'writing local multilinks file';
my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT";
foreach my $local_multi_article (sort keys %local_multi_links) {
append_to_file($filename,
'* ' . mw_bold(make_link($local_multi_article)));
+
foreach my $other_local_article (
sort keys %{ $local_multi_links{$local_multi_article} })
{
@@ -1304,7 +1357,7 @@
sub print_stats {
INFO("\nSUMMARY");
- say "total time: $total_time";
+ INFO("total time: $total_time");
foreach my $stat_type (sort keys %statistics) {
if (not ref $statistics{$stat_type}) {
INFO("$stat_type: $statistics{$stat_type}");
@@ -1346,11 +1399,24 @@
foreach my $namespace (sort keys %namespace_count) {
INFO("$namespace: $namespace_count{$namespace}");
}
+
INFO("\nTYPES");
foreach my $type (sort keys %type_count) {
INFO("$type: $type_count{$type}");
}
+ my $dump_size = -s $dump_fn;
+ my $log_size = -s $LOG_FN;
+ INFO("dump size: $dump_fn");
+ INFO("log size: $LOG_FN");
+ if ($log_size > 0) {
+ my $dump_log_ratio = $dump_size / $log_size;
+ INFO("dump/log ratio: $dump_log_ratio");
+ }
+ else {
+ WARN("weird: log file size $LOG_FN is 0");
+ }
+
return;
}
@@ -1413,6 +1479,12 @@
=item * --max_iw_places Number of places to print in the statistics of
pages with the most interlanguage links.
+=item * --version Print the version number and exit.
+
+=item * --usage Print basic usage and exit.
+
+=item * --help Print full help and exit.
+
=back
=head1 DESCRIPTION
@@ -1545,6 +1617,10 @@
To make Perl::Critic happy :)
+=item * C<Pod::Usage>
+
+Some style guide recommended it. I don't even remember which one, but i love style guides.
+
=back
=head1 HACKING
@@ -1564,7 +1640,7 @@
for automatic code formatting. If you modify it, do yourself a favor, install
Perl::Critic and regularly test it using this command:
-./tidy.sh
+./tidy.sh prepare_noiw_list.pl
It checks the syntax, runs perltidy on the code and runs Perl::Critic.
@@ -1710,9 +1786,29 @@
=head1 LICENSE AND COPYRIGHT
-This program is Free Software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
+Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni.
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3
+as published by the Free Software Foundation or under the terms of
+Artistic License version 2.0.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the licenses
+along with this program. If not, see
+
+=over
+
+=item * L<http://www.gnu.org/licenses/gpl-3.0.html>.
+
+=item * L<http://www.perlfoundation.org/artistic_license_2_0>
+
+=back
+
I<Visca la llibertat!>
=cut
Added: trunk/no-interwiki/stopwords.txt
===================================================================
--- trunk/no-interwiki/stopwords.txt (rev 0)
+++ trunk/no-interwiki/stopwords.txt 2009-09-26 23:43:11 UTC (rev 80)
@@ -0,0 +1,40 @@
+ActivePerl
+Aharoni
+Aharoni's
+Amir
+Amire
+backporting
+CPAN
+Cygwin
+Drora
+Felagund
+Felagund's
+FILENAME
+filenames
+filesystem
+hoc
+interlanguage
+interwiki
+Itay
+llibertat
+MediaWiki
+multi
+Nadav
+namespace
+Noa
+outtest
+param
+perl
+perltidy
+Readonly
+refactoring
+Reut
+rtl
+Shaked
+tec
+Visca
+wiki
+Wikipedia
+Wikipedias
+xNUMBER
+XP
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|