[PerlWikiBot] SF.net SVN: perlwikibot:[80] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
From: <am...@us...> - 2009-09-26 23:43:19
|
Revision: 80 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=80&view=rev Author: amire80 Date: 2009-09-26 23:43:11 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Copyright, MediaWikiDump 0.92 requirement, version number cleanup, RTL cleanup, logging cleanup, POD cleanup, get_string cleanup, simple dump ratio, P::C cleanup, adding pod spelling. Modified Paths: -------------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/stopwords.txt Modified: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/.perlcriticrc 2009-09-26 23:43:11 UTC (rev 80) @@ -2,6 +2,10 @@ [Variables::ProhibitPunctuationVars] allow = %+ $+ @+ -# This is probably useful, but appears buggy, so it should remain manual -[-Documentation::PodSpelling] +[Documentation::PodSpelling] +stop_words_file = stopwords.txt +# say is for writing to the terminal, no need to check it +[InputOutput::RequireCheckedSyscalls] +exclude_functions = say + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80) @@ -1,7 +1,7 @@ #!/usr/bin/perl # prepare_noiw_list.pl -# version 0.2 Noa - development +# version 0.2.1 Noa - development # See the POD documentation at the end of the file or run # perldoc prepare_noiw_list.pl @@ -9,9 +9,21 @@ # "Had he been a French child, # he would have heard an infinite number of sentences" - Otto Jespersen -# This program is Free Software; you can redistribute it and/or modify it -# under the same terms as Perl itself. +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 +# as published by the Free Software Foundation or under the terms of +# Artistic License version 2.0. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the licenses +# along with this program. If not, see +# * <http://www.gnu.org/licenses/gpl-3.0.html>. +# * <http://www.perlfoundation.org/artistic_license_2_0> + # Upgrade! This program actually uses new Perl 5.10 constructs, so you need it use 5.010; @@ -38,11 +50,13 @@ # I only use Readonly where P::C complains about it. Using it with regular # expressions which have non-ASCII chars produces heisenbugs. use Readonly; +use Pod::Usage; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces +use Parse::MediaWikiDump 0.92; # Earlier versions have a different API +our $VERSION = '0.2.1'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -50,8 +64,9 @@ Date => '$Date$', ); #>>> -our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) - ? "0.1.9.$+{revision_num}" +our $FULL_VERSION = + ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) + ? "$VERSION.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); my $WIKITEXT_EXT = 'wiki.txt'; @@ -62,6 +77,7 @@ my $FIELD_SEP = qq{\t}; my $LINK_SEP = q{|}; my $TYPE_SEP = qr/\s*;\s*/xms; +my $HTML_RLM = '‏'; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -69,11 +85,12 @@ Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20; # Initialize logger +my $LOG_FN = 'outtest.log'; Log::Log4perl->easy_init( { - level => $INFO, # print everything - file => ':utf8>outtest.log', # utf is important - layout => '%m%n', # No need to print the date + level => $INFO, # print everything + file => ":utf8>$LOG_FN", # utf is important + layout => '%m%n', # No need to print the date } ); @@ -85,6 +102,9 @@ max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION, max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE, max_iw_places => $DEFAULT_MAX_IW_PLACES, + help => 0, + usage => 0, + version => 0, ); my $valid_options = GetOptions( @@ -95,12 +115,42 @@ 'max_links_per_section' => \$option{max_links_per_section}, 'max_sections_per_page' => \$option{max_sections_per_page}, 'max_iw_places' => \$option{max_iw_places}, + 'help' => \$option{help}, + 'usage' => \$option{usage}, + 'version' => \$option{version}, ); if (not $valid_options) { croak('Invalid command line options.'); } +if ($option{'version'}) { + my $program_name = __FILE__; + say <<"END_VERSION"; +$program_name version $FULL_VERSION +This program searches for pages without interlanguage links (interwiki). + +Copyright 2005-2009 Guy Shaked, Nadav Perez and Amir E. Aharoni. + +This program is Free Software; you can redistribute it and/or modify it +under the terms of Artistic License version 2.0 or the +GNU General Public license version 3.0. + +Run `perldoc $program_name' for full terms. +END_VERSION + exit; +} + +if ($option{'help'} or $option{'usage'}) { + my $verbose_level = $option{'usage'} ? 1 : 2; + pod2usage( + { + -exitval => 0, + -verbose => $verbose_level, + } + ); +} + # XXX Too coupled to Wikipedia, won't work for other projects. my $DUMP_FN_RE = qr{ \A # Begin string @@ -168,14 +218,14 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize +my $string_in = get_string('in'); my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : (?<min>\d{2}),\s # minute (?<mday>\d{1,2})\s # day of month - $STRING{in}? # This preposition appears sometimes - # It should have been get_string() + $string_in? # This preposition appears sometimes (?<mon>$ALT_MONTHS)\s # A name of a month (?<year>\d+?)\s # Year \([A-Z]{3}\) # Three letters in brackets - timezone @@ -193,9 +243,9 @@ (?<value>.*) # value }xms; -# XXX It should use get_string() +my $string_no_iw = get_string('no_iw'); my $SIMPLE_NO_IW_CHECK_RE = qr{ - \Q$STRING{no_iw}\E # The string may have spaces + \Q$string_no_iw\E # The string may have spaces }xmsi; # A simplistic template just for testing. @@ -218,21 +268,21 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. +my $local_redirect = get_string('REDIRECT'); my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character - $STRING{REDIRECT} # Redirect keyword in local language - # XXX It should use get_string() + $local_redirect # Redirect keyword in local language \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation my $SECTION_LINK_RE = qr{(?<!&)\#}xms; my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; -## no critic (ProhibitEscapedMetacharacters) +## no critic (RegularExpressions::ProhibitEscapedMetacharacters) my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; -## use critic (ProhibitEscapedMetacharacters) +## use critic (RegularExpressions::ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); @@ -357,15 +407,16 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; +say 'listing multi links by language'; print_multi_links_by_foreign(); +say 'listing multi links by local articles'; print_multi_links_by_local(); my $total_multi_links_time = time - $begin_multi_links_time; -say "total multi links time: $total_multi_links_time"; +INFO("total multi links time: $total_multi_links_time"); exit; @@ -784,8 +835,7 @@ { my $include_lowercase_link = 1; - # XXX get_string() cannot be used here - if (defined $STRING{exclude_lowercase} + if (defined get_string('exclude_lowercase', 'if defined') and $foreign_article =~ $EXCLUDE_LOWERCASE_RE) { $include_lowercase_link = 0; @@ -807,15 +857,7 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO( - "iw link count for $page_title" - . ( - $option{'rlm'} - ? "\N{LEFT-TO-RIGHT MARK}" - : q() - ) - . " is: $count_iw" - ); + INFO("iw link count for $page_title is: $count_iw"); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -835,14 +877,19 @@ sub special_cases_file { my ($special_case_name, $page, $special_cases_ref) = @_; $special_cases_ref //= {}; # / - my $special_case_langs = join q{, }, sort keys %{$special_cases_ref}; + + my $special_case_langs = join q{ }, sort keys %{$special_cases_ref}; + if ($special_case_langs) { $special_case_langs = " ($special_case_langs)"; } + my $special_case_fn = make_type_fn($special_case_name, 1); + if (not -e $special_case_fn) { append_to_file($special_case_fn, $special_case_name); } + my $page_title = $page->title(); my $link = make_link($page_title); my $line = @@ -852,6 +899,7 @@ . get_sort_title($page_title); append_to_file($special_case_fn, $line); + return; } @@ -954,12 +1002,8 @@ . $page_title . $MW_SYNTAX{end_link}; - if ($option{rtl}) { - if ($page_title =~ $LTR_CHAR_RE) { - - # XXX get_string() cannot be used here - $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm}; - } + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; } return $link_to_page; @@ -968,7 +1012,7 @@ sub create_no_iw_pages { my ($params) = @_; - INFO('creating no_iw pages'); + INFO("\ncreating no_iw pages"); # Run over page types UNSORTED_TYPE_FN: @@ -1082,9 +1126,10 @@ return "'''$text'''"; } -# Custom Unicode character property for finding characters. -# The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) +# Custom Unicode character property, which is like \w, but for Hebrew. +# The custom is to give custom Unicode character classes CamelCase names. +# P::C policy ProhibitMixedCaseSubs is deprecated. +sub IsRightToLeft { ## no critic (Capitalization) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1158,8 +1203,10 @@ } sub get_string { - my ($english) = @_; - return $STRING{$english} //= $english; # / + my ($english, $if_defined) = @_; + return $if_defined + ? ($STRING{$english}) + : ($STRING{$english} //= $english); # / } sub make_type_fn { @@ -1210,6 +1257,8 @@ foreach my $lang_code (sort keys %found_links) { my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1235,6 +1284,8 @@ LANG_CODE: foreach my $lang_code (sort keys %found_links) { my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1254,10 +1305,12 @@ } } + say 'writing local multilinks file'; my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT"; foreach my $local_multi_article (sort keys %local_multi_links) { append_to_file($filename, '* ' . mw_bold(make_link($local_multi_article))); + foreach my $other_local_article ( sort keys %{ $local_multi_links{$local_multi_article} }) { @@ -1304,7 +1357,7 @@ sub print_stats { INFO("\nSUMMARY"); - say "total time: $total_time"; + INFO("total time: $total_time"); foreach my $stat_type (sort keys %statistics) { if (not ref $statistics{$stat_type}) { INFO("$stat_type: $statistics{$stat_type}"); @@ -1346,11 +1399,24 @@ foreach my $namespace (sort keys %namespace_count) { INFO("$namespace: $namespace_count{$namespace}"); } + INFO("\nTYPES"); foreach my $type (sort keys %type_count) { INFO("$type: $type_count{$type}"); } + my $dump_size = -s $dump_fn; + my $log_size = -s $LOG_FN; + INFO("dump size: $dump_fn"); + INFO("log size: $LOG_FN"); + if ($log_size > 0) { + my $dump_log_ratio = $dump_size / $log_size; + INFO("dump/log ratio: $dump_log_ratio"); + } + else { + WARN("weird: log file size $LOG_FN is 0"); + } + return; } @@ -1413,6 +1479,12 @@ =item * --max_iw_places Number of places to print in the statistics of pages with the most interlanguage links. +=item * --version Print the version number and exit. + +=item * --usage Print basic usage and exit. + +=item * --help Print full help and exit. + =back =head1 DESCRIPTION @@ -1545,6 +1617,10 @@ To make Perl::Critic happy :) +=item * C<Pod::Usage> + +Some style guide recommended it. I don't even remember which one, but i love style guides. + =back =head1 HACKING @@ -1564,7 +1640,7 @@ for automatic code formatting. If you modify it, do yourself a favor, install Perl::Critic and regularly test it using this command: -./tidy.sh +./tidy.sh prepare_noiw_list.pl It checks the syntax, runs perltidy on the code and runs Perl::Critic. @@ -1710,9 +1786,29 @@ =head1 LICENSE AND COPYRIGHT -This program is Free Software; you can redistribute it and/or modify it -under the same terms as Perl itself. +Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License version 3 +as published by the Free Software Foundation or under the terms of +Artistic License version 2.0. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the licenses +along with this program. If not, see + +=over + +=item * L<http://www.gnu.org/licenses/gpl-3.0.html>. + +=item * L<http://www.perlfoundation.org/artistic_license_2_0> + +=back + I<Visca la llibertat!> =cut Added: trunk/no-interwiki/stopwords.txt =================================================================== --- trunk/no-interwiki/stopwords.txt (rev 0) +++ trunk/no-interwiki/stopwords.txt 2009-09-26 23:43:11 UTC (rev 80) @@ -0,0 +1,40 @@ +ActivePerl +Aharoni +Aharoni's +Amir +Amire +backporting +CPAN +Cygwin +Drora +Felagund +Felagund's +FILENAME +filenames +filesystem +hoc +interlanguage +interwiki +Itay +llibertat +MediaWiki +multi +Nadav +namespace +Noa +outtest +param +perl +perltidy +Readonly +refactoring +Reut +rtl +Shaked +tec +Visca +wiki +Wikipedia +Wikipedias +xNUMBER +XP This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |