Thread: [PerlWikiBot] SF.net SVN: perlwikibot:[42] trunk/no-interwiki
Status: Pre-Alpha
Brought to you by:
rotemliss
From: <am...@us...> - 2008-07-28 16:15:55
|
Revision: 42 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=42&view=rev Author: amire80 Date: 2008-07-28 16:16:05 +0000 (Mon, 28 Jul 2008) Log Message: ----------- Adding config files and tidy script. Rewrote version number code. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/.perltidyrc trunk/no-interwiki/tidy.sh Added: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc (rev 0) +++ trunk/no-interwiki/.perlcriticrc 2008-07-28 16:16:05 UTC (rev 42) @@ -0,0 +1,6 @@ +#[-CodeLayout::RequireTidyCode] +#[-Miscellanea::RequireRcsKeywords] + +# English.pm doesn't support named capture variables (yet) +[Variables::ProhibitPunctuationVars] +allow = %+ $+ @+ Property changes on: trunk/no-interwiki/.perlcriticrc ___________________________________________________________________ Added: svn:executable + * Added: trunk/no-interwiki/.perltidyrc =================================================================== --- trunk/no-interwiki/.perltidyrc (rev 0) +++ trunk/no-interwiki/.perltidyrc 2008-07-28 16:16:05 UTC (rev 42) @@ -0,0 +1,9 @@ +--backup-and-modify-in-place +--maximum-line-length=78 +--continuation-indentation=4 +--nooutdent-long-lines +--nooutdent-labels +--outdent-keyword-list="next last redo goto" +--paren-tightness=2 +--nospace-for-semicolon +--nooutdent-long-comments Property changes on: trunk/no-interwiki/.perltidyrc ___________________________________________________________________ Added: svn:executable + * Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-07-28 15:46:45 UTC (rev 41) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-07-28 16:16:05 UTC (rev 42) @@ -12,10 +12,6 @@ # This program is Free Software; you can redistribute it and/or modify it # 
under the same terms as Perl itself. -# $Revision$ -# $HeadURL$ -# $Date$ - # Upgrade! This script actually uses new Perl 5.10 constructs, so you need it use 5.010; @@ -46,7 +42,16 @@ use Log::Log4perl qw(:easy); use Readonly; -our $VERSION = '0.1.9.02'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic ValuesAndExpressions::RequireInterpolationOfMetachars + Revision => '$Revision$', + HeadURL => '$HeadURL$', + Date => '$Date$', +); +#>>> +our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) + ? "0.1.$+{revision_num}" + : croak(q(Something's wrong with SVN revision number)); my %PATTERN; Readonly my $WIKITEXT_EXT => 'wiki.txt'; Added: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh (rev 0) +++ trunk/no-interwiki/tidy.sh 2008-07-28 16:16:05 UTC (rev 42) @@ -0,0 +1,26 @@ +#!/bin/bash + +FN=prepare_noiw_list.pl + +echo checking syntax +perl -c $FN +if [ $? -ne 0 ]; then + exit 1 +fi + +echo tidying +perltidy $FN +if [ $? -ne 0 ]; then + exit 1 +fi + +diff $FN.bak ${FN} +if [ $? -eq 2 ]; then + exit 1 +fi + +echo criticizing +/usr/local/bin/perlcritic -brutal $FN + +exit $? + Property changes on: trunk/no-interwiki/tidy.sh ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-07-28 16:26:10
|
Revision: 43 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=43&view=rev Author: amire80 Date: 2008-07-28 16:26:20 +0000 (Mon, 28 Jul 2008) Log Message: ----------- Adding some strings and language codes files. Added Paths: ----------- trunk/no-interwiki/eo.language_codes.txt trunk/no-interwiki/eo.strings.txt trunk/no-interwiki/he.language_codes.txt trunk/no-interwiki/he.strings.txt trunk/no-interwiki/oc.language_codes.txt Added: trunk/no-interwiki/eo.language_codes.txt =================================================================== --- trunk/no-interwiki/eo.language_codes.txt (rev 0) +++ trunk/no-interwiki/eo.language_codes.txt 2008-07-28 16:26:20 UTC (rev 43) @@ -0,0 +1 @@ +link he.language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/eo.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/eo.strings.txt =================================================================== --- trunk/no-interwiki/eo.strings.txt (rev 0) +++ trunk/no-interwiki/eo.strings.txt 2008-07-28 16:26:20 UTC (rev 43) @@ -0,0 +1,34 @@ +# months +January Januaro +February Februaro +March Marto +April Aprilo +May Majo +June Junio +July Julio +August Auxgusto +September Septembro +October Oktobro +November Novembro +December Decembro + +in + +no_iw no estas interviki +category kategorio +disambig apartigilo +template Sxablono + +date dato +type tipo + +# MW specials +REDIRECT REDIRECT + +# Namespaces +User Vikipediisto +User talk Vikipediista diskuto +Image Dosiero + +# Other +other alia Added: trunk/no-interwiki/he.language_codes.txt =================================================================== --- trunk/no-interwiki/he.language_codes.txt (rev 0) +++ trunk/no-interwiki/he.language_codes.txt 2008-07-28 16:26:20 UTC (rev 43) @@ -0,0 +1,150 @@ +en English +de German +fr French +pl Polish +ja Japanese +it Italian +ru Russian +nl Dutch +pt Portuguese +es Spanish +sv 
Swedish +ru Russian +zh Chinese +no Norwegian Bokmal +fi Finnish +vo Volapuk +ca Catalan +ro Romanian +tr Turkish +uk Ukrainian +eo Esperanto +cs Czech +hu Hungarian +sk Slovak +da Danish +id Indonesian +he Hebrew +lt Lithuanian +sr Serbian +sl Slovenian +ko Korean +ar Arabic +bg Bulgarian +et Estonian +hr Croatian +new Newari +te Telugu +vi Vietnamese +nn Norwegian Nynorsk +th Thai +fa Persian +ga Galician +ceb Cebuano +el Greek +ms Malay +simple Simple English +eu Basque +bpy Bishnupriya Manipuri +bs Bosnian +lb Luxembourgish +is Icelandic +ka Georgian +sq Albanian +la Latin +br Breton +hi Hindi +az Azeri +bn Bengali +mk Macedonian +mr Marathi +sh Serbocroatian +tl Tagalog +cy Welsh +io Ido +pms Piedmontese +lv Latvian +su Sundanese +ta Tamil +jv Javanese +nap Neapolitan +oc Occitan +nds Low German +scn Sicilian +ast Asturian +ku Kurdish +be Belarusian (modern) +be-x-old Belarusian (tarashkevitsa) +tg Tajik +an Aragonese +ksh Ripuarian +fy Frisian +vec Venetian +roa-tara Tarantino +cv Chuvash +zh-yue Cantonese +ur Urdu +qu Quechua +sw Swahili +uz Uzbek +bat-smg Samogitian +ga Irish Gaelic +mi Maori +ml Malayalam +gd Scottish Gaelic +yo Yoruba +co Corsican +kn Kannada +pam Kapampangan +yi Yiddish +hsb Upper Sorbian +nah Nahuatl +ia Interlingua +li Limburg +sa Sanskrit +hy Armenian +als Alemannic +tt Tatar +roa-rup Aromanian +map-bms Banyumasan +pag Pangasinan +am Amharic +zh-min-nan Min Nan +nrm Norman +wuu Wuu +fo Faroese +vls West Flemish +lmo Lombard +nds-nl Dutch Low Saxon +se Northern Sami +rm Romansh +ne Nepali +war Waray-Waray +fur Friulian +lij Ligurian +nov Novial +sco Scots +bh Bihari +dv Divehi +pi Pali +diq Zazaki +ilo Ilokano +kk Kazakh +os Ossetian +zh-classical Classical Chinese +frp Franco Provencal +mt Maltese +lad Ladino +fiu-vro Voro +pdc Pennsylvania German +csb Kashubian +kw Cornish +bar Bavarian +to Tongan +haw Hawaii +mn Mongolian +ps Pashto +km Khmer +gv Manx +tk Turkmen +ln Lingala Added: trunk/no-interwiki/he.strings.txt 
=================================================================== --- trunk/no-interwiki/he.strings.txt (rev 0) +++ trunk/no-interwiki/he.strings.txt 2008-07-28 16:26:20 UTC (rev 43) @@ -0,0 +1,37 @@ +# months +January ינואר +February פברואר +March מרץ +April אפריל +May מאי +June יוני +July יולי +August אוגוסט +September ספטמבר +October אוקטובר +November נובמבר +December דצמבר + +in ב + +no_iw אין בינוויקי +category קטגוריה +disambig פירושונים +template תבנית + +date תאריך +type סוג + +# MW specials +REDIRECT הפניה + +# Namespaces +User משתמש +User talk שיחת משתמש +Image תמונה + +# Other +other אחר +rlm {{כ}} +exclude_lowercase ß + Added: trunk/no-interwiki/oc.language_codes.txt =================================================================== --- trunk/no-interwiki/oc.language_codes.txt (rev 0) +++ trunk/no-interwiki/oc.language_codes.txt 2008-07-28 16:26:20 UTC (rev 43) @@ -0,0 +1 @@ +link he.language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/oc.language_codes.txt ___________________________________________________________________ Added: svn:special + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-07-30 18:08:37
|
Revision: 51 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=51&view=rev Author: amire80 Date: 2008-07-30 18:08:45 +0000 (Wed, 30 Jul 2008) Log Message: ----------- Count pages without interwiki per namespace. Modified Paths: -------------- trunk/no-interwiki/eo.strings.txt trunk/no-interwiki/he.strings.txt trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/eo.strings.txt =================================================================== --- trunk/no-interwiki/eo.strings.txt 2008-07-30 17:48:29 UTC (rev 50) +++ trunk/no-interwiki/eo.strings.txt 2008-07-30 18:08:45 UTC (rev 51) @@ -29,6 +29,7 @@ User Vikipediisto User talk Vikipediista diskuto Image Dosiero +article space (nomspace de artikoloj) # Other other alia Modified: trunk/no-interwiki/he.strings.txt =================================================================== --- trunk/no-interwiki/he.strings.txt 2008-07-30 17:48:29 UTC (rev 50) +++ trunk/no-interwiki/he.strings.txt 2008-07-30 18:08:45 UTC (rev 51) @@ -29,6 +29,7 @@ User משתמש User talk שיחת משתמש Image תמונה +article space (מרחב ערכים) # Other other אחר Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-07-30 17:48:29 UTC (rev 50) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-07-30 18:08:45 UTC (rev 51) @@ -146,10 +146,7 @@ # This monstrosity basically says: | and optional spaces $PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; -Readonly my @INCLUDE_NAMESPACES => ( - q{}, # Empty is a specific case - 'category', -); +Readonly my @INCLUDE_NAMESPACES => ('article space', 'category',); # # Constants for date processing @@ -342,6 +339,11 @@ exit; +sub namespace { + my ($page) = @_; + return $page->namespace() || $STRING{'article space'}; +} + sub find_iwless { PAGE: while (my $page = $dump->page()) { @@ -357,7 +359,7 @@ next PAGE if ($page_counter < $option{start_from}); - my $namespace = 
$page->namespace() || 'main'; + my $namespace = namespace($page); $namespace_count{$namespace}++; # Skipping cases: @@ -402,7 +404,7 @@ ) = @_; INFO(q(does not have iw link.)); - $statistics{'has no interwiki link'}++; + $statistics{'has no interwiki link'}->{ namespace($page) }++; # Now we need to search for no_iw templates # and parse their parameters - date and type @@ -1020,7 +1022,7 @@ sub is_in_namespace { my ($page, @namespaces) = @_; - return $page->namespace() ~~ [ map { get_string($_) } @namespaces ]; + return namespace($page) ~~ [ map { get_string($_) } @namespaces ]; } sub is_category { @@ -1173,6 +1175,12 @@ while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { } } + INFO('pages without interwiki links per namespace'); + foreach my $namespace (keys %{ $statistics{'has no interwiki link'} }) { + INFO( + "$namespace: $statistics{'has no interwiki link'}->{$namespace}"); + } + INFO("\nNAMESPACES"); foreach my $namespace (sort keys %namespace_count) { INFO("$namespace: $namespace_count{$namespace}"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-07-31 10:57:39
|
Revision: 55 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=55&view=rev Author: amire80 Date: 2008-07-31 10:57:47 +0000 (Thu, 31 Jul 2008) Log Message: ----------- Simple support for portals. Modified Paths: -------------- trunk/no-interwiki/eo.strings.txt trunk/no-interwiki/he.strings.txt trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/eo.strings.txt =================================================================== --- trunk/no-interwiki/eo.strings.txt 2008-07-31 10:35:08 UTC (rev 54) +++ trunk/no-interwiki/eo.strings.txt 2008-07-31 10:57:47 UTC (rev 55) @@ -15,7 +15,6 @@ in no_iw no estas interviki -category kategorio disambig apartigilo template Sxablono @@ -29,6 +28,8 @@ User Vikipediisto User talk Vikipediista diskuto Image Dosiero +Portal Portalo +Category kategorio article space (nomspace de artikoloj) # Other Modified: trunk/no-interwiki/he.strings.txt =================================================================== --- trunk/no-interwiki/he.strings.txt 2008-07-31 10:35:08 UTC (rev 54) +++ trunk/no-interwiki/he.strings.txt 2008-07-31 10:57:47 UTC (rev 55) @@ -15,7 +15,6 @@ in ב no_iw אין בינוויקי -category קטגוריה disambig פירושונים template תבנית @@ -29,6 +28,8 @@ User משתמש User talk שיחת משתמש Image תמונה +Portal פורטל +Category קטגוריה article space (מרחב ערכים) # Other Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-07-31 10:35:08 UTC (rev 54) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-07-31 10:57:47 UTC (rev 55) @@ -100,6 +100,7 @@ croak('Invalid command line options.'); } +# XXX Too coupled to Wikipedia, won't work for other projects. 
$PATTERN{dump_fn} = qr{ \A # Begin string (?<wiki_lang>\w+) # Lang code @@ -146,7 +147,7 @@ # This monstrosity basically says: | and optional spaces $PATTERN{param_sep} = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; -Readonly my @INCLUDE_NAMESPACES => ('article space', 'category',); +Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal'); # # Constants for date processing @@ -498,10 +499,16 @@ my $page_title = $page->title(); if (is_category($page)) { INFO("$page_title is a category"); - push @all_types, get_string('category'); + push @all_types, get_string('Category'); $statistics{'categories'}++; } + if (is_in_namespace($page, 'Portal')) { + INFO("$page_title is a portal"); + push @all_types, get_string('Portal'); + $statistics{'portal'}++; + } + if (is_disambig($page)) { INFO("$page_title is a disambiguation"); push @all_types, get_string('disambig'); @@ -1037,7 +1044,7 @@ sub is_category { my ($page) = @_; - return is_in_namespace($page, 'category'); + return is_in_namespace($page, 'Category'); } sub is_disambig { @@ -1115,8 +1122,8 @@ return; } -# It appears simple, but non-alphabetic languages such as Chinese it must be -# different, so it will sit here ready for better i18n. +# It appears simple, but in non-alphabetic languages such as Chinese +# it may be different, so it will sit here ready for better i18n. sub get_sort_letter { my ($string) = @_; return substr $string, 0, 1; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-08-04 10:20:33
|
Revision: 60 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=60&view=rev Author: amire80 Date: 2008-08-04 10:20:39 +0000 (Mon, 04 Aug 2008) Log Message: ----------- Updated language list, thanks to rotemliss. Now properly reading only the needed languages. Modified Paths: -------------- trunk/no-interwiki/eo.language_codes.txt trunk/no-interwiki/he.language_codes.txt trunk/no-interwiki/oc.language_codes.txt trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/language_codes.txt Property Changed: ---------------- trunk/no-interwiki/he.language_codes.txt Modified: trunk/no-interwiki/eo.language_codes.txt =================================================================== --- trunk/no-interwiki/eo.language_codes.txt 2008-08-03 14:38:23 UTC (rev 59) +++ trunk/no-interwiki/eo.language_codes.txt 2008-08-04 10:20:39 UTC (rev 60) @@ -1 +1 @@ -link he.language_codes.txt \ No newline at end of file +link language_codes.txt \ No newline at end of file Modified: trunk/no-interwiki/he.language_codes.txt =================================================================== --- trunk/no-interwiki/he.language_codes.txt 2008-08-03 14:38:23 UTC (rev 59) +++ trunk/no-interwiki/he.language_codes.txt 2008-08-04 10:20:39 UTC (rev 60) @@ -1,150 +1 @@ -en English -de German -fr French -pl Polish -ja Japanese -it Italian -ru Russian -nl Dutch -pt Portuguese -es Spanish -sv Swedish -ru Russian -zh Chinese -no Norwegian Bokmal -fi Finnish -vo Volapuk -ca Catalan -ro Romanian -tr Turkish -uk Ukrainian -eo Esperanto -cs Czech -hu Hungarian -sk Slovak -da Danish -id Indonesian -he Hebrew -lt Lithuanian -sr Serbian -sl Slovenian -ko Korean -ar Arabic -bg Bulgarian -et Estonian -hr Croatian -new Newari -te Telugu -vi Vietnamese -nn Norwegian Nynorsk -th Thai -fa Persian -ga Galician -ceb Cebuano -el Greek -ms Malay -simple Simple English -eu Basque -bpy Bishnupriya Manipuri -bs Bosnian -lb Luxembourgish -is Icelandic -ka Georgian -sq Albanian -la Latin -br 
Breton -hi Hindi -az Azeri -bn Bengali -mk Macedonian -mr Marathi -sh Serbocroatian -tl Tagalog -cy Welsh -io Ido -pms Piedmontese -lv Latvian -su Sundanese -ta Tamil -jv Javanese -nap Neapolitan -oc Occitan -nds Low German -scn Sicilian -ast Asturian -ku Kurdish -be Belarusian (modern) -be-x-old Belarusian (tarashkevitsa) -tg Tajik -an Aragonese -ksh Ripuarian -fy Frisian -vec Venetian -roa-tara Tarantino -cv Chuvash -zh-yue Cantonese -ur Urdu -qu Quechua -sw Swahili -uz Uzbek -bat-smg Samogitian -ga Irish Gaelic -mi Maori -ml Malayalam -gd Scottish Gaelic -yo Yoruba -co Corsican -kn Kannada -pam Kapampangan -yi Yiddish -hsb Upper Sorbian -nah Nahuatl -ia Interlingua -li Limburg -sa Sanskrit -hy Armenian -als Alemannic -tt Tatar -roa-rup Aromanian -map-bms Banyumasan -pag Pangasinan -am Amharic -zh-min-nan Min Nan -nrm Norman -wuu Wuu -fo Faroese -vls West Flemish -lmo Lombard -nds-nl Dutch Low Saxon -se Northern Sami -rm Romansh -ne Nepali -war Waray-Waray -fur Friulian -lij Ligurian -nov Novial -sco Scots -bh Bihari -dv Divehi -pi Pali -diq Zazaki -ilo Ilokano -kk Kazakh -os Ossetian -zh-classical Classical Chinese -frp Franco Provencal -mt Maltese -lad Ladino -fiu-vro Voro -pdc Pennsylvania German -csb Kashubian -kw Cornish -bar Bavarian -to Tongan -haw Hawaii -mn Mongolian -ps Pashto -km Khmer -gv Manx -tk Turkmen -ln Lingala +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/he.language_codes.txt ___________________________________________________________________ Added: svn:special + 1 Added: trunk/no-interwiki/language_codes.txt =================================================================== --- trunk/no-interwiki/language_codes.txt (rev 0) +++ trunk/no-interwiki/language_codes.txt 2008-08-04 10:20:39 UTC (rev 60) @@ -0,0 +1,253 @@ +en English +de German +fr French +pl Polish +ja Japanese +it Italian +nl Dutch +pt Portuguese +es Spanish +ru Russian +sv Swedish +zh Chinese +no Norwegian Bokmal +fi Finnish +ca 
Catalan +uk Ukrainian +vo Volapuk +ro Romanian +tr Turkish +cs Czech +eo Esperanto +hu Hungarian +sk Slovak +da Danish +id Indonesian +he Hebrew +ko Korean +lt Lithuanian +ar Arabic +sr Serbian +sl Slovenian +bg Bulgarian +et Estonian +hr Croatian +vi Vietnamese +new Newari +fa Persian +te Telugu +nn Norwegian Nynorsk +gl Galician +th Thai +el Greek +ceb Cebuano +simple Simple English +ms Malay +eu Basque +ht Haitian +bs Bosnian +lb Luxembourgish +bpy Bishnupriya Manipuri +ka Georgian +is Icelandic +la Latin +sq Albanian +hi Hindi +br Breton +az Azeri +mr Marathi +mk Macedonian +sh Serbocroatian +tl Tagalog +bn Bengali +cy Welsh +lv Latvian +pms Piedmontese +io Ido +ta Tamil +oc Occitan +su Sundanese +jv Javanese +be Belarusian (modern) +nap Neapolitan +nds Low German +scn Sicilian +be-x-old Belarusian (tarashkevitsa) +ku Kurdish +ast Asturian +wa Walloon +af Afrikaans +an Aragonese +ksh Ripuarian +fy Frisian +tg Tajik +zh-yue Cantonese +cv Chuvash +ur Urdu +roa-tara Tarantino +vec Venetian +qu Quechua +sw Swahili +bat-smg Samogitian +ml Malayalam +ga Irish Gaelic +uz Uzbek +gd Scottish Gaelic +mi Maori +yo Yoruba +kn Kannada +pam Kapampangan +co Corsican +yi Yiddish +hsb Upper Sorbian +nah Nahuatl +ia Interlingua +li Limburg +als Alemannic +hy Armenian +sa Sanskrit +tt Tatar +roa-rup Aromanian +am Amharic +fo Faroese +zh-min-nan Min Nan +pag Pangasinan +map-bms Banyumasan +nds-nl Dutch Low Saxon +nrm Norman +lmo Lombard +vls West Flemish +rm Romansh +diq Zazaki +se Northern Sami +ne Nepali +fur Friulian +dv Divehi +war Waray-Waray +kk Kazakh +lij Ligurian +sco Scots +nov Novial +bh Bihari +pi Pali +ilo Ilokano +mt Maltese +zh-classical Classical Chinese +os Ossetian +frp Franco Provencal +wuu Wuu +bar Bavarian +lad Ladino +gu +fiu-vro Voro +gv +pdc Pennsylvania German +csb Kashubian +mn Mongolian +kw Cornish +to Tongan +haw Hawaii +gan +km Khmer +ps Pashto +ang +ie +tk Turkmen +ln Lingala +gn +bcl +tpi +si +wo +crh +ty +srn +zea +sc +cbk-zam +jbo +ay +ky +eml +myv 
+szl +ig +my +mg +or +stq +kg +glk +arc +rmy +pap +kab +so +ba +ks +sah +mzn +ce +lo +pa +udm +tet +hak +cu +hif +sd +ext +iu +kaa +na +got +dsb +bo +sm +bm +cdo +chr +om +ee +ug +as +ti +av +zu +mdf +kv +nv +ss +pih +cr +ts +ve +ch +bi +xh +rw +dz +tn +kl +ik +bug +bxr +xal +ny +st +tw +ak +ab +fj +ha +ff +lbe +ki +za +lg +sn +tum +sg +rn +chy +ng Modified: trunk/no-interwiki/oc.language_codes.txt =================================================================== --- trunk/no-interwiki/oc.language_codes.txt 2008-08-03 14:38:23 UTC (rev 59) +++ trunk/no-interwiki/oc.language_codes.txt 2008-08-04 10:20:39 UTC (rev 60) @@ -1 +1 @@ -link he.language_codes.txt \ No newline at end of file +link language_codes.txt \ No newline at end of file Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-03 14:38:23 UTC (rev 59) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-04 10:20:39 UTC (rev 60) @@ -257,20 +257,18 @@ while (my $line = <$lang_code_file>) { chomp $line; my ($code, $name) = split /\t/xms, $line; - $LANG_CODE{$code} = $name; + $LANG_CODE{$code} = $name // $code; } + close $lang_code_file or croak(file_error('closing', $LANG_CODE_FN, 'reading')); Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE; -# XXX Should use ALT_LANGS, but an efficient way is needed to update -# lang codes list, so in the meantime it is loose. $PATTERN{interwiki_link} = qr{ \Q$MW_SYNTAX{start_link}\E (?<lang_code> -# $ALT_LANGS - [a-zA-Z-]+ + $ALT_LANGS ) : (?<foreign_article> @@ -344,7 +342,7 @@ say 'looking for multi links'; my $begin_multi_links_time = time; -# print_multi_links_by_foreign(); +print_multi_links_by_foreign(); print_multi_links_by_local(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-08-12 07:22:21
|
Revision: 67 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=67&view=rev Author: amire80 Date: 2008-08-12 07:22:30 +0000 (Tue, 12 Aug 2008) Log Message: ----------- Adding string files for fr + some cosmetics in the main script. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/fr.language_codes.txt trunk/no-interwiki/fr.strings.txt Added: trunk/no-interwiki/fr.language_codes.txt =================================================================== --- trunk/no-interwiki/fr.language_codes.txt (rev 0) +++ trunk/no-interwiki/fr.language_codes.txt 2008-08-12 07:22:30 UTC (rev 67) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/fr.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/fr.strings.txt =================================================================== --- trunk/no-interwiki/fr.strings.txt (rev 0) +++ trunk/no-interwiki/fr.strings.txt 2008-08-12 07:22:30 UTC (rev 67) @@ -0,0 +1,39 @@ +# months +January Jan +February Feb +March Mar +April Apr +May Mag +June Jun +July Jul +August Aug +September Sep +October Oct +November Nov +December Dec + +in d' + +no_iw sense iw +disambig Homonymie +template Modèle + +date date +type type + +# MW specials +REDIRECT REDIRECT + +# Namespaces +User Utilisateur +User talk Discussion Utilisateur +Image Image +Portal Portail +Category Catégorie +article space Principal + +# Other +other autre +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-11 13:29:30 UTC (rev 66) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:22:30 UTC (rev 67) @@ -50,7 +50,7 @@ ); #>>> our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) - ? 
"0.1.$+{revision_num}" + ? "0.1.9.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); my %PATTERN; @@ -324,8 +324,9 @@ my $page_counter; -my %statistics; -$statistics{count_iw} = []; +my %statistics = ( + count_iw => [], +); my %namespace_count; my %type_count; my %found_links; @@ -1163,7 +1164,7 @@ sub print_multi_links_by_foreign { LANG_CODE: foreach my $lang_code (sort keys %found_links) { - my $filename = "$MULTI_DIR/$lang_code.txt"; + my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { @@ -1209,7 +1210,7 @@ } } - my $filename = "$MULTI_DIR/LOCAL.txt"; + my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT"; foreach my $local_multi_article (sort keys %local_multi_links) { append_to_file($filename, '* ' . mw_bold(make_link($local_multi_article))); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-08-12 07:53:55
|
Revision: 68 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=68&view=rev Author: amire80 Date: 2008-08-12 07:54:04 +0000 (Tue, 12 Aug 2008) Log Message: ----------- Nynorsk string files and a very cosmetic change to the script. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/nn.language_codes.txt trunk/no-interwiki/nn.strings.txt Added: trunk/no-interwiki/nn.language_codes.txt =================================================================== --- trunk/no-interwiki/nn.language_codes.txt (rev 0) +++ trunk/no-interwiki/nn.language_codes.txt 2008-08-12 07:54:04 UTC (rev 68) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/nn.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/nn.strings.txt =================================================================== --- trunk/no-interwiki/nn.strings.txt (rev 0) +++ trunk/no-interwiki/nn.strings.txt 2008-08-12 07:54:04 UTC (rev 68) @@ -0,0 +1,39 @@ +# months +January Jan +February Feb +March Mar +April Apr +May Mag +June Jun +July Jul +August Aug +September Sep +October Oct +November Nov +December Dec + +in d' + +no_iw sense iw +disambig Fleirtyding +template Mal + +date Dato +type type + +# MW specials +REDIRECT OMDIRIGER + +# Namespaces +User Brukar +User talk Brukardiskusjon +Image Fil +Portal Tema +Category Kategori +article space Hovud + +# Other +other other +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:22:30 UTC (rev 67) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:54:04 UTC (rev 68) @@ -324,9 +324,7 @@ my $page_counter; -my %statistics = ( - count_iw => [], -); +my %statistics = (count_iw => []); my %namespace_count; my %type_count; my 
%found_links; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-08-13 10:30:44
|
Revision: 69 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=69&view=rev Author: amire80 Date: 2008-08-13 10:30:54 +0000 (Wed, 13 Aug 2008) Log Message: ----------- Exclude namespace before redirect check, makes log shorter. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/tidy.sh Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-08-12 07:54:04 UTC (rev 68) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-08-13 10:30:54 UTC (rev 69) @@ -380,8 +380,8 @@ # Skipping cases: next PAGE if ( - is_redirect($page) - or not is_in_namespace($page, @INCLUDE_NAMESPACES) + not is_in_namespace($page, @INCLUDE_NAMESPACES) + or is_redirect($page) # TODO: Be more precise here. # Portal pages which have a '/' in their name are probably Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2008-08-12 07:54:04 UTC (rev 68) +++ trunk/no-interwiki/tidy.sh 2008-08-13 10:30:54 UTC (rev 69) @@ -14,7 +14,7 @@ exit 1 fi -diff $FN ${FN}.bak +diff $FN.bak ${FN} if [ $? -eq 2 ]; then exit 1 fi This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2008-11-21 22:39:42
|
Revision: 71 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=71&view=rev Author: amire80 Date: 2008-11-21 22:39:40 +0000 (Fri, 21 Nov 2008) Log Message: ----------- POD spelling, critic update. Modified Paths: -------------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc 2008-11-21 20:57:24 UTC (rev 70) +++ trunk/no-interwiki/.perlcriticrc 2008-11-21 22:39:40 UTC (rev 71) @@ -1,6 +1,7 @@ -#[-CodeLayout::RequireTidyCode] -#[-Miscellanea::RequireRcsKeywords] - # English.pm doesn't support named capture variables (yet) [Variables::ProhibitPunctuationVars] allow = %+ $+ @+ + +# This is probably useful, but appears buggy, so it should remain manual +[-Documentation::PodSpelling] + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 20:57:24 UTC (rev 70) +++ trunk/no-interwiki/prepare_noiw_list.pl 2008-11-21 22:39:40 UTC (rev 71) @@ -1380,7 +1380,7 @@ it will be removed automatically in the future.) =item * If the page contains no links to the defined languages and does not -comtain the template, it is automatically added to type "other". +contain the template, it is automatically added to type "other". =item * If the page contains no links to the defined languages and a template with types, it is added to the defined types. @@ -1606,7 +1606,7 @@ =item * B<0.2 - Noa>: Perl 5.10. Russian l10n. POD documentation. Pretty categories sorting. Memory usage optimization - accumulating information in files. More generic, but far-from-perfect handling of links to languages -other than English. Translitetaion with Lingua::Translit. Logging with +other than English. Transliteration with Lingua::Translit. Logging with Log::Log4perl. Brutal Perl::Critic 1.90. Started using Readonly. 
Not finished: complete statistics, removal of templates from pages which already have links. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-24 22:34:08
|
Revision: 74 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=74&view=rev Author: amire80 Date: 2009-09-24 22:33:58 +0000 (Thu, 24 Sep 2009) Log Message: ----------- Fixing a lot of bitrot. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/tidy.sh Added Paths: ----------- trunk/no-interwiki/cv.language_codes.txt trunk/no-interwiki/cv.strings.txt Added: trunk/no-interwiki/cv.language_codes.txt =================================================================== --- trunk/no-interwiki/cv.language_codes.txt (rev 0) +++ trunk/no-interwiki/cv.language_codes.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/cv.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/cv.strings.txt =================================================================== --- trunk/no-interwiki/cv.strings.txt (rev 0) +++ trunk/no-interwiki/cv.strings.txt 2009-09-24 22:33:58 UTC (rev 74) @@ -0,0 +1,40 @@ +# months +January кăрлач +February нарăс +March пуш +April ака +May çу +June çĕртме +July утă +August çурла +September авăн +October юпа +November чӳк +December раштав + +in + +no_iw Интервики лартман +category Категори +disambig тĕрлĕ пĕлтерĕш +template шаблон + +date дата +type тĕс + +# MW specials +REDIRECT куçару + +# Namespaces +User Хутшăнакан +User talk Хутшăнаканăн канашлу страници +Image Ӳкерчĕк +Portal Portal +Category Категори +article space article space + +# Other +other Ытти +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-24 22:33:58 UTC (rev 74) @@ -21,6 +21,7 @@ use integer; use open ':utf8'; use utf8; +use charnames ':full'; # Standard 
library # These modules should come installed with Perl @@ -34,17 +35,14 @@ # CPAN # You must install these modules from CPAN -# Versions before 0.51 have a bug with parsing the namespace of a page +# I only use Readonly where P::C complains about it. Using it with regular +# expressions which have non-ASCII chars produces heisenbugs. use Readonly; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces -# XXX -use Devel::Leak; -use Devel::Size qw(size total_size); - #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -56,14 +54,14 @@ ? "0.1.9.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); -Readonly my $WIKITEXT_EXT => 'wiki.txt'; -Readonly my $OUT_DIR => 'out'; -Readonly my $UNSORTED_DIR => "$OUT_DIR/unsorted"; -Readonly my $MULTI_DIR => "$OUT_DIR/multilinks"; -Readonly my $ALT_SEP => q{|}; -Readonly my $FIELD_SEP => qq{\t}; -Readonly my $LINK_SEP => q{|}; -Readonly my $TYPE_SEP => qr/\s*;\s*/xms; +my $WIKITEXT_EXT = 'wiki.txt'; +my $OUT_DIR = 'out'; +my $UNSORTED_DIR = "$OUT_DIR/unsorted"; +my $MULTI_DIR = "$OUT_DIR/multilinks"; +my $ALT_SEP = q{|}; +my $FIELD_SEP = qq{\t}; +my $LINK_SEP = q{|}; +my $TYPE_SEP = qr/\s*;\s*/xms; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -104,15 +102,15 @@ } # XXX Too coupled to Wikipedia, won't work for other projects. -Readonly my $DUMP_FN_RE => qr{ +my $DUMP_FN_RE = qr{ \A # Begin string (?<wiki_lang>\w+) # Lang code wiki # Is supposed to be after the lang code }xms; -Readonly my $FIELD_SEP_RE => qr{\t}xms; +my $FIELD_SEP_RE = qr{\t}xms; -Readonly my $STRING_SKIP_RE => qr{ +my $STRING_SKIP_RE = qr{ \A # Begin string \s* # Zero or more spaces (?:\#.*)? 
# Comment lines @@ -136,7 +134,7 @@ # ISO 9 is mostly good for Russian and it is still not perfect ASCII my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); -Readonly my %MW_SYNTAX => ( +my %MW_SYNTAX = ( 'start_template' => '{{', 'end_template' => '}}', 'start_link' => '[[', @@ -148,15 +146,15 @@ ); # This monstrosity basically says: | and optional spaces -Readonly my $PARAM_SEP_RE => qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; +my $PARAM_SEP_RE = qr{\s*\Q$MW_SYNTAX{param_sep}\E\s*}xms; -Readonly my @INCLUDE_NAMESPACES => ('article space', 'Category', 'Portal'); +my @INCLUDE_NAMESPACES = ('article space', 'Category', 'Portal'); # # Constants for date processing # -Readonly my @MONTHS => @STRING{ +my @MONTHS = @STRING{ qw( January February March April May June @@ -170,7 +168,7 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize -Readonly my $HEB_DATE_RE => qr{ +my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : @@ -184,7 +182,7 @@ \z # end }xms; -Readonly my $PARAM_RE => qr{ +my $PARAM_RE = qr{ \A # Beginning of a string \s* # Zero or more space (?: # No capture @@ -196,14 +194,14 @@ }xms; # XXX It should use get_string() -Readonly my $SIMPLE_NO_IW_CHECK_RE => qr{ +my $SIMPLE_NO_IW_CHECK_RE = qr{ \Q$STRING{no_iw}\E # The string may have spaces }xmsi; # A simplistic template just for testing. # Quite possibly it is not needed anymore. # Until i get a better regex for matching balancing {{}} ... -Readonly my $TEMPLATE_RE => qr{ +my $TEMPLATE_RE = qr{ \A # beginning of string \Q$MW_SYNTAX{start_template}\E # {{ .+ # some chars @@ -211,7 +209,7 @@ \z # end of string }xms; -Readonly my $WIKITABLE_RE => qr{ +my $WIKITABLE_RE = qr{ \A \Q$MW_SYNTAX{start_wikitable}\E }xms; @@ -220,7 +218,7 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. 
-Readonly my $LOCAL_REDIRECT_RE => qr{ +my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character $STRING{REDIRECT} # Redirect keyword in local language @@ -228,60 +226,62 @@ \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -Readonly my $LTR_CHAR_RE => qr/\P{IsLeftToRight}/xms; -Readonly my $SECTION_LINK_RE => qr{(?<!&)\#}xms; -Readonly my $LOWERCASE_LINK_RE => qr{\A[[:lower:]]}xms; +my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $SECTION_LINK_RE = qr{(?<!&)\#}xms; +my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; ## no critic (ProhibitEscapedMetacharacters) -Readonly my $TRUE_TEMPLATE_RE => qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; +my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; ## use critic (ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); -Readonly my $EXCLUDE_LOWERCASE_RE => qr{ +my $EXCLUDE_LOWERCASE_RE = qr{ \A # Beginning of foreign article name [$string_exclude_lowercase] # Character class of possibly lowercase chars }xms; -Readonly my $NUMBERED_FILE_RE => qr{ +my $NUMBERED_FILE_RE = qr{ (?: _ \d*)? \.$WIKITEXT_EXT }xms; -Readonly my $INVALID_FILENAME_CHAR_RE => qr{[\\\n/:*?"<>|]}xms; # " +my $INVALID_FILENAME_CHAR_RE = qr{[\\\n/:*?"<>|]}xms; # " -my $two_digit_charnumber_re = qr{ +my $TWO_DIGIT_CHARNUMBER_RE = qr{ (?: [%.] 
# There are both %C4%B0 and .AA.E0 [[:xdigit:]]{2} # 2 hex digits ) }xms; -my $html_charnumber_re = qr{ +my $HTML_CHARNUMBER_RE = qr{ (?: &\#\d+; # stuff like Š ) }xms; # TODO: Check whether it is Neapolitan with its '' -Readonly my $CHARACTER_CODE_IN_LINK_RE => qr{ - $two_digit_charnumber_re | $html_charnumber_re +my $CHARACTER_CODE_IN_LINK_RE = qr{ + $TWO_DIGIT_CHARNUMBER_RE | $HTML_CHARNUMBER_RE }xms; my %LANG_CODE; -Readonly my $LANG_CODE_FN => "$WIKI_LANG.language_codes.txt"; -open my $lang_code_file, '<', $LANG_CODE_FN +my $LANG_CODE_FN = "$WIKI_LANG.language_codes.txt"; +open my $lang_code_file, '<', $LANG_CODE_FN ## no critic (RequireBriefOpen) or croak(file_error('opening', $LANG_CODE_FN, 'reading')); while (my $line = <$lang_code_file>) { chomp $line; my ($code, $name) = split /\t/xms, $line; - $LANG_CODE{$code} = $name // $code; # / + if (defined $code) { + $LANG_CODE{$code} = $name // $code; # / + } } close $lang_code_file or croak(file_error('closing', $LANG_CODE_FN, 'reading')); -Readonly my $ALT_LANGS => join $ALT_SEP, keys %LANG_CODE; +my $ALT_LANGS = join $ALT_SEP, keys %LANG_CODE; -Readonly my $INTERWIKI_LINK_RE => qr{ +my $INTERWIKI_LINK_RE = qr{ \Q$MW_SYNTAX{start_link}\E (?<lang_code> $ALT_LANGS @@ -294,7 +294,7 @@ }xms; # Lojban allows lowercase articles -Readonly my @LOWERCASE_LANGS => qw(jbo); +my @LOWERCASE_LANGS = qw(jbo); Readonly my $COOLING_DAYS => 120; Readonly my $COOLING_SECONDS => $COOLING_DAYS * 24 * 60 * 60; @@ -321,7 +321,7 @@ my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @{ $dump->namespaces_names() }; -Readonly my $PURE_TITLE_RE => qr{ +my $PURE_TITLE_RE = qr{ \A (?: (?:$namespaces_alt) @@ -357,7 +357,6 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; print_multi_links_by_foreign(); @@ -383,18 +382,11 @@ } sub find_iwless { - my $leak_handle; - my $leak_count; - PAGE: while (my $page_ref = next_page()) 
{ $page_counter++; if ($page_counter % $option{page_freq} == 0) { say $page_counter; - # my $lead_count = Devel::Leak::NoteSV($leak_handle); - # say "leak count: $lead_count"; - # say 'Devel::Size size of buck dump: ', total_size($dump); - # say 'Devel::Size total_size of buck page_ref: ', total_size($page_ref); } last PAGE @@ -406,27 +398,49 @@ my $page_namespace = namespace($page_ref); my $page_title = $page_ref->title(); + my $page_text_ref = $page_ref->text(); - # Skipping cases: - next PAGE - if ( - not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) - or is_redirect($page_ref) + if (not defined $page_text_ref) { + WARN('ref to page text undefined, skipping'); + $statistics{'ref to page text undefined'}++; + next PAGE; + } - # TODO: Be more precise here. - # Portal pages which have a '/' in their name are probably - # internal and do not need interwiki links. - or (is_in_namespace($page_ref, 'Portal') - and $page_title =~ m{/}xms) - ); + if (not defined ${$page_text_ref}) { + WARN('page text undefined, skipping'); + $statistics{'page text undefined'}++; + next PAGE; + } + if (${$page_text_ref} eq q()) { + WARN('page text empty, skipping'); + $statistics{'page text empty'}++; + next PAGE; + } + + if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) { + next PAGE; + } + + if (my $redirect_type = is_redirect($page_ref)) { + $statistics{"redirect - $redirect_type"}++; + INFO("\n$page_title - $redirect_type redirect"); + next PAGE; + } + + # TODO: Be more precise here. + # Portal pages which have a '/' in their name are probably + # internal and do not need interwiki links. + if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + next PAGE; + } + $namespace_count{$page_namespace}++; INFO("\n* processing $page_counter - ", $page_title); - my $page_text_ref = $page_ref->text(); - # A simple sanity check: is the no_iw template anywhere around here? 
- my $has_template_no_iw = ($page_text_ref =~ $SIMPLE_NO_IW_CHECK_RE); + my $has_template_no_iw = + (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English @@ -792,7 +806,15 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO("iw link count for $page_title: $count_iw"); + INFO( + "iw link count for $page_title" + . ( + $option{'rlm'} + ? "\N{LEFT-TO-RIGHT MARK}" + : q() + ) + . " is: $count_iw" + ); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -1061,7 +1083,7 @@ # Custom Unicode character property for finding characters. # The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (Capitalization) +sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1070,19 +1092,19 @@ } sub is_redirect { - my ($page) = @_; - my $page_title = $page->title(); - my $page_text_ref = $page->text(); + my ($page) = @_; if ($page->redirect()) { - INFO("\nEnglish redirect: $page_title"); - return 1; + return 'English'; } + + my $page_text_ref = $page->text(); + if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { - INFO("\nLocal redirect: $page_title"); - return 1; + return 'local'; } - return 0; + + return q(); } sub is_in_namespace { @@ -1290,13 +1312,20 @@ my $max_iw_index = $#{ $statistics{count_iw} }; INFO("max_iw_index: $max_iw_index"); + MAX_IW: for my $max_iw_place (0 .. $option{max_iw_places}) { my @links = - map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] }; + map { make_link($_) } @{ $statistics{count_iw}->[$max_iw_index] } + or last MAX_IW; INFO("# $max_iw_index: " . join_links(\@links, 0)); - # Do nothing, just count down to the next index with a defined list - while (not defined $statistics{count_iw}->[ --$max_iw_index ]) { } + # Do nothing, just count down to the next index with a defined list. 
+ # $max_iw_index needs to be checked for nonzero-ness for the rare case + # of a very low page count. + while ($max_iw_index + and not defined $statistics{count_iw}->[ --$max_iw_index ]) + { + } } INFO('pages without interwiki links per namespace'); @@ -1337,7 +1366,7 @@ prepare_noiw_list.pl -version 0.2 - Noa. +version 0.2.1 - Noa. =head1 USAGE @@ -1511,10 +1540,6 @@ This module is used for transliterating filenames to ASCII. -=item * C<Readonly> - -To make Perl::Critic happy :) - =back =head1 HACKING Modified: trunk/no-interwiki/tidy.sh =================================================================== --- trunk/no-interwiki/tidy.sh 2009-09-08 19:20:10 UTC (rev 73) +++ trunk/no-interwiki/tidy.sh 2009-09-24 22:33:58 UTC (rev 74) @@ -1,6 +1,6 @@ #!/bin/bash -FN=prepare_noiw_list.pl +FN=$1 echo checking syntax perl -c $FN This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-25 10:43:13
|
Revision: 76 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=76&view=rev Author: amire80 Date: 2009-09-25 10:43:03 +0000 (Fri, 25 Sep 2009) Log Message: ----------- Adding el, fixing tidy. Modified Paths: -------------- trunk/no-interwiki/.perltidyrc Added Paths: ----------- trunk/no-interwiki/el.language_codes.txt trunk/no-interwiki/el.strings.txt Modified: trunk/no-interwiki/.perltidyrc =================================================================== --- trunk/no-interwiki/.perltidyrc 2009-09-24 22:40:56 UTC (rev 75) +++ trunk/no-interwiki/.perltidyrc 2009-09-25 10:43:03 UTC (rev 76) @@ -1,4 +1,3 @@ ---backup-and-modify-in-place --maximum-line-length=78 --continuation-indentation=4 --nooutdent-long-lines Added: trunk/no-interwiki/el.language_codes.txt =================================================================== --- trunk/no-interwiki/el.language_codes.txt (rev 0) +++ trunk/no-interwiki/el.language_codes.txt 2009-09-25 10:43:03 UTC (rev 76) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/el.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/el.strings.txt =================================================================== --- trunk/no-interwiki/el.strings.txt (rev 0) +++ trunk/no-interwiki/el.strings.txt 2009-09-25 10:43:03 UTC (rev 76) @@ -0,0 +1,36 @@ +# months +January Januaro +February Februaro +March Marto +April Aprilo +May Majo +June Junio +July Julio +August Auxgusto +September Septembro +October Oktobro +November Novembro +December Decembro + +in + +no_iw iw +disambig Αποσαφήνιση +template Πρότυπο + +date dato +type tipo + +# MW specials +REDIRECT REDIRECT + +# Namespaces +User Χρήστης +User talk Συζήτηση χρήστη +Image Αρχείο +Portal Πύλη +Category Κατηγορία +article space (Κύριος χώρος) + +# Other +other other This was sent by the SourceForge.net collaborative development platform, the world's 
largest Open Source development site. |
From: <am...@us...> - 2009-09-26 23:43:19
|
Revision: 80 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=80&view=rev Author: amire80 Date: 2009-09-26 23:43:11 +0000 (Sat, 26 Sep 2009) Log Message: ----------- Copyright, MediaWikiDump 0.92 requirement, version number cleanup, RTL cleanup, logging cleanup, POD cleanup, get_string cleanup, simple dump ratio, P::C cleanup, adding pod spelling. Modified Paths: -------------- trunk/no-interwiki/.perlcriticrc trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/stopwords.txt Modified: trunk/no-interwiki/.perlcriticrc =================================================================== --- trunk/no-interwiki/.perlcriticrc 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/.perlcriticrc 2009-09-26 23:43:11 UTC (rev 80) @@ -2,6 +2,10 @@ [Variables::ProhibitPunctuationVars] allow = %+ $+ @+ -# This is probably useful, but appears buggy, so it should remain manual -[-Documentation::PodSpelling] +[Documentation::PodSpelling] +stop_words_file = stopwords.txt +# say is for writing to the terminal, no need to check it +[InputOutput::RequireCheckedSyscalls] +exclude_functions = say + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 10:51:33 UTC (rev 79) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-26 23:43:11 UTC (rev 80) @@ -1,7 +1,7 @@ #!/usr/bin/perl # prepare_noiw_list.pl -# version 0.2 Noa - development +# version 0.2.1 Noa - development # See the POD documentation at the end of the file or run # perldoc prepare_noiw_list.pl @@ -9,9 +9,21 @@ # "Had he been a French child, # he would have heard an infinite number of sentences" - Otto Jespersen -# This program is Free Software; you can redistribute it and/or modify it -# under the same terms as Perl itself. 
+# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 +# as published by the Free Software Foundation or under the terms of +# Artistic License version 2.0. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the licenses +# along with this program. If not, see +# * <http://www.gnu.org/licenses/gpl-3.0.html>. +# * <http://www.perlfoundation.org/artistic_license_2_0> + # Upgrade! This program actually uses new Perl 5.10 constructs, so you need it use 5.010; @@ -38,11 +50,13 @@ # I only use Readonly where P::C complains about it. Using it with regular # expressions which have non-ASCII chars produces heisenbugs. use Readonly; +use Pod::Usage; use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.51; # Earlier versions have a bug in namespaces +use Parse::MediaWikiDump 0.92; # Earlier versions have a different API +our $VERSION = '0.2.1'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) Revision => '$Revision$', @@ -50,8 +64,9 @@ Date => '$Date$', ); #>>> -our $VERSION = ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) - ? "0.1.9.$+{revision_num}" +our $FULL_VERSION = + ($SVN_PROPS{Revision} =~ /\A\$Revision:\ (?<revision_num>\d+)\ \$\z/xms) + ? 
"$VERSION.$+{revision_num}" : croak(q(Something is wrong with SVN revision number)); my $WIKITEXT_EXT = 'wiki.txt'; @@ -62,6 +77,7 @@ my $FIELD_SEP = qq{\t}; my $LINK_SEP = q{|}; my $TYPE_SEP = qr/\s*;\s*/xms; +my $HTML_RLM = '‏'; Readonly my $DEFAULT_MAX_IW_PLACES => 20; Readonly my $DEFAULT_PAGE_FREQ => 1000; @@ -69,11 +85,12 @@ Readonly my $DEFAULT_MAX_SECTIONS_PER_PAGE => 20; # Initialize logger +my $LOG_FN = 'outtest.log'; Log::Log4perl->easy_init( { - level => $INFO, # print everything - file => ':utf8>outtest.log', # utf is important - layout => '%m%n', # No need to print the date + level => $INFO, # print everything + file => ":utf8>$LOG_FN", # utf is important + layout => '%m%n', # No need to print the date } ); @@ -85,6 +102,9 @@ max_links_per_section => $DEFAULT_MAX_LINKS_PER_SECTION, max_sections_per_page => $DEFAULT_MAX_SECTIONS_PER_PAGE, max_iw_places => $DEFAULT_MAX_IW_PLACES, + help => 0, + usage => 0, + version => 0, ); my $valid_options = GetOptions( @@ -95,12 +115,42 @@ 'max_links_per_section' => \$option{max_links_per_section}, 'max_sections_per_page' => \$option{max_sections_per_page}, 'max_iw_places' => \$option{max_iw_places}, + 'help' => \$option{help}, + 'usage' => \$option{usage}, + 'version' => \$option{version}, ); if (not $valid_options) { croak('Invalid command line options.'); } +if ($option{'version'}) { + my $program_name = __FILE__; + say <<"END_VERSION"; +$program_name version $FULL_VERSION +This program searches for pages without interlanguage links (interwiki). + +Copyright 2005-2009 Guy Shaked, Nadav Perez and Amir E. Aharoni. + +This program is Free Software; you can redistribute it and/or modify it +under the terms of Artistic License version 2.0 or the +GNU General Public license version 3.0. + +Run `perldoc $program_name' for full terms. +END_VERSION + exit; +} + +if ($option{'help'} or $option{'usage'}) { + my $verbose_level = $option{'usage'} ? 
1 : 2; + pod2usage( + { + -exitval => 0, + -verbose => $verbose_level, + } + ); +} + # XXX Too coupled to Wikipedia, won't work for other projects. my $DUMP_FN_RE = qr{ \A # Begin string @@ -168,14 +218,14 @@ @REV_MONTH{@MONTHS} = (0 .. $LAST_MONTH); # XXX Internationalize +my $string_in = get_string('in'); my $HEB_DATE_RE = qr{ \A # begin (?<hour>\d{1,2}) # hour : # : (?<min>\d{2}),\s # minute (?<mday>\d{1,2})\s # day of month - $STRING{in}? # This preposition appears sometimes - # It should have been get_string() + $string_in? # This preposition appears sometimes (?<mon>$ALT_MONTHS)\s # A name of a month (?<year>\d+?)\s # Year \([A-Z]{3}\) # Three letters in brackets - timezone @@ -193,9 +243,9 @@ (?<value>.*) # value }xms; -# XXX It should use get_string() +my $string_no_iw = get_string('no_iw'); my $SIMPLE_NO_IW_CHECK_RE = qr{ - \Q$STRING{no_iw}\E # The string may have spaces + \Q$string_no_iw\E # The string may have spaces }xmsi; # A simplistic template just for testing. @@ -218,21 +268,21 @@ # Regular expression mostly copied from # Parse::MediaWikiDump::page::redirect # TODO: Try to look for the local language redirect keyword in the dump. 
+my $local_redirect = get_string('REDIRECT'); my $LOCAL_REDIRECT_RE = qr{ \A # Beginning of string (page) \# # a # character - $STRING{REDIRECT} # Redirect keyword in local language - # XXX It should use get_string() + $local_redirect # Redirect keyword in local language \s*:?\s*\[\[([^\]]*)\]\] # the link after the redirect }xmsi; -my $LTR_CHAR_RE = qr/\P{IsLeftToRight}/xms; +my $LTR_CHAR_RE = qr/\P{IsRightToLeft}/xms; # \P is negation my $SECTION_LINK_RE = qr{(?<!&)\#}xms; my $LOWERCASE_LINK_RE = qr{\A[[:lower:]]}xms; -## no critic (ProhibitEscapedMetacharacters) +## no critic (RegularExpressions::ProhibitEscapedMetacharacters) my $TRUE_TEMPLATE_RE = qr/\{ $RE{balanced}{-parens=>'{}'} \}/xms; -## use critic (ProhibitEscapedMetacharacters) +## use critic (RegularExpressions::ProhibitEscapedMetacharacters) # get_string() cannot be used in re my $string_exclude_lowercase = get_string('exclude_lowercase'); @@ -357,15 +407,16 @@ # my @found_lang_codes = sort keys %found_links; # INFO("found lang_codes: @found_lang_codes"); -say 'looking for multi links'; my $begin_multi_links_time = time; +say 'listing multi links by language'; print_multi_links_by_foreign(); +say 'listing multi links by local articles'; print_multi_links_by_local(); my $total_multi_links_time = time - $begin_multi_links_time; -say "total multi links time: $total_multi_links_time"; +INFO("total multi links time: $total_multi_links_time"); exit; @@ -784,8 +835,7 @@ { my $include_lowercase_link = 1; - # XXX get_string() cannot be used here - if (defined $STRING{exclude_lowercase} + if (defined get_string('exclude_lowercase', 'if defined') and $foreign_article =~ $EXCLUDE_LOWERCASE_RE) { $include_lowercase_link = 0; @@ -807,15 +857,7 @@ } push @{ $statistics{count_iw}->[$count_iw] }, $page_title; } - INFO( - "iw link count for $page_title" - . ( - $option{'rlm'} - ? "\N{LEFT-TO-RIGHT MARK}" - : q() - ) - . 
" is: $count_iw" - ); + INFO("iw link count for $page_title is: $count_iw"); for my $special_case_name (keys %special_cases) { if (scalar %{ $special_cases{$special_case_name} }) { @@ -835,14 +877,19 @@ sub special_cases_file { my ($special_case_name, $page, $special_cases_ref) = @_; $special_cases_ref //= {}; # / - my $special_case_langs = join q{, }, sort keys %{$special_cases_ref}; + + my $special_case_langs = join q{ }, sort keys %{$special_cases_ref}; + if ($special_case_langs) { $special_case_langs = " ($special_case_langs)"; } + my $special_case_fn = make_type_fn($special_case_name, 1); + if (not -e $special_case_fn) { append_to_file($special_case_fn, $special_case_name); } + my $page_title = $page->title(); my $link = make_link($page_title); my $line = @@ -852,6 +899,7 @@ . get_sort_title($page_title); append_to_file($special_case_fn, $line); + return; } @@ -954,12 +1002,8 @@ . $page_title . $MW_SYNTAX{end_link}; - if ($option{rtl}) { - if ($page_title =~ $LTR_CHAR_RE) { - - # XXX get_string() cannot be used here - $link_to_page = $STRING{rlm} . $link_to_page . $STRING{rlm}; - } + if ($option{rtl} and $page_title =~ $LTR_CHAR_RE) { + $link_to_page = $HTML_RLM . $link_to_page . $HTML_RLM; } return $link_to_page; @@ -968,7 +1012,7 @@ sub create_no_iw_pages { my ($params) = @_; - INFO('creating no_iw pages'); + INFO("\ncreating no_iw pages"); # Run over page types UNSORTED_TYPE_FN: @@ -1082,9 +1126,10 @@ return "'''$text'''"; } -# Custom Unicode character property for finding characters. -# The custom is to give those subroutines CamelCase names. -sub IsLeftToRight { ## no critic (ProhibitMixedCaseSubs) +# Custom Unicode character property, which is like \w, but for Hebrew. +# The custom is to give custom Unicode character classes CamelCase names. +# P::C policy ProhibitMixedCaseSubs is deprecated. 
+sub IsRightToLeft { ## no critic (Capitalization) return <<'END'; +utf8::InHebrew +utf8::IsSpace @@ -1158,8 +1203,10 @@ } sub get_string { - my ($english) = @_; - return $STRING{$english} //= $english; # / + my ($english, $if_defined) = @_; + return $if_defined + ? ($STRING{$english}) + : ($STRING{$english} //= $english); # / } sub make_type_fn { @@ -1210,6 +1257,8 @@ foreach my $lang_code (sort keys %found_links) { my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1235,6 +1284,8 @@ LANG_CODE: foreach my $lang_code (sort keys %found_links) { my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say "$lang_code ", scalar @foreign_articles; + FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { my @local_articles = @@ -1254,10 +1305,12 @@ } } + say 'writing local multilinks file'; my $filename = "$MULTI_DIR/LOCAL.$WIKITEXT_EXT"; foreach my $local_multi_article (sort keys %local_multi_links) { append_to_file($filename, '* ' . 
mw_bold(make_link($local_multi_article))); + foreach my $other_local_article ( sort keys %{ $local_multi_links{$local_multi_article} }) { @@ -1304,7 +1357,7 @@ sub print_stats { INFO("\nSUMMARY"); - say "total time: $total_time"; + INFO("total time: $total_time"); foreach my $stat_type (sort keys %statistics) { if (not ref $statistics{$stat_type}) { INFO("$stat_type: $statistics{$stat_type}"); @@ -1346,11 +1399,24 @@ foreach my $namespace (sort keys %namespace_count) { INFO("$namespace: $namespace_count{$namespace}"); } + INFO("\nTYPES"); foreach my $type (sort keys %type_count) { INFO("$type: $type_count{$type}"); } + my $dump_size = -s $dump_fn; + my $log_size = -s $LOG_FN; + INFO("dump size: $dump_fn"); + INFO("log size: $LOG_FN"); + if ($log_size > 0) { + my $dump_log_ratio = $dump_size / $log_size; + INFO("dump/log ratio: $dump_log_ratio"); + } + else { + WARN("weird: log file size $LOG_FN is 0"); + } + return; } @@ -1413,6 +1479,12 @@ =item * --max_iw_places Number of places to print in the statistics of pages with the most interlanguage links. +=item * --version Print the version number and exit. + +=item * --usage Print basic usage and exit. + +=item * --help Print full help and exit. + =back =head1 DESCRIPTION @@ -1545,6 +1617,10 @@ To make Perl::Critic happy :) +=item * C<Pod::Usage> + +Some style guide recommended it. I don't even remember which one, but i love style guides. + =back =head1 HACKING @@ -1564,7 +1640,7 @@ for automatic code formatting. If you modify it, do yourself a favor, install Perl::Critic and regularly test it using this command: -./tidy.sh +./tidy.sh prepare_noiw_list.pl It checks the syntax, runs perltidy on the code and runs Perl::Critic. @@ -1710,9 +1786,29 @@ =head1 LICENSE AND COPYRIGHT -This program is Free Software; you can redistribute it and/or modify it -under the same terms as Perl itself. +Copyright 2009 Guy Shaked, Nadav Perez, Amir E. Aharoni. 
+This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License version 3 +as published by the Free Software Foundation or under the terms of +Artistic License version 2.0. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the licenses +along with this program. If not, see + +=over + +=item * L<http://www.gnu.org/licenses/gpl-3.0.html>. + +=item * L<http://www.perlfoundation.org/artistic_license_2_0> + +=back + I<Visca la llibertat!> =cut Added: trunk/no-interwiki/stopwords.txt =================================================================== --- trunk/no-interwiki/stopwords.txt (rev 0) +++ trunk/no-interwiki/stopwords.txt 2009-09-26 23:43:11 UTC (rev 80) @@ -0,0 +1,40 @@ +ActivePerl +Aharoni +Aharoni's +Amir +Amire +backporting +CPAN +Cygwin +Drora +Felagund +Felagund's +FILENAME +filenames +filesystem +hoc +interlanguage +interwiki +Itay +llibertat +MediaWiki +multi +Nadav +namespace +Noa +outtest +param +perl +perltidy +Readonly +refactoring +Reut +rtl +Shaked +tec +Visca +wiki +Wikipedia +Wikipedias +xNUMBER +XP This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2009-09-29 13:25:18
|
Revision: 83 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=83&view=rev Author: amire80 Date: 2009-09-29 13:25:11 +0000 (Tue, 29 Sep 2009) Log Message: ----------- Refactoring empty pages handling. Modified Paths: -------------- trunk/no-interwiki/cv.strings.txt trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/cv.strings.txt =================================================================== --- trunk/no-interwiki/cv.strings.txt 2009-09-27 12:41:34 UTC (rev 82) +++ trunk/no-interwiki/cv.strings.txt 2009-09-29 13:25:11 UTC (rev 83) @@ -23,7 +23,8 @@ type тĕс # MW specials -REDIRECT куçару +#REDIRECT куçару +REDIRECT перенаправление # Namespaces User Хутшăнакан Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2009-09-27 12:41:34 UTC (rev 82) +++ trunk/no-interwiki/prepare_noiw_list.pl 2009-09-29 13:25:11 UTC (rev 83) @@ -463,55 +463,40 @@ say $page_counter; } - last PAGE - if ($option{stop_after} - and $page_counter > $option{stop_after}); - - next PAGE - if ($page_counter < $option{start_from}); - - my $page_namespace = namespace($page_ref); - my $page_title = $page_ref->title(); - my $page_text_ref = $page_ref->text(); - - if (not defined $page_text_ref) { - WARN('ref to page text undefined, skipping'); - $statistics{'ref to page text undefined'}++; - next PAGE; + if ( $option{stop_after} + and $page_counter > $option{stop_after}) + { + last PAGE; } - if (not defined ${$page_text_ref}) { - WARN('page text undefined, skipping'); - $statistics{'page text undefined'}++; - next PAGE; - } + my $page_text_ref = $page_ref->text(); - if (${$page_text_ref} eq q()) { - WARN('page text empty, skipping'); - $statistics{'page text empty'}++; + if ( + $page_counter < $option{start_from} + or not defined ${$page_text_ref} # must be tested before redirect + or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) + or is_redirect($page_ref) + 
) + { next PAGE; } - if (not is_in_namespace($page_ref, @INCLUDE_NAMESPACES)) { - next PAGE; - } + my $page_namespace = namespace($page_ref); + $namespace_count{$page_namespace}++; - if (my $redirect_type = is_redirect($page_ref)) { - $statistics{"redirect - $redirect_type"}++; - DEBUG("\n$page_title - $redirect_type redirect"); - next PAGE; - } + my $page_title = $page_ref->title(); + INFO("\n* processing $page_counter - ", $page_title); + # TODO: Be more precise here. # Portal pages which have a '/' in their name are probably # internal and do not need interwiki links. if (is_in_namespace($page_ref, 'Portal') and $page_title =~ m{/}xms) { + INFO('internal portal, skipping'); + $statistics{'internal portal'}++; next PAGE; } - $namespace_count{$page_namespace}++; - INFO("\n* processing $page_counter - ", $page_title); - # A simple sanity check: is the no_iw template anywhere around here? my $has_template_no_iw = (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-09 12:44:47
|
Revision: 85 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=85&view=rev Author: amire80 Date: 2010-03-09 12:44:41 +0000 (Tue, 09 Mar 2010) Log Message: ----------- Adding config files for ru.wikipedia. Added Paths: ----------- trunk/no-interwiki/ru.language_codes.txt trunk/no-interwiki/ru.strings.txt Added: trunk/no-interwiki/ru.language_codes.txt =================================================================== --- trunk/no-interwiki/ru.language_codes.txt (rev 0) +++ trunk/no-interwiki/ru.language_codes.txt 2010-03-09 12:44:41 UTC (rev 85) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/ru.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/ru.strings.txt =================================================================== --- trunk/no-interwiki/ru.strings.txt (rev 0) +++ trunk/no-interwiki/ru.strings.txt 2010-03-09 12:44:41 UTC (rev 85) @@ -0,0 +1,37 @@ +# months +January января +February февраля +March марта +April апреля +May мая +June июня +July июля +August августа +September сентября +October ортября +November ноября +December декабря + +no_iw Нет интервики +disambig неоднозначность +template шаблон + +date дата +type topic + +# MW specials +REDIRECT перенаправление + +# Namespaces +User Участник +User talk Обсуждение участника +Image Файл +Portal Портал +Category Категория +article space (статьи) + +# Other +other другое +rlm {{כ}} +exclude_lowercase ß + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 14:03:33
|
Revision: 87 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=87&view=rev Author: amire80 Date: 2010-03-26 14:03:26 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Starting very naive refactoring. Modified Paths: -------------- trunk/no-interwiki/prepare_noiw_list.pl Added Paths: ----------- trunk/no-interwiki/MediaWiki/ trunk/no-interwiki/MediaWiki/Toolkit.pm Added: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm (rev 0) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87) @@ -0,0 +1,104 @@ +package MediaWiki::Toolkit; + +use 5.010; + +use strict; +use warnings; +use integer; +use utf8; +use open ':encoding(utf8)'; + +use English qw(-no_match_vars); + +use base 'Exporter'; +our %EXPORT_TAGS = ( + FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], + GET_STRING => [qw( get_strings get_string )], + FILE_UTIL => [qw( file_error append_to_file )], +); +our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; + +our $VERSION = '0.01'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) + Revision => '$Revision: 0 $', + HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $', + Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', +); +#>>> + +our $FIELD_SEP = qq{\t}; +our $FIELD_SEP_RE = qr{\Q$FIELD_SEP\E}xms; + +# +# This is poor man's gettext. +# TODO: Replace it with Locale::Maketext +# +{ + my $STRING_SKIP_RE = qr{ + \A # Begin string + \s* # Zero or more spaces + (?:\#.*)? 
# Comment lines + \z # End string +}xms; + + my %string; + + sub get_strings { + my ($lang) = @_; + + my $strings_fn = "$lang.strings.txt"; + + open my $strings_file, '<', $strings_fn + or croak(file_error('opening', $strings_fn, 'reading')); + my @strings_file_lines = <$strings_file>; + close $strings_file + or croak(file_error('closing', $strings_fn, 'reading')); + + STRING_LINE: + foreach my $next_string_line (@strings_file_lines) { + + # Skip blanks and comments + next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); + + chomp $next_string_line; + my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; + + # Fallback to English if no target language string was supplied + $string{$english} = $target // $english; # / + } + + return; + } + + sub get_string { + my ($english, $if_defined) = @_; + + return $if_defined + ? ($string{$english}) + : ($string{$english} //= $english); # / + } +} + +sub file_error { + my ($operation, $fn, $access_type) = @_; + my $string = "error $operation $fn, $access_type: $OS_ERROR"; + return $string; +} + +sub append_to_file { + my ($fn, $line) = @_; + + open my $file, '>>', $fn + or croak(file_error('opening', $fn, 'appending')); + + say {$file} ($line // q{}); # / + + close $file + or croak(file_error('closing', $fn, 'appeding')); + + return; +} + +1; + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-25 15:02:28 UTC (rev 86) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 14:03:26 UTC (rev 87) @@ -31,7 +31,7 @@ use strict; use warnings; use integer; # Improves performance -use open ':utf8'; +use open ':encoding(utf8)'; use utf8; use charnames ':full'; @@ -56,6 +56,12 @@ use Log::Log4perl qw(:easy); use Parse::MediaWikiDump 0.91; # Earlier versions have a different API +use MediaWiki::Toolkit ( + qw( + :FIELD_SEP + :GET_STRING :FILE_UTIL) +); + our $VERSION = '0.2.1'; #<<< no perltidy my 
%SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) @@ -74,7 +80,6 @@ my $UNSORTED_DIR = "$OUT_DIR/unsorted"; my $MULTI_DIR = "$OUT_DIR/multilinks"; my $ALT_SEP = q{|}; -my $FIELD_SEP = qq{\t}; my $LINK_SEP = q{|}; my $TYPE_SEP = qr/\s*;\s*/xms; my $HTML_RLM = '&rlm;'; @@ -158,28 +163,19 @@ wiki # Is supposed to be after the lang code }xms; -my $FIELD_SEP_RE = qr{\t}xms; - -my $STRING_SKIP_RE = qr{ - \A # Begin string - \s* # Zero or more spaces - (?:\#.*)? # Comment lines - \z # End string -}xms; - # pages-meta-current my $dump_fn = $ARGV[0] or croak('Dump filename must be supplied as an argument.'); -my %STRING; my $WIKI_LANG; if ((basename $dump_fn) =~ $DUMP_FN_RE) { $WIKI_LANG = $+{wiki_lang}; - get_strings($WIKI_LANG); } else { croak("$dump_fn is a weird dump file name."); } +get_strings($WIKI_LANG); + # XXX - bad i18n # ISO 9 is mostly good for Russian and it is still not perfect ASCII my $TRANSLITERATOR = Lingua::Translit->new('ISO 9'); @@ -204,14 +200,12 @@ # Constants for date processing # -my @MONTHS = @STRING{ - qw( - January February March - April May June - July August September - October November December - ) - }; +my @MONTHS = map { get_string($_) } qw( + January February March + April May June + July August September + October November December +); my $ALT_MONTHS = join $ALT_SEP, @MONTHS; my %REV_MONTH; Readonly my $LAST_MONTH => 11; @@ -1163,52 +1157,16 @@ sub is_disambig { my ($page) = @_; + my $found_templates = find_templates($page->text(), [], [ get_string('disambig') ]); + return scalar @{$found_templates}; } -# -# This is poor man's gettext. 
-# TODO: Replace it with Locale::Maketext -# -sub get_strings { - my ($lang) = @_; - - my $STRINGS_FN = "$lang.strings.txt"; - - open my $STRINGS_FILE, '<:utf8', $STRINGS_FN - or croak(file_error('opening', $STRINGS_FN, 'reading')); - my @strings_file_lines = <$STRINGS_FILE>; - close $STRINGS_FILE - or croak(file_error('closing', $STRINGS_FN, 'reading')); - - STRING_LINE: - foreach my $next_string_line (@strings_file_lines) { - - # Skip blanks and comments - next STRING_LINE if ($next_string_line =~ $STRING_SKIP_RE); - - chomp $next_string_line; - my ($english, $target) = split $FIELD_SEP_RE, $next_string_line; - - # Fallback to English if no target language string was supplied - $STRING{$english} = $target // $english; # / - } - - return; -} - -sub get_string { - my ($english, $if_defined) = @_; - return $if_defined - ? ($STRING{$english}) - : ($STRING{$english} //= $english); # / -} - sub make_type_fn { my ($type, $unsorted) = @_; - $unsorted //= 0; # / + $unsorted //= 0; # / #my $transliterated_type = $TRANSLITERATOR->translit($type); my $transliterated_type = $type; @@ -1222,20 +1180,6 @@ return $type_fn; } -sub append_to_file { - my ($fn, $line) = @_; - - open my $file, '>>:utf8', $fn - or croak(file_error('opening', $fn, 'appending')); - - say {$file} ($line // q{}); # / - - close $file - or croak(file_error('closing', $fn, 'appeding')); - - return; -} - # It appears simple, but in non-alphabetic languages such as Chinese # it may be different, so it will sit here ready for better i18n. 
sub get_sort_letter { @@ -1243,12 +1187,6 @@ return substr $string, 0, 1; } -sub file_error { - my ($operation, $fn, $access_type) = @_; - my $string = "error $operation $fn for $access_type: $OS_ERROR"; - return $string; -} - sub format_link_table { ## no critic (RequireArgUnpacking) return sprintf '%-15s %8d', @_; } @@ -1259,7 +1197,7 @@ say "processing $lang_code"; my $lang_reftype = ref $found_links{$lang_code}; if ($lang_reftype ne 'HASH') { - carp('$lang_code is $lang_reftype, not hashref!'); + carp("$lang_code is $lang_reftype, not hashref!"); next LANG_CODE; } my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 15:22:09
|
Revision: 88 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=88&view=rev Author: amire80 Date: 2010-03-26 15:22:02 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Adding upload_iw.pl. Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/ru.strings.txt Added Paths: ----------- trunk/no-interwiki/upload_iw.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 14:03:26 UTC (rev 87) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:22:02 UTC (rev 88) @@ -14,7 +14,7 @@ our %EXPORT_TAGS = ( FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], GET_STRING => [qw( get_strings get_string )], - FILE_UTIL => [qw( file_error append_to_file )], + FILE_UTIL => [qw( file_error append_to_file read_file )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; @@ -81,24 +81,35 @@ } sub file_error { - my ($operation, $fn, $access_type) = @_; - my $string = "error $operation $fn, $access_type: $OS_ERROR"; + my ($operation, $filename, $access_type) = @_; + my $string = "error $operation $filename, $access_type: $OS_ERROR"; return $string; } sub append_to_file { - my ($fn, $line) = @_; + my ($filename, $line) = @_; - open my $file, '>>', $fn - or croak(file_error('opening', $fn, 'appending')); - + open my $file, '>>', $filename + or croak(file_error('opening', $filename, 'appending')); say {$file} ($line // q{}); # / - close $file - or croak(file_error('closing', $fn, 'appeding')); + or croak(file_error('closing', $filename, 'appeding')); return; } +sub read_file { + my ($filename) = @_; + + local $INPUT_RECORD_SEPARATOR = undef; + open my $file, '<', $filename + or croak(file_error('opening', $filename, 'reader')); + my $text = <$file>; + close $file + or croak(file_error('closing', $filename, 'reading')); + + return $text; +} + 1; Modified: trunk/no-interwiki/ru.strings.txt 
=================================================================== --- trunk/no-interwiki/ru.strings.txt 2010-03-26 14:03:26 UTC (rev 87) +++ trunk/no-interwiki/ru.strings.txt 2010-03-26 15:22:02 UTC (rev 88) @@ -30,6 +30,12 @@ Category Категория article space (статьи) +# pages +project prefix Википедия:Проект:Интервики/Виды/Категории + +# summaries +updating list of pages without interwiki обновление списка страниц без интервики + # Other other другое rlm {{כ}} Added: trunk/no-interwiki/upload_iw.pl =================================================================== --- trunk/no-interwiki/upload_iw.pl (rev 0) +++ trunk/no-interwiki/upload_iw.pl 2010-03-26 15:22:02 UTC (rev 88) @@ -0,0 +1,114 @@ +#!/usr/bin/perl + +use 5.010; + +use strict; +use warnings; +use open ':encoding(utf8)'; +use utf8; + +use English qw(-no_match_vars); +use Carp qw(croak cluck); +use Getopt::Long; +use Data::Dumper; +use Encode; + +use Readonly; + +use MediaWiki::API; + +use MediaWiki::Toolkit ( + qw( + :GET_STRING + :FILE_UTIL) +); + +our $VERSION = '0.01'; +#<<< no perltidy +my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) + Revision => '$Revision: 0 $', + HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $', + Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', +); +#>>> + +Readonly my $INPUT_EXTENSION => 'wiki.txt'; +Readonly my $INTERVAL_BETWEEN_EDITS => 5; + +my %option = ( + help => 0, + usage => 0, + version => 0, +); + +my $valid_options = GetOptions( + 'langcode=s' => \$option{langcode}, + 'username=s' => \$option{username}, + 'password=s' => \$option{password}, + 'help' => \$option{help}, + 'usage' => \$option{usage}, + 'version' => \$option{version}, +); + +if (not $valid_options) { + croak('Invalid command line options.'); +} + +for my $required_option (qw(langcode username password)) { + if (not $option{$required_option}) { + croak "option $required_option is required"; + } +} + 
+get_strings($option{langcode}); + +my $mw = MediaWiki::API->new(); +$mw->{config}->{api_url} = "http://$option{langcode}.wikipedia.org/w/api.php"; + +$mw->login( + { + lgname => $option{username}, + lgpassword => $option{password}, + } +) or croak $mw->{error}->{code} . ': ' . $mw->{error}->{details}; + +my $page_prefix = get_string('project prefix'); + +my $dirname = "./out.$option{langcode}/"; +my @filenames = glob $dirname . get_string('Category') . "*.$INPUT_EXTENSION"; + +say Dumper(\@filenames); + +foreach my $filename (@filenames) { + my $pagename = Encode::decode('UTF-8', "$filename"); + for ($pagename) { + s/\.$INPUT_EXTENSION\z//xms; + s/\A$dirname//xms; + } + $pagename = "$page_prefix/$pagename"; + + my $page = $mw->get_page({ title => $pagename }); + if ($page->{missing}) { + say "page $pagename is missing, trying to create"; + } + + say "uploading to $pagename"; + + $mw->edit( + { + action => 'edit', + title => $pagename, + summary => get_string('updating list of pages without interwiki'), + basetimestamp => $page->{timestamp}, + text => read_file($filename), + }, + { skip_encoding => 1, } + ) or croak $mw->{error}->{codie} . ': ' . $mw->{error}->{details}; + + sleep $INTERVAL_BETWEEN_EDITS; +} + +exit; + +__END__ + Property changes on: trunk/no-interwiki/upload_iw.pl ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-03-26 18:42:19
|
Revision: 90 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=90&view=rev Author: amire80 Date: 2010-03-26 18:42:13 +0000 (Fri, 26 Mar 2010) Log Message: ----------- Ported to MediaWiki::DumpFile, the new version of Parse::MediaWikiDump. Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/prepare_noiw_list.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 15:31:20 UTC (rev 89) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90) @@ -15,6 +15,7 @@ FIELD_SEP => [qw( $FIELD_SEP $FIELD_SEP_RE )], GET_STRING => [qw( get_strings get_string )], FILE_UTIL => [qw( file_error append_to_file read_file )], + NAMESPACE => [qw( init_namespaces namespace )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; @@ -111,5 +112,27 @@ return $text; } +my @NAMESPACES; + +sub init_namespaces { + @NAMESPACES = @_; + return; +} + +my $NAMESPACE_RE = qr/\A (?<namespace>[^:]+) : .* /xms; + +sub namespace { + my ($title) = @_; + my $namespace = q(); + + if ($title =~ $NAMESPACE_RE) { + if ($+{namespace} ~~ @NAMESPACES) { + $namespace = $+{namespace}; + } + } + + return $namespace || get_string('article space'); +} + 1; Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 15:31:20 UTC (rev 89) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90) @@ -54,12 +54,15 @@ use Regexp::Common; use Lingua::Translit; use Log::Log4perl qw(:easy); -use Parse::MediaWikiDump 0.91; # Earlier versions have a different API +use MediaWiki::DumpFile::Pages; + use MediaWiki::Toolkit ( qw( :FIELD_SEP - :GET_STRING :FILE_UTIL) + :GET_STRING :FILE_UTIL + :NAMESPACE + ) ); our $VERSION = '0.2.1'; @@ -94,7 +97,7 @@ Log::Log4perl->easy_init( { level => $INFO, # print everything - 
file => ":utf8>$LOG_FN", # utf is important + file => ":utf8>$LOG_FN", # utf8 is important layout => '%m%n', # No need to print the date } ); @@ -360,8 +363,11 @@ my $the_dump = load_dump(); -my $namespaces_alt = join $ALT_SEP, - grep { length > 0 } @{ $the_dump->namespaces_names() }; +my %NAMESPACES = $the_dump->namespaces(); +my @NAMESPACES = values %NAMESPACES; +init_namespaces(@NAMESPACES); +my $namespaces_alt = join $ALT_SEP, grep { length > 0 } @NAMESPACES; + my $PURE_TITLE_RE = qr{ \A (?: @@ -414,16 +420,11 @@ sub load_dump { my $dump; - $dump = Parse::MediaWikiDump::Pages->new($dump_fn); + $dump = MediaWiki::DumpFile::Pages->new($dump_fn); return $dump; } -sub namespace { - my ($page) = @_; - return $page->namespace() || get_string('article space'); -} - sub next_page { my ($dump) = @_; my $page_ref = eval { $dump->next(); }; @@ -447,9 +448,9 @@ last PAGE; } - my $page_text_ref = $page_ref->text(); + my $page_text = $page_ref->revision()->text(); - if (${$page_text_ref} !~ /\S/xmsi + if ($page_text !~ /\S/xmsi and not is_in_namespace($page_ref, 'User', 'User talk')) { special_cases_file('empty_page', $page_ref); @@ -457,7 +458,7 @@ if ( $page_counter < $option{start_from} - or not defined ${$page_text_ref} # must be tested before redirect + or not defined $page_text # must be tested before redirect or not is_in_namespace($page_ref, @INCLUDE_NAMESPACES) or is_redirect($page_ref) ) @@ -465,11 +466,11 @@ next PAGE; } - my $page_namespace = namespace($page_ref); + my $page_title = $page_ref->title(); + + my $page_namespace = namespace($page_title); $namespace_count{$page_namespace}++; - my $page_title = $page_ref->title(); - INFO("\n* processing $page_counter - ", $page_title); # TODO: Be more precise here. @@ -482,8 +483,7 @@ } # A simple sanity check: is the no_iw template anywhere around here? 
- my $has_template_no_iw = - (${$page_text_ref} =~ $SIMPLE_NO_IW_CHECK_RE); + my $has_template_no_iw = ($page_text =~ $SIMPLE_NO_IW_CHECK_RE); # Does the page have interwiki links? # BIG XXX Actually checks only for English @@ -493,7 +493,7 @@ INFO("has link to $has_iw"); if ($has_template_no_iw) { INFO('has template no_iw. trying to remove ...'); - remove_template_no_iw($page_text_ref); + remove_template_no_iw($page_text); $statistics{'has both valid interwiki and template'}++; special_cases_file('outdated_template', $page_ref); } @@ -513,20 +513,21 @@ $has_iw # scalar bool ) = @_; + my $page_title = $page_ref->title(); INFO(q(does not have iw link.)); - $statistics{'has no interwiki link'}->{ namespace($page_ref) }++; + $statistics{'has no interwiki link'}->{ namespace($page_title) }++; # Now we need to search for no_iw templates # and parse their parameters - date and type my @found_templates = (); - my $page_text_ref = $page_ref->text(); + my $page_text = $page_ref->revision()->text(); # Optimized - does not start searching, # if we already know that it is not there if ($has_template_no_iw) { - find_templates($page_text_ref, \@found_templates, + find_templates(\$page_text, \@found_templates, [ get_string('no_iw') ]); } @@ -566,7 +567,7 @@ } elsif (cooling_date_passed($date_ref)) { INFO('cooling date passed, updating to today ...'); - update_cooling_date($page_text_ref); + update_cooling_date($page_text); $statistics{'cooling date passed'}++; } else { @@ -583,7 +584,7 @@ my @all_types = get_all_types($template->{params}->{type}, $page_ref); foreach my $type (@all_types) { - INFO('adding ' . $page_ref->title() . " to the list as type $type"); + INFO('adding ' . $page_title . 
" to the list as type $type"); add_to_no_iw_list($page_ref, $type); $type_count{$type}++; } @@ -792,9 +793,9 @@ my ($page) = @_; my $page_title = $page->title(); - my $page_text = ${ $page->text() }; # XXX + my $page_text = $page->revision()->text(); - study $page_text; # XXX + study $page_text; # XXX my %iw_links; my %special_cases; @@ -1135,9 +1136,9 @@ return 'English'; } - my $page_text_ref = $page->text(); + my $page_text = $page->revision()->text(); - if (${$page_text_ref} =~ $LOCAL_REDIRECT_RE) { + if ($page_text =~ $LOCAL_REDIRECT_RE) { return 'local'; } @@ -1147,7 +1148,8 @@ sub is_in_namespace { my ($page, @namespaces) = @_; - return namespace($page) ~~ [ map { get_string($_) } @namespaces ]; + return namespace($page->title()) ~~ + [ map { get_string($_) } @namespaces ]; } sub is_category { @@ -1158,8 +1160,8 @@ sub is_disambig { my ($page) = @_; - my $found_templates = - find_templates($page->text(), [], [ get_string('disambig') ]); + my $found_templates = find_templates(\$page->revision()->text(), + [], [ get_string('disambig') ]); return scalar @{$found_templates}; } @@ -1497,7 +1499,7 @@ =head2 unable to handle any case setting besides 'first-letter' Something is weird with the dump. See the documentation of -L<Parse::MediaWikiDump> and MediaWiki. +L<MediaWiki::DumpFile> and MediaWiki. =head2 A page has no pure title @@ -1551,7 +1553,7 @@ =over -=item * C<Parse::MediaWikiDump> +=item * C<MediaWiki::DumpFile> This module is used for reading pages from the XML dump. @@ -1671,7 +1673,7 @@ =item * Statistics and multi links are just slapped to the log. =item * At least some of the code can be rewritten as classes that inherit -from L<Parse::MediaWikiDump>. +from L<MediaWiki::DumpFile>. =back This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <am...@us...> - 2010-05-26 15:37:06
|
Revision: 91 http://perlwikibot.svn.sourceforge.net/perlwikibot/?rev=91&view=rev Author: amire80 Date: 2010-05-26 15:36:59 +0000 (Wed, 26 May 2010) Log Message: ----------- Strings for Masry (arz). Modified Paths: -------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/prepare_noiw_list.pl trunk/no-interwiki/upload_iw.pl Added Paths: ----------- trunk/no-interwiki/arz.language_codes.txt trunk/no-interwiki/arz.strings.txt Property Changed: ---------------- trunk/no-interwiki/MediaWiki/Toolkit.pm trunk/no-interwiki/upload_iw.pl Modified: trunk/no-interwiki/MediaWiki/Toolkit.pm =================================================================== --- trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/MediaWiki/Toolkit.pm 2010-05-26 15:36:59 UTC (rev 91) @@ -16,15 +16,16 @@ GET_STRING => [qw( get_strings get_string )], FILE_UTIL => [qw( file_error append_to_file read_file )], NAMESPACE => [qw( init_namespaces namespace )], + DUMP => [qw( load_dump )], ); our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS; our $VERSION = '0.01'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) - Revision => '$Revision: 0 $', - HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/MediaWiki/Toolkit.pm $', - Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', + Revision => '$Revision$', + HeadURL => '$HeadURL$', + Date => '$Date$', ); #>>> @@ -134,5 +135,13 @@ return $namespace || get_string('article space'); } +sub load_dump { + my ($dump_fn) = @_; + + my $dump = MediaWiki::DumpFile::Pages->new($dump_fn); + + return $dump; +} + 1; Property changes on: trunk/no-interwiki/MediaWiki/Toolkit.pm ___________________________________________________________________ Added: svn:keywords + Revision HeadURL Date Added: trunk/no-interwiki/arz.language_codes.txt =================================================================== --- 
trunk/no-interwiki/arz.language_codes.txt (rev 0) +++ trunk/no-interwiki/arz.language_codes.txt 2010-05-26 15:36:59 UTC (rev 91) @@ -0,0 +1 @@ +link language_codes.txt \ No newline at end of file Property changes on: trunk/no-interwiki/arz.language_codes.txt ___________________________________________________________________ Added: svn:special + * Added: trunk/no-interwiki/arz.strings.txt =================================================================== --- trunk/no-interwiki/arz.strings.txt (rev 0) +++ trunk/no-interwiki/arz.strings.txt 2010-05-26 15:36:59 UTC (rev 91) @@ -0,0 +1,37 @@ +# months +January يناير +February فبراير +March مارس +April ابريل +May مايو +June يونيه +July يوليه +August اغسطس +September سبتمبر +October اكتوبر +November نوفمبر +December ديسمبر + +no_iw no_iw +disambig توضيح +template قالب + +date تاريخ +type نوع + +# MW specials +REDIRECT تحويل + +# Namespaces +User مستخدم +User talk نقاش المستخدم +Image ملف +Portal بوابة +Category تصنيف +article space (رئيسى) + +# Other +other متفرقات +rlm ‏ +exclude_lowercase ß + Modified: trunk/no-interwiki/prepare_noiw_list.pl =================================================================== --- trunk/no-interwiki/prepare_noiw_list.pl 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/prepare_noiw_list.pl 2010-05-26 15:36:59 UTC (rev 91) @@ -62,6 +62,7 @@ :FIELD_SEP :GET_STRING :FILE_UTIL :NAMESPACE + :DUMP ) ); @@ -361,7 +362,7 @@ } } -my $the_dump = load_dump(); +my $the_dump = load_dump($dump_fn); my %NAMESPACES = $the_dump->namespaces(); my @NAMESPACES = values %NAMESPACES; @@ -417,14 +418,6 @@ exit; -sub load_dump { - my $dump; - - $dump = MediaWiki::DumpFile::Pages->new($dump_fn); - - return $dump; -} - sub next_page { my ($dump) = @_; my $page_ref = eval { $dump->next(); }; @@ -1203,11 +1196,14 @@ next LANG_CODE; } my $filename = "$MULTI_DIR/$lang_code.$WIKITEXT_EXT"; + say 'sort keys found_links lang_code'; # XXX my @foreign_articles = sort keys %{ $found_links{$lang_code} }; + say 
'format_link_table lang_code scalar foreign_articles'; # XXX say format_link_table($lang_code, scalar @foreign_articles); FOREIGN_ARTICLE: foreach my $foreign_article (@foreign_articles) { + say 'local_articles = keys found_links lang_code'; my @local_articles = keys %{ $found_links{$lang_code}->{$foreign_article} }; @@ -1219,6 +1215,7 @@ } if (scalar @local_articles > 1) { + say 'links = join sort map make_link'; my $links = join q{ | }, sort map { make_link($_) } keys %{ $found_links{$lang_code}->{$foreign_article} }; @@ -1226,6 +1223,7 @@ make_link($lang_code . $MW_SYNTAX{namespace_sep} . $foreign_article); + say 'append_to_file filename'; append_to_file($filename, "* '''$foreign_title''' - $links"); } } Modified: trunk/no-interwiki/upload_iw.pl =================================================================== --- trunk/no-interwiki/upload_iw.pl 2010-03-26 18:42:13 UTC (rev 90) +++ trunk/no-interwiki/upload_iw.pl 2010-05-26 15:36:59 UTC (rev 91) @@ -26,9 +26,9 @@ our $VERSION = '0.01'; #<<< no perltidy my %SVN_PROPS = ( ## no critic (RequireInterpolationOfMetachars) - Revision => '$Revision: 0 $', - HeadURL => '$HeadURL: https://perlwikibot.svn.sourceforge.net/svnroot/perlwikibot/trunk/no-interwiki/upload_iw.pl $', - Date => '$Date: 2010-03-25 17:02:28 +0200 (Thu, 25 Mar 2010) $', + Revision => '$Revision$', + HeadURL => '$HeadURL$', + Date => '$Date$', ); #>>> Property changes on: trunk/no-interwiki/upload_iw.pl ___________________________________________________________________ Added: svn:keywords + Revision HeadURL Date This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |