perlwikibot-svn Mailing List for Perl MediaWiki Robot (Page 4)
Status: Pre-Alpha
Brought to you by: rotemliss
From: <rot...@us...> - 2006-09-16 11:06:21
Revision: 18
          http://svn.sourceforge.net/perlwikibot/?rev=18&view=rev
Author:   rotemliss
Date:     2006-09-16 04:06:13 -0700 (Sat, 16 Sep 2006)

Log Message:
-----------
Code style - properly getting parameters.

Modified Paths:
--------------
    trunk/includes/functions.pm
    trunk/includes/http.pm

Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm    2006-09-16 10:54:41 UTC (rev 17)
+++ trunk/includes/functions.pm    2006-09-16 11:06:13 UTC (rev 18)
@@ -101,8 +101,7 @@
 sub getPageLastEdit
 {
     # Get parameters
-    my $xmlFile = $_[0];
-    my $title = $_[1];
+    my ($xmlFile, $title) = @_;
     utf8::decode($title);
 
     # Get the pages
@@ -188,7 +187,7 @@
 sub getPageContents
 {
     # Get parameters
-    my $title = $_[0];
+    my ($title) = @_;
 
     http::getPage($title, "raw"); # Return
 }
@@ -197,9 +196,7 @@
 sub editPage
 {
     # Get parameters
-    my $title = $_[0];
-    my $text = $_[1];
-    my $editSummary = $_[2];
+    my ($title, $text, $editSummary) = @_;
 
     print "Editing page $title...\n";
 
@@ -253,7 +250,7 @@
 sub replaceInPage
 {
     # Get parameters
-    my $title = $_[0];
+    my ($title) = @_;
 
     if ($configure::sendPages == 1 || $configure::sendPages == 2)
     {
@@ -303,8 +300,7 @@
 sub refreshPage
 {
     # Get parameters
-    my $server = $_[0];
-    my $title = $_[1];
+    my ($server, $title) = @_;
 
     editPage($title, getPageContents($title), "Refreshing page");
 }
@@ -313,10 +309,7 @@
 sub movePage
 {
     # Get parameters
-    my $server = $_[0];
-    my $title = $_[1];
-    my $newTitle = $_[2];
-    my $moveReason = $_[3];
+    my ($server, $title, $newTitle, $moveReason) = @_;
 
     print "Moving page $title to $newTitle...\n";
 
@@ -352,9 +345,7 @@
 sub deletePage
 {
     # Get parameters
-    my $server = $_[0];
-    my $title = $_[1];
-    my $deleteReason = $_[2];
+    my ($server, $title, $reason) = @_;
 
     print "Deleting page $title...\n";
 
@@ -371,7 +362,7 @@
     {
         http::postPage($title, "delete",
         [
-            wpReason => $deleteReason,
+            wpReason => $reason,
             wpEditToken => $editToken
         ],
         );

Modified: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm    2006-09-16 10:54:41 UTC (rev 17)
+++ trunk/includes/http.pm    2006-09-16 11:06:13 UTC (rev 18)
@@ -21,9 +21,7 @@
 sub buildPageURL
 {
     # Get parameters
-    my $title = $_[0];
-    my $action = $_[1];
-    my $get = $_[2];
+    my ($title, $action, $get) = @_;
 
     # Initial URL: server, script path and title
     my $server = $configure::servers{$ARGV[0]};
@@ -49,9 +47,7 @@
 sub getPage
 {
     # Get parameters
-    my $title = $_[0];
-    my $action = $_[1];
-    my $get = $_[2];
+    my ($title, $action, $get) = @_;
 
     my $url = buildPageURL($title, $action, $get);
 
@@ -60,7 +56,7 @@
     {
         $result = $browser->get($url);
     }
-    while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
+    while (!$result->is_success && $result->status_line =~ /302[\s\w]+/);
 
     $result->content; # Return
 }
@@ -69,10 +65,7 @@
 sub postPage
 {
     # Get parameters
-    my $title = $_[0];
-    my $action = $_[1];
-    my $post = $_[2];
-    my $get = $_[3];
+    my ($title, $action, $post, $get) = @_;
 
     my $url = buildPageURL($title, $action, $get);
 
@@ -81,7 +74,7 @@
     {
         $result = $browser->post($url, $post);
    }
-    while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
+    while (!$result->is_success && !$result->status_line =~ /302[\s\w]+/);
 
     $result->content; # Return
 }
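The change r18 applies throughout is unpacking @_ with a single list assignment instead of indexing it element by element. Below is a minimal, self-contained sketch of the idiom; the subroutine name and values are illustrative only, not repository code. It also demonstrates a precedence caveat relevant to the postPage retry condition kept above: in Perl, "!" binds more tightly than "=~", so "!$x =~ /302/" negates $x first and matches the nearly useless result against the pattern, while "$x !~ /302/" expresses the intended test.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # r18's idiom: unpack @_ once, by list assignment.
    # (demoEditPage and its arguments are illustrative only.)
    sub demoEditPage
    {
        my ($title, $text, $editSummary) = @_;
        print "Would edit '$title' ($editSummary): ", length($text), " bytes\n";
    }

    demoEditPage("Main Page", "Some new text", "Testing");

    # Precedence caveat: !$status evaluates to '' (false) because $status
    # is true, and '' can never match /302[\s\w]+/.
    my $status = "500 Internal Server Error";
    print "as parsed, (!\$status) =~ /302/ is ",
        ((!$status) =~ /302[\s\w]+/ ? "true" : "false"), "\n"; # false
    print "intended, \$status !~ /302/ is ",
        ($status !~ /302[\s\w]+/ ? "true" : "false"), "\n";    # true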
From: <rot...@us...> - 2006-09-16 10:54:55
Revision: 17
          http://svn.sourceforge.net/perlwikibot/?rev=17&view=rev
Author:   rotemliss
Date:     2006-09-16 03:54:41 -0700 (Sat, 16 Sep 2006)

Log Message:
-----------
Use namespaces, and directly access the variables instead of using many functions.

Modified Paths:
--------------
    trunk/bot.pl
    trunk/config/configure.sample
    trunk/config/runtime.sample
    trunk/includes/dump.pm
    trunk/includes/functions.pm
    trunk/includes/http.pm

Modified: trunk/bot.pl
===================================================================
--- trunk/bot.pl    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/bot.pl    2006-09-16 10:54:41 UTC (rev 17)
@@ -6,6 +6,8 @@
 # Libraries
 use includes::functions;
+use config::configure;
+use config::runtime;
 
 # Counters
 my ($i, $j, $k, $l, $m);
@@ -17,34 +19,34 @@
 }
 
 # Show notes about the sending pages configuration
-if (sendPages() == 0)
+if ($configure::sendPages == 0)
 {
     print "Note: There will be no changes in the real server. Please set sendPages to 1 to do them.\n";
 }
-elsif (sendPages() == 2)
+elsif ($configure::sendPages == 2)
 {
     print "Note: A prefix will be used when editing pages. Please set sendPages to 1 to disable that.\n";
 }
 
 # Log in to the server only if you have to do some stuff in the server
-if (connectToServer())
+if ($configure::sendPages == 1 || $configure::sendPages == 2)
 {
-    login();
+    functions::login();
 }
 
 # All the matching pages in the XML file
-my @pages = getMatchingPages();
+my @pages = functions::getMatchingPages();
 
 # Execute actions of all the pages
 for ($i = 0; ($i <= $#pages); $i += 2)
 {
     # Get title and action
     my $title = $pages[$i];
-    my $action = action($pages[$i + 1]);
+    my $action = $configure::actions{$pages[$i + 1]};
 
     # Execute the action (TODO: add the actions which were supported in the previous bot)
     if ($action eq "replace")
     {
-        replaceInPage($title);
+        functions::replaceInPage($title);
     }
 }

Modified: trunk/config/configure.sample
===================================================================
--- trunk/config/configure.sample    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/config/configure.sample    2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package configure;
+
 # Code style
 use warnings;
 use strict;
@@ -9,165 +12,56 @@
 my ($i, $j, $k, $l, $m);
 
 # Available servers and details about them, the bot should work on
-my %servers = (
+our %servers = (
     "server" => "http://www.server.com",
 ); # The MediaWiki Server
-my %scriptPaths = (
+our %scriptPaths = (
     "server" => "/w/index.php",
 ); # The path to the script index.php (or wiki.phtml in old versions)
-my %userNames = (
+our %userNames = (
     "server" => "My-bot",
 ); # The bot user name in every server
-my %passwords = (
+our %passwords = (
     "server" => "password"
 ); # The bot password in every server
-my %sysopUserNames = (
+our %sysopUserNames = (
     "server" => "My-sysop",
 ); # The sysop bot user name in every server, blank if no
-my %sysopPasswords = (
+our %sysopPasswords = (
     "server" => "strong_password",
 ); # The sysop bot password in every server, blank if no
-my %xmlFiles = (
+our %xmlFiles = (
     "server" => "site.xml",
 ); # The XML file of the database
-my %initialEditSummaries = (
+our %initialEditSummaries = (
     "server" => "[[Project:Replace bot|Replace bot]] – "
 ); # Initial edit summary, when several replaces are done and several edit summaries are integrated, using the replace script
-my %serverPrefixes = (
+our %serverPrefixes = (
     "server" => "User:My-bot/Tests/",
 ); # Prefixes for the saved pages for tests; won't be used when really running the script
 
 # Avilable actions and details about them
-my %actions = (
+our %actions = (
     "action" => "move"
 ); # The global actions done
-my %actionServers = (
+our %actionServers = (
     "action" => "server"
 ); # Which server should it be executed on? TODO - allow using multiple servers like that, currently ignored
-my %titles = (
+our %titles = (
     "action" => "^שיחת משתמש:[\\d]+\\.[\\d]+\\.[\\d]+\\.[\\d]+\$"
 ); # Restrict the actions to these titles
-my %texts = (
+our %texts = (
     "action" => "^אנונימי\$"
 ); # Restrict the actions to the pages
-my %news = (
+our %news = (
     "action" => "שיחת משתמש:אנונימי"
 ); # New title (move) or text (replace), leave blank otherwise
-my %reasons = (
+our %reasons = (
     "action" => "מחליף שיחות אנונימיות"
 ); # Summary/reason sent about the action
-my %minimumTimes = (
+our %minimumTimes = (
     "action" => timegm(0, 0, 0, 1, 1 - 1, 1970 - 1900)
 ); # The minimum diff between the current time and the last edit time
 
-sub server
-{
-    $servers{$ARGV[0]}; # Return
-}
-
-sub scriptPath
-{
-    $scriptPaths{$ARGV[0]}; # Return
-}
-
-sub userName
-{
-    # Return the appropriate user name, if the sysop is needed or not,
-    # using the parameter
-    if ($_[0] == 1)
-    {
-        $sysopUserNames{$ARGV[0]}; # Return
-    }
-    else
-    {
-        $userNames{$ARGV[0]}; # Return
-    }
-}
-
-sub password
-{
-    # Return the appropriate user name, if the sysop is needed or not,
-    # using the parameter
-    if ($_[0] == 1)
-    {
-        $sysopPasswords{$ARGV[0]}; # Return
-    }
-    else
-    {
-        $passwords{$ARGV[0]}; # Return
-    }
-}
-
-sub xmlFile
-{
-    $xmlFiles{$ARGV[0]}; # Return
-}
-
-sub initialEditSummary
-{
-    $initialEditSummaries{$ARGV[0]}; # Return
-}
-
-sub serverPrefix
-{
-    $serverPrefixes{$ARGV[0]}; # Return
-}
-
-sub action
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $actions{$actionName}; # Return
-}
-
-sub actionServer
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $actionServers{$actionName}; # Return
-}
-
-sub title
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $titles{$actionName}; # Return
-}
-
-sub text
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $texts{$actionName}; # Return
-}
-
-sub new
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $news{$actionName}; # Return
-}
-
-sub reason
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $reasons{$actionName}; # Return
-}
-
-sub minimumTime
-{
-    # Get parameter
-    my $actionName = $_[0];
-
-    $minimumTimes{$actionName}; # Return
-}
-
 # Return a true value
 1;

Modified: trunk/config/runtime.sample
===================================================================
--- trunk/config/runtime.sample    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/config/runtime.sample    2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package configure;
+
 # Code style
 use warnings;
 use strict;
@@ -6,12 +9,12 @@
 my ($i, $j, $k, $l, $m);
 
 # Actions to execute
-my @executedActions = (
+our @executedActions = (
     "action"
 );
 
 # Actions to execute by the way, if you are already editing the page (replace only)
-my @bywayActions = (
+our @bywayActions = (
     "actionB"
 );
 
@@ -22,35 +25,7 @@
 # * 2 - send to the server with prefix when editing (if not, don't send at all).
 # It's strongly recommended that you use 0 before making tests, and 2 before more
 # advanced tests, before you use 1 for the real changes.
-my $sendPages = 0;
+our $sendPages = 0;
 
-sub executedActions
-{
-    @executedActions; # Return
-}
-
-sub bywayActions
-{
-    @bywayActions; # Return
-}
-
-sub sendPages
-{
-    $sendPages; # Return
-}
-
-# Check if we have to connect to server
-sub connectToServer
-{
-    if ($sendPages == 1 || $sendPages == 2)
-    {
-        1; # Return
-    }
-    else
-    {
-        0; # Return
-    }
-}
-
 # Return a true value
 1;

Modified: trunk/includes/dump.pm
===================================================================
--- trunk/includes/dump.pm    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/dump.pm    2006-09-16 10:54:41 UTC (rev 17)
@@ -4,7 +4,7 @@
 #and shifting values for subroutine input should be avoided in
 #any subroutines that get called often, like the handlers
 
-package Parse::MediaWikiDump::Pages;
+package mwdump;
 
 #This parser works by placing all of the start, text, and end events into
 #a buffer as they come out of XML::Parser. On each call to page() the function

Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/functions.pm    2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package functions;
+
 # Code style
 use warnings;
 use strict;
@@ -15,7 +18,7 @@
 sub getMatchingPages
 {
     # Get the pages
-    my $pages = Parse::MediaWikiDump::Pages->new(xmlFile());
+    my $pages = mwdump->new("dumps/".$configure::xmlFiles{$ARGV[0]});
 
     # Can use only "first-letter", TODO: enable also "case-sensitive" for Wiktionary
     if ($pages->case ne "first-letter")
@@ -24,7 +27,7 @@
     }
 
     # Get the executed actions
-    my @possibleActions = executedActions();
+    my @possibleActions = @configure::executedActions;
 
     # An array contains all the pages
     my @matchingPages;
@@ -49,7 +52,7 @@
         $tempCounter = 0;
         for ($i = 0; ($i <= $#pagePossibleActions); $i++)
         {
-            my $wantedTitle = title($pagePossibleActions[$i]);
+            my $wantedTitle = $configure::titles{$pagePossibleActions[$i]};
             if ($title =~ /$wantedTitle/)
             {
                 $tempTitleArray[$tempCounter++] = $pagePossibleActions[$i];
@@ -70,7 +73,7 @@
         $tempCounter = 0;
         for ($i = 0; ($i <= $#pagePossibleActions); $i++)
         {
-            my $wantedText = text($pagePossibleActions[$i]);
+            my $wantedText = $configure::texts{$pagePossibleActions[$i]};
             if ($$text =~ /$wantedText/)
             {
                 $tempTextArray[$tempCounter++] = $pagePossibleActions[$i];
@@ -131,16 +134,13 @@
 # Log in
 sub login
 {
-    # Get all the executed actions
-    my @executedActions = executedActions();
-
-    # Sysop finally required?
+    # Is sysop finally required?
     my $sysop = 0;
 
     # Go through the array, and check if sysop permission is needed.
-    for ($i = 0; ($i <= $#executedActions); $i++)
+    for ($i = 0; ($i <= $#configure::executedActions); $i++)
     {
-        my $action = action($executedActions[$i]);
+        my $action = $configure::actions{$configure::executedActions[$i]};
         if ($action eq "replace")
         {
             # Continue
@@ -159,12 +159,25 @@
         }
     }
 
+    # Get user name and password
+    my ($username, $password);
+    if ($sysop == 1)
+    {
+        $username = $configure::sysopUserNames{$ARGV[0]};
+        $password = $configure::sysopPasswords{$ARGV[0]};
+    }
+    else
+    {
+        $username = $configure::userNames{$ARGV[0]};
+        $password = $configure::passwords{$ARGV[0]};
+    }
+
     # Log in
     print "Logging in...\n";
-    postPage("Special:Userlogin", "submitlogin",
+    http::postPage("Special:Userlogin", "submitlogin",
     [
-        wpName => userName($sysop),
-        wpPassword => password($sysop),
+        wpName => $username,
+        wpPassword => $password,
         wpRemember => 0
     ],
     , "type=login");
@@ -177,7 +190,7 @@
     # Get parameters
     my $title = $_[0];
 
-    getPage($title, "raw"); # Return
+    http::getPage($title, "raw"); # Return
 }
 
 # Edit page
@@ -190,13 +203,14 @@
 
     print "Editing page $title...\n";
 
-    if (sendPages() == 2)
+    if ($configure::sendPages == 2)
     {
-        $title = serverPrefix().$title;
+        my $serverPrefix = $configure::serverPrefixes{$ARGV[0]};
+        $title = $serverPrefix.$title;
     }
 
     # Get the edit page contents
-    my $editPage = getPage($title, "edit");
+    my $editPage = http::getPage($title, "edit");
 
     # Get the start time
     $editPage =~ /<input type='hidden' value="([0-9]{14})" name="wpStarttime" \/>/;
@@ -211,11 +225,11 @@
     my $editToken = $1;
 
     # Send page
-    if ((defined($editToken)) && ($editToken =~ /[0-9a-f]{32}/))
+    if (defined($editToken) && $editToken =~ /[0-9a-f]{32}/)
     {
-        if ((sendPages() == 1) || (sendPages() == 2))
+        if ($configure::sendPages == 1 || $configure::sendPages == 2)
         {
-            postPage($title, "submit",
+            http::postPage($title, "submit",
             [
                 wpSection => "",
                 wpStarttime => $startTime,
@@ -241,16 +255,16 @@
     # Get parameters
     my $title = $_[0];
 
-    if (connectToServer())
+    if ($configure::sendPages == 1 || $configure::sendPages == 2)
     {
         # Get the edited page contents
         my $editedPage = getPageContents($title);
 
         # Set initial edit summary
-        my $editSummary = initialEditSummary();
+        my $editSummary = $configure::initialEditSummaries{$ARGV[0]};
 
         # Groups array
-        my @actions = (executedActions(), bywayActions());
+        my @actions = (@configure::executedActions, @configure::bywayActions);
 
         # Replaced something at all? Flag to check
         my $replaced = 0;
@@ -259,16 +273,16 @@
         for ($i = 0; ($i <= $#actions); $i++)
         {
             my $action = $actions[$i];
-            my $search = text($action);
+            my $search = $configure::texts{$action};
             if ($editedPage =~ /$search/)
             {
-                my $replace = new($action);
+                my $replace = $configure::news{$action};
                 $editedPage =~ s/$search/$replace/g;
                 if ($replaced == 1)
                 {
                     $editSummary = $editSummary.", ";
                 }
-                $editSummary = $editSummary.reason($action);
+                $editSummary = $editSummary.$configure::reasons{$action};
                 $replaced = 1;
             }
         }
@@ -307,7 +321,7 @@
     print "Moving page $title to $newTitle...\n";
 
     # Get the delete page contents
-    my $movePage = getPage("Special:Movepage/$title");
+    my $movePage = http::getPage("Special:Movepage/$title");
 
     # Get the edit token
     if ($movePage =~ /<input type='hidden' name='wpEditToken' value="([0-9,a-f]{32})" \/>/)
     {
@@ -315,9 +329,9 @@
         my $editToken = $1;
 
         # Send page
-        if (sendPages() == 1)
+        if ($configure::sendPages == 1)
         {
-            postPage("Special:Movepage", "submit",
+            http::postPage("Special:Movepage", "submit",
             [
                 wpOldTitle => $title,
                 wpNewTitle => $newTitle,
@@ -345,7 +359,7 @@
     print "Deleting page $title...\n";
 
     # Get the delete page contents
-    my $deletePage = getPage($title, "delete");
+    my $deletePage = http::getPage($title, "delete");
 
     # Get the edit token
     if ($deletePage =~ /<input type='hidden' name='wpEditToken' value="([0-9,a-f]{32})" \/>/)
     {
@@ -353,9 +367,9 @@
         my $editToken = $1;
 
         # Send page
-        if (sendPages() == 1)
+        if ($configure::sendPages == 1)
        {
-            postPage($title, "delete",
+            http::postPage($title, "delete",
             [
                 wpReason => $deleteReason,
                 wpEditToken => $editToken

Modified: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm    2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/http.pm    2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package http;
+
 # Code style
 use warnings;
 use strict;
@@ -4,6 +7,7 @@
 
 # Libraries
 use LWP;
+use config::configure;
 
 # Counters
 my ($i, $j, $k, $l, $m);
@@ -22,7 +26,9 @@
     my $get = $_[2];
 
     # Initial URL: server, script path and title
-    my $url = server().scriptPath()."?title=$title";
+    my $server = $configure::servers{$ARGV[0]};
+    my $scriptPath = $configure::scriptPaths{$ARGV[0]};
+    my $url = "$server$scriptPath?title=$title";
 
     # Action
     if (defined($action))
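r17's refactoring replaces the accessor subs with package globals: a variable declared with "our" lives in the package symbol table, so any other package can read it by its fully qualified name. Note that configure.sample and runtime.sample both declare "package configure;", which is why their variables share one namespace. Below is a minimal sketch of the pattern; the inline packages and values are placeholders standing in for config/configure.sample and bot.pl, not repository code.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Configuration package: "our" creates the package globals
    # %configure::servers and $configure::sendPages.
    package configure;
    our %servers   = ("server" => "http://www.server.com");
    our $sendPages = 0;

    # Consumer code reads the globals by fully qualified name;
    # no accessor subs are needed.
    package main;
    print "Server: $configure::servers{'server'}\n";
    if ($configure::sendPages == 0)
    {
        print "Dry run only; set \$configure::sendPages to 1 to send changes.\n";
    }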
From: <rot...@us...> - 2006-09-01 16:04:39
Revision: 16 http://svn.sourceforge.net/perlwikibot/?rev=16&view=rev Author: rotemliss Date: 2006-09-01 09:04:29 -0700 (Fri, 01 Sep 2006) Log Message: ----------- Moving the dump file to a better name, and moving some functions about HTTP to a new HTTP library. Modified Paths: -------------- trunk/includes/functions.pm Added Paths: ----------- trunk/includes/dump.pm trunk/includes/http.pm Removed Paths: ------------- trunk/includes/mwdump.pm Copied: trunk/includes/dump.pm (from rev 6, trunk/includes/mwdump.pm) =================================================================== --- trunk/includes/dump.pm (rev 0) +++ trunk/includes/dump.pm 2006-09-01 16:04:29 UTC (rev 16) @@ -0,0 +1,1394 @@ +our $VERSION = '0.31'; +#the POD is at the end of this file +#avoid shift() - it is computationally more expensive than pop +#and shifting values for subroutine input should be avoided in +#any subroutines that get called often, like the handlers + +package Parse::MediaWikiDump::Pages; + +#This parser works by placing all of the start, text, and end events into +#a buffer as they come out of XML::Parser. On each call to page() the function +#checks for a complete article in the buffer and calls for XML::Parser to add +#more tokens if a complete article is not found. Once a complete article is +#found it is removed from the buffer, parsed, and an instance of the page +#object is returned. + +use strict; +use warnings; +use XML::Parser; + +#tokens in the buffer are an array ref with the 0th element specifying +#its type; these are the constants for those types. +use constant T_START => 1; +use constant T_END => 2; +use constant T_TEXT => 3; + +sub new { + my $class = shift; + my $source = shift; + my $self = {}; + + bless($self, $class); + + $$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8'); + $$self{PARSER}->setHandlers('Start', \&start_handler, + 'End', \&end_handler, + 'Char', \&char_handler); + $$self{EXPAT} = $$self{PARSER}->parse_start(state => $self); + $$self{BUFFER} = []; + $$self{CHUNK_SIZE} = 32768; + $$self{BUF_LIMIT} = 10000; + $$self{BYTE} = 0; + + $self->open($source); + $self->init; + + return $self; +} + +sub next { + my $self = shift; + my $buffer = $$self{BUFFER}; + my $offset; + my @page; + + #look through the contents of our buffer for a complete article; fill + #the buffer with more data if an entire article is not there + while(1) { + $offset = $self->search_buffer([T_END, 'page']); + last if $offset != -1; + + #indicates EOF + return undef unless $self->parse_more; + } + + #remove the entire page from the buffer + @page = splice(@$buffer, 0, $offset + 1); + + if (! token_compare($page[0], [T_START, 'page'])) { + $self->dump($buffer); + die "expected <page>; got " . token2text($page[0]); + } + + my $data = $self->parse_page(\@page); + + return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR}); +} + +#outputs a nicely formated representation of the tokens on the buffer specified +sub dump { + my $self = shift; + my $buffer = shift || $$self{BUFFER}; + my $offset = 0; + + foreach my $i (0 .. 
$#$buffer) { + my $token = $$buffer[$i]; + + print STDERR "$i "; + + if ($$token[0] == T_START) { + my $attr = $$token[2]; + print STDERR " " x $offset; + print STDERR "START $$token[1] "; + + foreach my $key (sort(keys(%$attr))) { + print STDERR "$key=\"$$attr{$key}\" "; + } + + print STDERR "\n"; + $offset++; + } elsif ($$token[0] == T_END) { + $offset--; + print STDERR " " x $offset; + print STDERR "END $$token[1]\n"; + } elsif ($$token[0] == T_TEXT) { + my $ref = $$token[1]; + print STDERR " " x $offset; + print STDERR "TEXT "; + + my $len = length($$ref); + + if ($len < 50) { + print STDERR "'$$ref'\n"; + } else { + print STDERR "$len characters\n"; + } + } + } + + return 1; +} + +sub sitename { + my $self = shift; + return $$self{HEAD}{sitename}; +} + +sub base { + my $self = shift; + return $$self{HEAD}{base}; +} + +sub generator { + my $self = shift; + return $$self{HEAD}{generator}; +} + +sub case { + my $self = shift; + return $$self{HEAD}{case}; +} + +sub namespaces { + my $self = shift; + return $$self{HEAD}{namespaces}; +} + +sub current_byte { + my $self = shift; + return $$self{BYTE}; +} + +#depreciated backwards compatibility methods + +#replaced by next() +sub page { + my $self = shift; + return $self->next(@_); +} + +#private functions with OO interface +sub open { + my $self = shift; + my $source = shift; + + if (ref($source) eq 'GLOB') { + $$self{SOURCE} = $source; + } elsif (! open($$self{SOURCE}, $source)) { + die "could not open $source: $!"; + } + + binmode($$self{SOURCE}, ':utf8'); + + return 1; +} + +sub init { + my $self = shift; + my $offset; + my @head; + + #parse more XML until the entire siteinfo section is in the buffer + while(1) { + die "could not init" unless $self->parse_more; + + $offset = $self->search_buffer([T_END, 'siteinfo']); + + last if $offset != -1; + } + + #pull the siteinfo section out of the buffer + @head = splice(@{$$self{BUFFER}}, 0, $offset + 1); + + $self->parse_head(\@head); + + return 1; +} + +#feed data into expat and have it put more tokens onto the buffer +sub parse_more { + my ($self) = @_; + my $buf; + + my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE}); + + if (! defined($read)) { + die "error during read: $!"; + } elsif ($read == 0) { + $$self{FINISHED} = 1; + $$self{EXPAT}->parse_done(); + return 0; + } + + $$self{BYTE} += $read; + $$self{EXPAT}->parse_more($buf); + + my $buflen = scalar(@{$$self{BUFFER}}); + + die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless + $buflen < $$self{BUF_LIMIT}; + + return 1; +} + +#searches through a buffer for a specified token +sub search_buffer { + my ($self, $search, $list) = @_; + + $list = $$self{BUFFER} unless defined $list; + + return -1 if scalar(@$list) == 0; + + foreach my $i (0 .. $#$list) { + return $i if token_compare($$list[$i], $search); + } + + return -1; +} + +#this function is very frightning =) +sub parse_head { + my $self = shift; + my $buffer = shift; + my $state = 'start'; + my %data = (namespaces => []); + + for (my $i = 0; $i <= $#$buffer; $i++) { + my $token = $$buffer[$i]; + + if ($state eq 'start') { + my $version; + die "$i: expected <mediawiki> got " . token2text($token) unless + token_compare($token, [T_START, 'mediawiki']); + + die "$i: version is a required attribute" unless + defined($version = $$token[2]->{version}); + + die "$i: version $version unsupported" unless $version eq '0.3'; + + $token = $$buffer[++$i]; + + die "$i: expected <siteinfo> got " . 
token2text($token) unless + token_compare($token, [T_START, 'siteinfo']); + + $state = 'in_siteinfo'; + } elsif ($state eq 'in_siteinfo') { + if (token_compare($token, [T_START, 'namespaces'])) { + $state = 'in_namespaces'; + next; + } elsif (token_compare($token, [T_END, 'siteinfo'])) { + last; + } elsif (token_compare($token, [T_START, 'sitename'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + die "$i: expected TEXT but got " . token2text($token); + } + + $data{sitename} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'sitename'])) { + die "$i: expected </sitename> but got " . token2text($token); + } + } elsif (token_compare($token, [T_START, 'base'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT but got " . token2text($token); + } + + $data{base} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'base'])) { + $self->dump($buffer); + die "$i: expected </base> but got " . token2text($token); + } + + } elsif (token_compare($token, [T_START, 'generator'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT but got " . token2text($token); + } + + $data{generator} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'generator'])) { + $self->dump($buffer); + die "$i: expected </generator> but got " . token2text($token); + } + + } elsif (token_compare($token, [T_START, 'case'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected </case> but got " . token2text($token); + } + + $data{case} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'case'])) { + $self->dump($buffer); + die "$i: expected </case> but got " . token2text($token); + } + } + + } elsif ($state eq 'in_namespaces') { + my $key; + my $name; + + if (token_compare($token, [T_END, 'namespaces'])) { + $state = 'in_siteinfo'; + next; + } + + if (! token_compare($token, [T_START, 'namespace'])) { + die "$i: expected <namespace> or </namespaces>; got " . token2text($token); + } + + die "$i: key is a required attribute" unless + defined($key = $$token[2]->{key}); + + $token = $$buffer[++$i]; + + #the default namespace has no text associated with it + if ($$token[0] == T_TEXT) { + $name = ${$$token[1]}; + } elsif (token_compare($token, [T_END, 'namespace'])) { + $name = ''; + $i--; #move back one for below + } else { + die "$i: should never happen"; + } + + push(@{$data{namespaces}}, [$key, $name]); + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'namespace'])) { + $self->dump($buffer); + die "$i: expected </namespace> but got " . token2text($token); + } + + } else { + die "$i: unknown state '$state'"; + } + } + + $$self{HEAD} = \%data; + + #locate the anchor that indicates what looks like a link is really a + #category assignment ([[foo]] vs [[Category:foo]]) + #fix for bug #16616 + foreach my $ns (@{$data{namespaces}}) { + #namespace 14 is the category namespace + if ($$ns[0] == 14) { + $$self{CATEGORY_ANCHOR} = $$ns[1]; + last; + } + } + + if (! 
defined($$self{CATEGORY_ANCHOR})) { + die "Could not locate category indicator in namespace definitions"; + } + + return 1; +} + +#this function is very frightning =) +sub parse_page { + my $self = shift; + my $buffer = shift; + my %data; + my $state = 'start'; + + for (my $i = 0; $i <= $#$buffer; $i++) { + my $token = $$buffer[$i]; + + if ($state eq 'start') { + if (! token_compare($token, [T_START, 'page'])) { + $self->dump($buffer); + die "$i: expected <page>; got " . token2text($token); + } + + $state = 'in_page'; + } elsif ($state eq 'in_page') { + if (token_compare($token, [T_START, 'revision'])) { + $state = 'in_revision'; + next; + } elsif (token_compare($token, [T_END, 'page'])) { + last; + } elsif (token_compare($token, [T_START, 'title'])) { + $token = $$buffer[++$i]; + + if (token_compare($token, [T_END, 'title'])) { + $data{title} = ''; + next; + } + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{title} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'title'])) { + $self->dump($buffer); + die "$i: expected </title>; got " . token2text($token); + } + } elsif (token_compare($token, [T_START, 'id'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{id} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'id'])) { + $self->dump($buffer); + die "$i: expected </id>; got " . token2text($token); + } + } + } elsif ($state eq 'in_revision') { + if (token_compare($token, [T_END, 'revision'])) { + #If a comprehensive dump file is parsed + #it can cause uncontrolled stack growth and the + #parser only returns one revision out of + #all revisions - if we run into a + #comprehensive dump file, indicated by more + #than one <revision> section inside a <page> + #section then die with a message + + #just peeking ahead, don't want to update + #the index + $token = $$buffer[$i + 1]; + + if (token_compare($token, [T_START, 'revision'])) { + die "unable to properly parse comprehensive dump files"; + } + + $state = 'in_page'; + next; + } elsif (token_compare($token, [T_START, 'contributor'])) { + $state = 'in_contributor'; + next; + } elsif (token_compare($token, [T_START, 'id'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{revision_id} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'id'])) { + $self->dump($buffer); + die "$i: expected </id>; got " . token2text($token); + } + + } elsif (token_compare($token, [T_START, 'timestamp'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{timestamp} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'timestamp'])) { + $self->dump($buffer); + die "$i: expected </timestamp>; got " . token2text($token); + } + } elsif (token_compare($token, [T_START, 'minor'])) { + $data{minor} = 1; + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'minor'])) { + $self->dump($buffer); + die "$i: expected </minor>; got " . 
token2text($token); + } + } elsif (token_compare($token, [T_START, 'comment'])) { + $token = $$buffer[++$i]; + + #account for possible null-text + if (token_compare($token, [T_END, 'comment'])) { + $data{comment} = ''; + next; + } + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{comment} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'comment'])) { + $self->dump($buffer); + die "$i: expected </comment>; got " . token2text($token); + } + + } elsif (token_compare($token, [T_START, 'text'])) { + my $token = $$buffer[++$i]; + + if (token_compare($token, [T_END, 'text'])) { + ${$data{text}} = ''; + next; + } elsif ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expected TEXT; got " . token2text($token); + } + + $data{text} = $$token[1]; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'text'])) { + $self->dump($buffer); + die "$i: expected </text>; got " . token2text($token); + } + + } + + } elsif ($state eq 'in_contributor') { + if (token_compare($token, [T_END, 'contributor'])) { + $state = 'in_revision'; + next; + } elsif (token_compare($token, [T_START, 'username'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expecting TEXT; got " . token2text($token); + } + + $data{username} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'username'])) { + $self->dump($buffer); + die "$i: expected </username>; got " . token2text($token); + } + + } elsif (token_compare($token, [T_START, 'id'])) { + $token = $$buffer[++$i]; + + if ($$token[0] != T_TEXT) { + $self->dump($buffer); + die "$i: expecting TEXT; got " . token2text($token); + } + + $data{userid} = ${$$token[1]}; + + $token = $$buffer[++$i]; + + if (! token_compare($token, [T_END, 'id'])) { + $self->dump($buffer); + die "$i: expecting </id>; got " . token2text($token); + } + } + } else { + die "unknown state: $state"; + } + } + + $data{minor} = 0 unless defined($data{minor}); + + return \%data; +} + +#private functions with out OO interface +sub token2text { + my $token = shift; + + if ($$token[0] == T_START) { + return "<$$token[1]>"; + } elsif ($$token[0] == T_END) { + return "</$$token[1]>"; + } elsif ($$token[0] == T_TEXT) { + return "!text_token!"; + } else { + return "!unknown!"; + } +} + +#this function is where the majority of time is spent in this software +sub token_compare { + my ($toke1, $toke2) = @_; + + foreach my $i (0 .. $#$toke2) { + if ($$toke1[$i] ne $$toke2[$i]) { + return 0; + } + } + + return 1; +} + +sub start_handler { + my ($p, $tag, %atts) = @_; + my $self = $p->{state}; + + push(@{$$self{BUFFER}}, [T_START, $tag, \%atts]); + + return 1; +} + +sub end_handler { + my ($p, $tag) = @_; + my $self = $p->{state}; + + push(@{$$self{BUFFER}}, [T_END, $tag]); + + return 1; +} + +sub char_handler { + my ($p, $chars) = @_; + my $self = $p->{state}; + my $buffer = $$self{BUFFER}; + my $curent = $$buffer[$#$buffer]; + + if (! defined($curent)) { + #skip any text not inside a container + return 1; + } elsif ($$curent[0] == T_TEXT) { + ${$$curent[1]} .= $chars; + } elsif ($$curent[0] == T_START) { + my $ignore_ws_only = 1; + + #work around for bug #16583 - All spaces is a possible username + # at least if done via unicode. Force white space preservation + #for now. 
+ if ($$curent[1] eq 'username') { + $ignore_ws_only = 0; + } + + if (defined($$curent[2]->{'xml:space'}) && + ($$curent[2]->{'xml:space'} eq 'preserve')) { + $ignore_ws_only = 0; + } + + if ($ignore_ws_only) { + #non-breaking spaces are not whitespace in XML + return 1 if $chars =~ m/^[ \t\r\n]+$/m; + } + + push(@$buffer, [T_TEXT, \$chars]); + } + + return 1; +} + +package Parse::MediaWikiDump::page; + +use strict; +use warnings; + +sub new { + my ($class, $data, $category_anchor, $case_setting) = @_; + my $self = {}; + + bless($self, $class); + + $$self{DATA} = $data; + $$self{CACHE} = {}; + $$self{CATEGORY_ANCHOR} = $category_anchor; + + return $self; +} + +sub namespace { + my $self = shift; + + return $$self{CACHE}{namespace} if defined($$self{CACHE}{namespace}); + + my $title = $$self{DATA}{title}; + + if ($title =~ m/^([^:]+)\:/) { + $$self{CACHE}{namespace} = $1; + return $1; + } else { + $$self{CACHE}{namespace} = ''; + return ''; + } +} + +sub categories { + my $self = shift; + my $anchor = $$self{CATEGORY_ANCHOR}; + + return $$self{CACHE}{categories} if defined($$self{CACHE}{categories}); + + my $text = $$self{DATA}{text}; + my @cats; + + while($$text =~ m/\[\[$anchor:\s*([^\]]+)\]\]/gi) { + my $buf = $1; + + #deal with the pipe trick + $buf =~ s/\|.*$//; + push(@cats, $buf); + } + + return undef if scalar(@cats) == 0; + + $$self{CACHE}{categories} = \@cats; + + return \@cats; +} + +sub redirect { + my $self = shift; + my $text = $$self{DATA}{text}; + + return $$self{CACHE}{redirect} if exists($$self{CACHE}{redirect}); + + if ($$text =~ m/^#redirect\s*:?\s*\[\[([^\]]*)\]\]/i) { + $$self{CACHE}{redirect} = $1; + return $1; + } else { + $$self{CACHE}{redirect} = undef; + return undef; + } +} + +sub title { + my $self = shift; + return $$self{DATA}{title}; +} + +sub id { + my $self = shift; + return $$self{DATA}{id}; +} + +sub revision_id { + my $self = shift; + return $$self{DATA}{revision_id}; +} + +sub timestamp { + my $self = shift; + return $$self{DATA}{timestamp}; +} + +sub username { + my $self = shift; + return $$self{DATA}{username}; +} + +sub userid { + my $self = shift; + return $$self{DATA}{userid}; +} + +sub minor { + my $self = shift; + return $$self{DATA}{minor}; +} + +sub text { + my $self = shift; + return $$self{DATA}{text}; +} + +package Parse::MediaWikiDump::Links; + +use strict; +use warnings; + +sub new { + my $class = shift; + my $source = shift; + my $self = {}; + $$self{BUFFER} = []; + + bless($self, $class); + + $self->open($source); + $self->init; + + return $self; +} + +sub next { + my $self = shift; + my $buffer = $$self{BUFFER}; + my $link; + + while(1) { + if (defined($link = pop(@$buffer))) { + last; + } + + #signals end of input + return undef unless $self->parse_more; + } + + return Parse::MediaWikiDump::link->new($link); +} + +#private functions with OO interface +sub parse_more { + my $self = shift; + my $source = $$self{SOURCE}; + my $need_data = 1; + + while($need_data) { + my $line = <$source>; + + last unless defined($line); + + while($line =~ m/\((\d+),(-?\d+),'(.*?)'\)[;,]/g) { + push(@{$$self{BUFFER}}, [$1, $2, $3]); + $need_data = 0; + } + } + + #if we still need data and we are here it means we ran out of input + if ($need_data) { + return 0; + } + + return 1; +} + +sub open { + my $self = shift; + my $source = shift; + + if (ref($source) ne 'GLOB') { + die "could not open $source: $!" 
unless + open($$self{SOURCE}, $source); + } else { + $$self{SOURCE} = $source; + } + + binmode($$self{SOURCE}, ':utf8'); + + return 1; +} + +sub init { + my $self = shift; + my $source = $$self{SOURCE}; + my $found = 0; + + while(<$source>) { + if (m/^LOCK TABLES `pagelinks` WRITE;/) { + $found = 1; + last; + } + } + + die "not a MediaWiki link dump file" unless $found; +} + +#depreciated backwards compatibility methods + +#replaced by next() +sub link { + my $self = shift; + $self->next(@_); +} + +package Parse::MediaWikiDump::link; + +#you must pass in a fully populated link array reference +sub new { + my $class = shift; + my $self = shift; + + bless($self, $class); + + return $self; +} + +sub from { + my $self = shift; + return $$self[0]; +} + +sub namespace { + my $self = shift; + return $$self[1]; +} + +sub to { + my $self = shift; + return $$self[2]; +} + + +1; + +__END__ + +=head1 NAME + +Parse::MediaWikiDump - Tools to process MediaWiki dump files + +=head1 SYNOPSIS + + use Parse::MediaWikiDump; + + $source = 'dump_filename.ext'; + $source = \*FILEHANDLE; + + $pages = Parse::MediaWikiDump::Pages->new($source); + $links = Parse::MediaWikiDump::Links->new($source); + + #get all the records from the dump files, one record at a time + while(defined($page = $pages->next)) { + print "title '", $page->title, "' id ", $page->id, "\n"; + } + + while(defined($link = $links->next)) { + print "link from ", $link->from, " to ", $link->to, "\n"; + } + + #information about the page dump file + $pages->sitename; + $pages->base; + $pages->generator; + $pages->case; + $pages->namespaces; + + #information about a page record + $page->redirect; + $page->categories; + $page->title; + $page->namespace; + $page->id; + $page->revision_id; + $page->timestamp; + $page->username; + $page->userid; + $page->minor; + $page->text; + + #information about a link + $link->from; + $link->to; + $link->namespace; + +=head1 DESCRIPTION + +This module provides the tools needed to process the contents of various +MediaWiki dump files. + +=head1 USAGE + +To use this module you must create an instance of a parser for the type of +dump file you are trying to parse. The current parsers are: + +=over 4 + +=item Parse::MediaWikiDump::Pages + +Parse the contents of the page archive. + +=item Parse::MediaWikiDump::Links + +Parse the contents of the links dump file. + +=back + +=head2 General + +Both parsers require an argument to new that is a location of source data +to parse; this argument can be either a filename or a reference to an already +open filehandle. This entire software suite will die() upon errors in the file, +inconsistencies on the stack, etc. If this concerns you then you can wrap +the portion of your code that uses these calls with eval(). + +=head2 Parse::MediaWikiDump::Pages + +It is possible to create a Parse::MediaWikiDump::Pages object two ways: + +=over 4 + +=item $pages = Parse::MediaWikiDump::Pages->new($filename); + +=item $pages = Parse::MediaWikiDump::Pages->new(\*FH); + +=back + +After creation the folowing methods are avalable: + +=over 4 + +=item $pages->next + +Returns the next available record from the dump file if it is available, +otherwise returns undef. Records returned are instances of +Parse::MediaWikiDump::page; see below for information on those objects. + +=item $pages->sitename + +Returns the plain-text name of the instance the dump is from. + +=item $pages->base + +Returns the base url to the website of the instance. 
+ +=item $pages->generator + +Returns the version of the software that generated the file. + +=item $pages->case + +Returns the case-sensitivity configuration of the instance. + +=item $pages->namespaces + +Returns an array reference to the list of namespaces in the instance. Each +namespace is stored as an array reference which has two items; the first is the +namespace number and the second is the namespace name. In the case of namespace +0 the text stored for the name is '' + +=back + +=head3 Parse::MediaWikiDump::page + +The Parse::MediaWikiDump::page object represents a distinct MediaWiki page, +article, module, what have you. These objects are returned by the next() method +of a Parse::MediaWikiDump::Pages instance. The scalar returned is a reference +to a hash that contains all the data of the page in a straightforward manor. +While it is possible to access this hash directly, and it involves less overhead +than using the methods below, it is beyond the scope of the interface and is +undocumented. + +Some of the methods below require additional processing, such as namespaces, +redirect, and categories, to name a few. In these cases the returned result +is cached and stored inside the object so the processing does not have to be +redone. This is transparent to you; just know that you don't have to worry about +optimizing calls to these functions to limit processing overhead. + +The following methods are available: + +=over 4 + +=item $page->id + +=item $page->title + +=item $page->namespace + +Returns an empty string (such as '') for the main namespace or a string +containing the name of the namespace. + +=item $page->text + +A reference to a scalar containing the plaintext of the page. + +=item $page->redirect + +The plain text name of the article redirected to or undef if the page is not +a redirect. + +=item $page->categories + +Returns a reference to an array that contains a list of categories or undef +if there are no categories. This method does not understand templates and may +not return all the categories the article actually belongs in. + +=item $page->revision_id + +=item $page->timestamp + +=item $page->username + +=item $page->userid + +=item $page->minor + +=back + +=head2 Parse::MediaWikiDump::Links + +This module also takes either a filename or a reference to an already open +filehandle. For example: + + $links = Parse::MediaWikiDump::Links->new($filename); + $links = Parse::MediaWikiDump::Links->new(\*FH); + +It is then possible to extract the links a single link at a time using the +next method, which returns an instance of Parse::MediaWikiDump::link or undef +when there is no more data. For instance: + + while(defined($link = $links->next)) { + print 'from ', $link->from, ' to ', $link->to, "\n"; + } + +=head3 Parse::MediaWikiDump::link + +Instances of this class are returned by the link method of a +Parse::MediaWikiDump::Links instance. The following methods are available: + +=over 4 + +=item $link->from + +The numerical id the link was in. + +=item $link->to + +The plain text name the link is to, minus the namespace. + +=item $link->namespace + +The numerical id of the namespace the link points to. 
+ +=back + +=head1 EXAMPLES + +=head2 Extract the article text for a given title + + #!/usr/bin/perl + + use strict; + use warnings; + use Parse::MediaWikiDump; + + my $file = shift(@ARGV) or die "must specify a MediaWiki dump of the current pages"; + my $title = shift(@ARGV) or die "must specify an article title"; + my $dump = Parse::MediaWikiDump::Pages->new($file); + + binmode(STDOUT, ':utf8'); + binmode(STDERR, ':utf8'); + + #this is the only currently known value but there could be more in the future + if ($dump->case ne 'first-letter') { + die "unable to handle any case setting besides 'first-letter'"; + } + + $title = case_fixer($title); + + while(my $page = $dump->next) { + if ($page->title eq $title) { + print STDERR "Located text for $title\n"; + my $text = $page->text; + print $$text; + exit 0; + } + } + + print STDERR "Unable to find article text for $title\n"; + exit 1; + + #removes any case sensativity from the very first letter of the title + #but not from the optional namespace name + sub case_fixer { + my $title = shift; + + #check for namespace + if ($title =~ /^(.+?):(.+)/) { + $title = $1 . ':' . ucfirst($2); + } else { + $title = ucfirst($title); + } + + return $title; + } + +=head2 Scan the dump file for double redirects + + #!/usr/bin/perl + + #progress information goes to STDERR, a list of double redirects found + #goes to STDOUT + + binmode(STDOUT, ":utf8"); + binmode(STDERR, ":utf8"); + + use strict; + use warnings; + use Parse::MediaWikiDump; + + my $file = shift(@ARGV); + my $pages; + my $page; + my %redirs; + my $artcount = 0; + my $file_size; + my $start = time; + + if (defined($file)) { + $file_size = (stat($file))[7]; + $pages = Parse::MediaWikiDump::Pages->new($file); + } else { + print STDERR "No file specified, using standard input\n"; + $pages = Parse::MediaWikiDump::Pages->new(\*STDIN); + } + + #the case of the first letter of titles is ignored - force this option + #because the other values of the case setting are unknown + die 'this program only supports the first-letter case setting' unless + $pages->case eq 'first-letter'; + + print STDERR "Analyzing articles:\n"; + + while(defined($page = $pages->next)) { + update_ui() if ++$artcount % 500 == 0; + + #main namespace only + next unless $page->namespace eq ''; + next unless defined($page->redirect); + + my $title = case_fixer($page->title); + #create a list of redirects indexed by their original name + $redirs{$title} = case_fixer($page->redirect); + } + + my $redir_count = scalar(keys(%redirs)); + print STDERR "done; searching $redir_count redirects:\n"; + + my $count = 0; + + #if a redirect location is also a key to the index we have a double redirect + foreach my $key (keys(%redirs)) { + my $redirect = $redirs{$key}; + + if (defined($redirs{$redirect})) { + print "$key\n"; + $count++; + } + } + + print STDERR "discovered $count double redirects\n"; + + #removes any case sensativity from the very first letter of the title + #but not from the optional namespace name + sub case_fixer { + my $title = shift; + + #check for namespace + if ($title =~ /^(.+?):(.+)/) { + $title = $1 . ':' . ucfirst($2); + } else { + $title = ucfirst($title); + } + + return $title; + } + + sub pretty_bytes { + my $bytes = shift; + my $pretty = int($bytes) . ' bytes'; + + if (($bytes = $bytes / 1024) > 1) { + $pretty = int($bytes) . ' kilobytes'; + } + + if (($bytes = $bytes / 1024) > 1) { + $pretty = sprintf("%0.2f", $bytes) . ' megabytes'; + } + + if (($bytes = $bytes / 1024) > 1) { + $pretty = sprintf("%0.4f", $bytes) . 
' gigabytes'; + } + + return $pretty; + } + + sub pretty_number { + my $number = reverse(shift); + $number =~ s/(...)/$1,/g; + $number = reverse($number); + $number =~ s/^,//; + + return $number; + } + + sub update_ui { + my $seconds = time - $start; + my $bytes = $pages->current_byte; + + print STDERR " ", pretty_number($artcount), " articles; "; + print STDERR pretty_bytes($bytes), " processed; "; + + if (defined($file_size)) { + my $percent = int($bytes / $file_size * 100); + + print STDERR "$percent% completed\n"; + } else { + my $bytes_per_second = int($bytes / $seconds); + print STDERR pretty_bytes($bytes_per_second), " per second\n"; + } + } + +=head1 TODO + +=over 4 + +=item Support comprehensive dump files + +Currently the full page dump files (such as 20050909_pages_full.xml.gz) +are not supported. + +=item Optimization + +It would be nice to increase the processing speed of the XML files. Current +ideas: + +=over 4 + +=item Move to arrays instead of hashes for base objects + +Currently the base types for the majority of the classes are hashes. The +majority of these could be changed to arrays and numerical constants instead +of using hashes. + +=item Stackless parsing + +placing each XML token on the stack is probably quite time consuming. It may be +beter to move to a stackless system where the XML parser is given a new set +of callbacks to use when it encounters each specific token. + +=back + +=back + +=head1 AUTHOR + +This module was created and documented by Tyler Riddle E<lt>tr...@gm...E<gt>. + +=head1 BUGS + +Please report any bugs or feature requests to +C<bug...@rt...>, or through the web interface at +L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-MediaWikiDump>. +I will be notified, and then you'll automatically be notified of progress on +your bug as I make changes. + +=head2 Known Bugs + +No known bugs at this time. + +=head1 COPYRIGHT & LICENSE + +Copyright 2005 Tyler Riddle, all rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. 
+ Modified: trunk/includes/functions.pm =================================================================== --- trunk/includes/functions.pm 2006-09-01 11:27:14 UTC (rev 15) +++ trunk/includes/functions.pm 2006-09-01 16:04:29 UTC (rev 16) @@ -3,20 +3,14 @@ use strict; # Libraries -use LWP; -use HTTP::Cookies; -use includes::mwdump; +use includes::dump; +use includes::http; use config::configure; use config::runtime; # Counters my ($i, $j, $k, $l, $m); -# The browser we use to surf; enable cookies, and use a specific user agent -my $browser = LWP::UserAgent->new; -$browser->cookie_jar({}); -$browser->agent("Mozilla/5.0 (compatible; Perl Wikibot)"); - # Get all the pages matching specific restrictions sub getMatchingPages { @@ -134,73 +128,6 @@ timegm($6, $5, $4, $3, $2 - 1, $1 - 1900); # Return } -# Build the URL of a wiki page -sub buildPageURL -{ - # Get parameters - my $title = $_[0]; - my $action = $_[1]; - my $get = $_[2]; - - # Initial URL: server, script path and title - my $url = server().scriptPath()."?title=$title"; - - # Action - if (defined($action)) - { - $url = "$url&action=$action"; - } - - # Parameters - if (defined($get)) - { - $url = "$url&$get"; - } - - $url; # Return -} - -# Get a wiki page, try again and again if error -sub getPage -{ - # Get parameters - my $title = $_[0]; - my $action = $_[1]; - my $get = $_[2]; - - my $url = buildPageURL($title, $action, $get); - - my $result; - do - { - $result = $browser->get($url); - } - while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily")); - - $result->content; # Return -} - -# Post a wiki page, try again and again if error -sub postPage -{ - # Get parameters - my $title = $_[0]; - my $action = $_[1]; - my $post = $_[2]; - my $get = $_[3]; - - my $url = buildPageURL($title, $action, $get); - - my $result; - do - { - $result = $browser->post($url, $post); - } - while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/)); - - $result->content; # Return -} - # Log in sub login { Added: trunk/includes/http.pm =================================================================== --- trunk/includes/http.pm (rev 0) +++ trunk/includes/http.pm 2006-09-01 16:04:29 UTC (rev 16) @@ -0,0 +1,81 @@ +# Code style +use warnings; +use strict; + +# Libraries +use LWP; + +# Counters +my ($i, $j, $k, $l, $m); + +# The browser we use to surf; enable cookies, and use a specific user agent +my $browser = LWP::UserAgent->new; +$browser->cookie_jar({}); +$browser->agent("Mozilla/5.0 (compatible; Perl MediaWiki Robot)"); + +# Build the URL of a wiki page +sub buildPageURL +{ + # Get parameters + my $title = $_[0]; + my $action = $_[1]; + my $get = $_[2]; + + # Initial URL: server, script path and title + my $url = server().scriptPath()."?title=$title"; + + # Action + if (defined($action)) + { + $url = "$url&action=$action"; + } + + # Parameters + if (defined($get)) + { + $url = "$url&$get"; + } + + $url; # Return +} + +# Get a wiki page, try again and again if error +sub getPage +{ + # Get parameters + my $title = $_[0]; + my $action = $_[1]; + my $get = $_[2]; + + my $url = buildPageURL($title, $action, $get); + + my $result; + do + { + $result = $browser->get($url); + } + while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily")); + + $result->content; # Return +} + +# Post a wiki page, try again and again if error +sub postPage +{ + # Get parameters + my $title = $_[0]; + my $action = $_[1]; + my $post = $_[2]; + my $get = $_[3]; + + my $url = buildPageURL($title, $action, $get); + + my 
$result;
+    do
+    {
+        $result = $browser->post($url, $post);
+    }
+    while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
+
+    $result->content; # Return
+}

Property changes on: trunk/includes/http.pm
___________________________________________________________________
Name: svn:eol-style
   + native

Deleted: trunk/includes/mwdump.pm
===================================================================
--- trunk/includes/mwdump.pm    2006-09-01 11:27:14 UTC (rev 15)
+++ trunk/includes/mwdump.pm    2006-09-01 16:04:29 UTC (rev 16)
@@ -1,1394 +0,0 @@
[The 1,394 removed lines are identical to the Parse::MediaWikiDump code added above as trunk/includes/dump.pm.]
defined($$self{CATEGORY_ANCHOR})) { - die "Could not locate category indicator in namespace definitions"; - } - - return 1; -} - -#this function is very frightning =) -sub parse_page { - my $self = shift; - my $buffer = shift; - my %data; - my $state = 'start'; - - for (my $i = 0; $i <= $#$buffer; $i++) { - my $token = $$buffer[$i]; - - if ($state eq 'start') { - if (! token_compare($token, [T_START, 'page'])) { - $self->dump($buffer); - die "$i: expected <page>; got " . token2text($token); - } - - $state = 'in_page'; - } elsif ($state eq 'in_page') { - if (token_compare($token, [T_START, 'revision'])) { - $state = 'in_revision'; - next; - } elsif (token_compare($token, [T_END, 'page'])) { - last; - } elsif (token_compare($token, [T_START, 'title'])) { - $token = $$buffer[++$i]; - - if (token_compare($token, [T_END, 'title'])) { - $data{title} = ''; - next; - } - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{title} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'title'])) { - $self->dump($buffer); - die "$i: expected </title>; got " . token2text($token); - } - } elsif (token_compare($token, [T_START, 'id'])) { - $token = $$buffer[++$i]; - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{id} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'id'])) { - $self->dump($buffer); - die "$i: expected </id>; got " . token2text($token); - } - } - } elsif ($state eq 'in_revision') { - if (token_compare($token, [T_END, 'revision'])) { - #If a comprehensive dump file is parsed - #it can cause uncontrolled stack growth and the - #parser only returns one revision out of - #all revisions - if we run into a - #comprehensive dump file, indicated by more - #than one <revision> section inside a <page> - #section then die with a message - - #just peeking ahead, don't want to update - #the index - $token = $$buffer[$i + 1]; - - if (token_compare($token, [T_START, 'revision'])) { - die "unable to properly parse comprehensive dump files"; - } - - $state = 'in_page'; - next; - } elsif (token_compare($token, [T_START, 'contributor'])) { - $state = 'in_contributor'; - next; - } elsif (token_compare($token, [T_START, 'id'])) { - $token = $$buffer[++$i]; - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{revision_id} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'id'])) { - $self->dump($buffer); - die "$i: expected </id>; got " . token2text($token); - } - - } elsif (token_compare($token, [T_START, 'timestamp'])) { - $token = $$buffer[++$i]; - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{timestamp} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'timestamp'])) { - $self->dump($buffer); - die "$i: expected </timestamp>; got " . token2text($token); - } - } elsif (token_compare($token, [T_START, 'minor'])) { - $data{minor} = 1; - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'minor'])) { - $self->dump($buffer); - die "$i: expected </minor>; got " . 
token2text($token); - } - } elsif (token_compare($token, [T_START, 'comment'])) { - $token = $$buffer[++$i]; - - #account for possible null-text - if (token_compare($token, [T_END, 'comment'])) { - $data{comment} = ''; - next; - } - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{comment} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'comment'])) { - $self->dump($buffer); - die "$i: expected </comment>; got " . token2text($token); - } - - } elsif (token_compare($token, [T_START, 'text'])) { - my $token = $$buffer[++$i]; - - if (token_compare($token, [T_END, 'text'])) { - ${$data{text}} = ''; - next; - } elsif ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expected TEXT; got " . token2text($token); - } - - $data{text} = $$token[1]; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'text'])) { - $self->dump($buffer); - die "$i: expected </text>; got " . token2text($token); - } - - } - - } elsif ($state eq 'in_contributor') { - if (token_compare($token, [T_END, 'contributor'])) { - $state = 'in_revision'; - next; - } elsif (token_compare($token, [T_START, 'username'])) { - $token = $$buffer[++$i]; - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expecting TEXT; got " . token2text($token); - } - - $data{username} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'username'])) { - $self->dump($buffer); - die "$i: expected </username>; got " . token2text($token); - } - - } elsif (token_compare($token, [T_START, 'id'])) { - $token = $$buffer[++$i]; - - if ($$token[0] != T_TEXT) { - $self->dump($buffer); - die "$i: expecting TEXT; got " . token2text($token); - } - - $data{userid} = ${$$token[1]}; - - $token = $$buffer[++$i]; - - if (! token_compare($token, [T_END, 'id'])) { - $self->dump($buffer); - die "$i: expecting </id>; got " . token2text($token); - } - } - } else { - die "unknown state: $state"; - } - } - - $data{minor} = 0 unless defined($data{minor}); - - return \%data; -} - -#private functions with out OO interface -sub token2text { - my $token = shift; - - if ($$token[0] == T_START) { - return "<$$token[1]>"; - } elsif ($$token[0] == T_END) { - return "</$$token[1]>"; - } elsif ($$token[0] == T_TEXT) { - return "!text_token!"; - } else { - return "!unknown!"; - } -} - -#this function is where the majority of time is spent in this software -sub token_compare { - my ($toke1, $toke2) = @_; - - foreach my $i (0 .. $#$toke2) { - if ($$toke1[$i] ne $$toke2[$i]) { - return 0; - } - } - - return 1; -} - -sub start_handler { - my ($p, $tag, %atts) = @_; - my $self = $p->{state}; - - push(@{$$self{BUFFER}}, [T_START, $tag, \%atts]); - - return 1; -} - -sub end_handler { - my ($p, $tag) = @_; - my $self = $p->{state}; - - push(@{$$self{BUFFER}}, [T_END, $tag]); - - return 1; -} - -sub char_handler { - my ($p, $chars) = @_; - my $self = $p->{state}; - my $buffer = $$self{BUFFER}; - my $curent = $$buffer[$#$buffer]; - - if (! defined($curent)) { - #skip any text not inside a container - return 1; - } elsif ($$curent[0] == T_TEXT) { - ${$$curent[1]} .= $chars; - } elsif ($$curent[0] == T_START) { - my $ignore_ws_only = 1; - - #work around for bug #16583 - All spaces is a possible username - # at least if done via unicode. Force white space preservation - #for now. 
- if ($$curent[1] eq 'username') { - $ignore_ws_only = 0; - } - - if (defined($$curent[2]->{'xml:space'}) && - ($$curent[2]->{'xml:space'} eq 'preserve')) { - $ignore_ws_only = 0; - } - - if ($ignore_ws_only) { - #non-breaking spaces are not whitespace in XML - return 1 if $chars =~ m/^[ \t\r\n]+$/m; - } - - push(@$buffer, [T_TEXT, \$chars]); - } - - return 1; -} - -package Parse::MediaWikiDump::page; - -use strict; -use warnings; - -sub new { - my ($class, $data, $category_anchor, $case_setting) = @_; - my $self = {}; - - bless($self, $class); - - $$self{DATA} = $data; - $$self{CACHE} = {}; - $$self{CATEGORY_ANCHOR} = $category_anchor; - - return $self; -} - -sub namespace { - my $self = shift; - - return $$self{CACHE}{namespace} if defined($$self{CACHE}{namespace}); - - my $title = $$self{DATA}{title}; - - if ($title =~ m/^([^:]+)\:/) { - $$self{CACHE}{namespace} = $1; - return $1; - } else { - $$self{CACHE}{namespace} = ''; - return ''; - } -} - -sub categories { - my $self = shift; - my $anchor = $$self{CATEGORY_ANCHOR}; - - return $$self{CACHE}{categories} if defined($$self{CACHE}{categories}); - - my $text = $$self{DATA}{text}; - my @cats; - - while($$text =~ m/\[\[$anchor:\s*([^\]]+)\]\]/gi) { - my $buf = $1; - - #deal with the pipe trick - $buf =~ s/\|.*$//; - push(@cats, $buf); - } - - return undef if scalar(@cats) == 0; - - $$self{CACHE}{categories} = \@cats; - - return \@cats; -} - -sub redirect { - my $self = shift; - my $text = $$self{DATA}{text}; - - return $$self{CACHE}{redirect} if exists($$self{CACHE}{redirect}); - - if ($$text =~ m/^#redirect\s*:?\s*\[\[([^\]]*)\]\]/i) { - $$self{CACHE}{redirect} = $1; - return $1; - } else { - $$self{CACHE}{redirect} = undef; - return undef; - } -} - -sub title { - my $self = shift; - return $$self{DATA}{title}; -} - -sub id { - my $self = shift; - return $$self{DATA}{id}; -} - -sub revision_id { - my $self = shift; - return $$self{DATA}{revision_id}; -} - -sub timestamp { - my $self = shift; - return $$self{DATA}{timestamp}; -} - -sub username { - my $self = shift; - return $$self{DATA}{username}; -} - -sub userid { - my $self = shift; - return $$self{DATA}{userid}; -} - -sub minor { - my $self = shift; - return $$self{DATA}{minor}; -} - -sub text { - my $self = shift; - return $$self{DATA}{text}; -} - -package Parse::MediaWikiDump::Links; - -use strict; -use warnings; - -sub new { - my $class = shift; - my $source = shift; - my $self = {}; - $$self{BUFFER} = []; - - bless($self, $class); - - $self->open($source); - $self->init; - - return $self; -} - -sub next { - my $self = shift; - my $buffer = $$self{BUFFER}; - my $link; - - while(1) { - if (defined($link = pop(@$buffer))) { - last; - } - - #signals end of input - return undef unless $self->parse_more; - } - - return Parse::MediaWikiDump::link->new($link); -} - -#private functions with OO interface -sub parse_more { - my $self = shift; - my $source = $$self{SOURCE}; - my $need_data = 1; - - while($need_data) { - my $line = <$source>; - - last unless defined($line); - - while($line =~ m/\((\d+),(-?\d+),'(.*?)'\)[;,]/g) { - push(@{$$self{BUFFER}}, [$1, $2, $3]); - $need_data = 0; - } - } - - #if we still need data and we are here it means we ran out of input - if ($need_data) { - return 0; - } - - return 1; -} - -sub open { - my $self = shift; - my $source = shift; - - if (ref($source) ne 'GLOB') { - die "could not open $source: $!" 
unless - open($$self{SOURCE}, $source); - } else { - $$self{SOURCE} = $source; - } - - binmode($$self{SOURCE}, ':utf8'); - - return 1; -} - -sub init { - my $self = shift; - my $source = $$self{SOURCE}; - my $found = 0; - - while(<$source>) { - if (m/^LOCK TABLES `pagelinks` WRITE;/) { - $found = 1; - last; - } - } - - die "not a MediaWiki link dump file" unless $found; -} - -#depreciated backwards compatibility methods - -#replaced by next() -sub link { - my $self = shift; - $self->next(@_); -} - -package Parse::MediaWikiDump::link; - -#you must pass in a fully populated link array reference -sub new { - my $class = shift; - my $self = shift; - - bless($self, $class); - - return $self; -} - -sub from { - my $self = shift; - return $$self[0]; -} - -sub namespace { - my $self = shift; - return $$self[1]; -} - -sub to { - my $self = shift; - return $$self[2]; -} - - -1; - -__END__ - -=head1 NAME - -Parse::MediaWikiDump - Tools to process MediaWiki dump files - -=head1 SYNOPSIS - - use Parse::MediaWikiDump; - - $source = 'dump_filename.ext'; - $source = \*FILEHANDLE; - - $pages = Parse::MediaWikiDump::Pages->new($source); - $links = Parse::MediaWikiDump::Links->new($source); - - #get all the records from the dump files, one record at a time - while(defined($page = $pages->next)) { - print "title '", $page->title, "' id ", $page->id, "\n"; - } - - while(defined($link = $links->next)) { - print "link from ", $link->from, " to ", $link->to, "\n"; - } - - #information about the page dump file - $pages->sitename; - $pages->base; - $pages->generator; - $pages->case; - $pages->namespaces; - - #information about a page record - $page->redirect; - $page->categories; - $page->title; - $page->namespace; - $page->id; - $page->revision_id; - $page->timestamp; - $page->username; - $page->userid; - $page->minor; - $page->text; - - #information about a link - $link->from; - $link->to; - $link->namespace; - -=head1 DESCRIPTION - -This module provides the tools needed to process the contents of various -MediaWiki dump files. - -=head1 USAGE - -To use this module you must create an instance of a parser for the type of -dump file you are trying to parse. The current parsers are: - -=over 4 - -=item Parse::MediaWikiDump::Pages - -Parse the contents of the page archive. - -=item Parse::MediaWikiDump::Links - -Parse the contents of the links dump file. - -=back - -=head2 General - -Both parsers require an argument to new that is a location of source data -to parse; this argument can be either a filename or a reference to an already -open filehandle. This entire software suite will die() upon errors in the file, -inconsistencies on the stack, etc. If... [truncated message content] |
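A note on the retry loop in getPage as moved into includes/http.pm above: it stops retrying when the status line equals the exact string "302 Moved Temporarily". HTTP reason phrases are not guaranteed to be spelled that way by every server, so comparing the numeric status code is more robust. A minimal sketch of that variant, assuming the same $browser and $url as in the diff:

    # Retry on failure, but treat a 302 redirect as a terminal response.
    # HTTP::Response->code compares the numeric status and does not
    # depend on the server's exact reason phrase.
    my $result;
    do
    {
        $result = $browser->get($url);
    }
    while (!$result->is_success && $result->code != 302);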
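The matching condition in postPage, (!$result->status_line =~ /302[\s\w]+/), deserves a further caution: unary ! binds more tightly than =~ in Perl, so the status line is negated first (yielding "" or 1) and the pattern is then matched against that value, which can never succeed — the second operand of && is therefore always false and the loop never retries at all. If the intent is "retry on failure unless the response is a 302", the match itself should be negated with !~. A sketch under that assumption:

    # !~ negates the match itself; !$x =~ /.../ parses as (!$x) =~ /.../,
    # testing the pattern against "" or 1 rather than against the status line.
    my $result;
    do
    {
        $result = $browser->post($url, $post);
    }
    while (!$result->is_success && $result->status_line !~ /302[\s\w]+/);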
From: <rot...@us...> - 2006-09-01 11:27:29
|
Revision: 15 http://svn.sourceforge.net/perlwikibot/?rev=15&view=rev Author: rotemliss Date: 2006-09-01 04:27:14 -0700 (Fri, 01 Sep 2006) Log Message: ----------- Fixing typo. Modified Paths: -------------- trunk/bot.pl trunk/includes/functions.pm trunk/old/delete.pl trunk/old/move.pl trunk/old/refresh.pl trunk/old/replace.pl Modified: trunk/bot.pl =================================================================== --- trunk/bot.pl 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/bot.pl 2006-09-01 11:27:14 UTC (rev 15) @@ -4,7 +4,7 @@ use warnings; use strict; -# Libaries +# Libraries use includes::functions; # Counters Modified: trunk/includes/functions.pm =================================================================== --- trunk/includes/functions.pm 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/includes/functions.pm 2006-09-01 11:27:14 UTC (rev 15) @@ -2,7 +2,7 @@ use warnings; use strict; -# Libaries +# Libraries use LWP; use HTTP::Cookies; use includes::mwdump; Modified: trunk/old/delete.pl =================================================================== --- trunk/old/delete.pl 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/old/delete.pl 2006-09-01 11:27:14 UTC (rev 15) @@ -3,7 +3,7 @@ # Code style use warnings; -# Libaries +# Libraries use Time::Local; use functions; use configure; Modified: trunk/old/move.pl =================================================================== --- trunk/old/move.pl 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/old/move.pl 2006-09-01 11:27:14 UTC (rev 15) @@ -3,7 +3,7 @@ # Code style use warnings; -# Libaries +# Libraries use functions; use configure; Modified: trunk/old/refresh.pl =================================================================== --- trunk/old/refresh.pl 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/old/refresh.pl 2006-09-01 11:27:14 UTC (rev 15) @@ -3,7 +3,7 @@ # Code style use warnings; -# Libaries +# Libraries use mwdump; use functions; use configure; Modified: trunk/old/replace.pl =================================================================== --- trunk/old/replace.pl 2006-08-31 14:09:30 UTC (rev 14) +++ trunk/old/replace.pl 2006-09-01 11:27:14 UTC (rev 15) @@ -3,7 +3,7 @@ # Code style use warnings; -# Libaries +# Libraries use functions; use configure; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |