perlwikibot-svn Mailing List for Perl MediaWiki Robot
Status: Pre-Alpha
Brought to you by: rotemliss
From: <rot...@us...> - 2006-09-16 11:06:21
Revision: 18
http://svn.sourceforge.net/perlwikibot/?rev=18&view=rev
Author: rotemliss
Date: 2006-09-16 04:06:13 -0700 (Sat, 16 Sep 2006)
Log Message:
-----------
Code style - properly getting parameters.
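The idiom this commit adopts, as a minimal self-contained sketch (the subroutine and its arguments are hypothetical, not from the repository): unpack all of @_ in one list assignment instead of fetching positional elements one by one.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Before: each parameter fetched by position from @_.
    sub greet_by_index
    {
        my $name = $_[0];
        my $greeting = $_[1];
        print "$greeting, $name!\n";
    }

    # After: one list assignment names every parameter up front and
    # stays readable when arguments are added or reordered.
    sub greet_by_list
    {
        my ($name, $greeting) = @_;
        print "$greeting, $name!\n";
    }

    greet_by_index("World", "Hello");
    greet_by_list("World", "Hello");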
Modified Paths:
--------------
trunk/includes/functions.pm
trunk/includes/http.pm
Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm 2006-09-16 10:54:41 UTC (rev 17)
+++ trunk/includes/functions.pm 2006-09-16 11:06:13 UTC (rev 18)
@@ -101,8 +101,7 @@
sub getPageLastEdit
{
# Get parameters
- my $xmlFile = $_[0];
- my $title = $_[1];
+ my ($xmlFile, $title) = @_;
utf8::decode($title);
# Get the pages
@@ -188,7 +187,7 @@
sub getPageContents
{
# Get parameters
- my $title = $_[0];
+ my ($title) = @_;
http::getPage($title, "raw"); # Return
}
@@ -197,9 +196,7 @@
sub editPage
{
# Get parameters
- my $title = $_[0];
- my $text = $_[1];
- my $editSummary = $_[2];
+ my ($title, $text, $editSummary) = @_;
print "Editing page $title...\n";
@@ -253,7 +250,7 @@
sub replaceInPage
{
# Get parameters
- my $title = $_[0];
+ my ($title) = @_;
if ($configure::sendPages == 1 || $configure::sendPages == 2)
{
@@ -303,8 +300,7 @@
sub refreshPage
{
# Get parameters
- my $server = $_[0];
- my $title = $_[1];
+ my ($server, $title) = @_;
editPage($title, getPageContents($title), "Refreshing page");
}
@@ -313,10 +309,7 @@
sub movePage
{
# Get parameters
- my $server = $_[0];
- my $title = $_[1];
- my $newTitle = $_[2];
- my $moveReason = $_[3];
+ my ($server, $title, $newTitle, $moveReason) = @_;
print "Moving page $title to $newTitle...\n";
@@ -352,9 +345,7 @@
sub deletePage
{
# Get parameters
- my $server = $_[0];
- my $title = $_[1];
- my $deleteReason = $_[2];
+ my ($server, $title, $reason) = @_;
print "Deleting page $title...\n";
@@ -371,7 +362,7 @@
{
http::postPage($title, "delete",
[
- wpReason => $deleteReason,
+ wpReason => $reason,
wpEditToken => $editToken
],
);
Modified: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm 2006-09-16 10:54:41 UTC (rev 17)
+++ trunk/includes/http.pm 2006-09-16 11:06:13 UTC (rev 18)
@@ -21,9 +21,7 @@
sub buildPageURL
{
# Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
+ my ($title, $action, $get) = @_;
# Initial URL: server, script path and title
my $server = $configure::servers{$ARGV[0]};
@@ -49,9 +47,7 @@
sub getPage
{
# Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
+ my ($title, $action, $get) = @_;
my $url = buildPageURL($title, $action, $get);
@@ -60,7 +56,7 @@
{
$result = $browser->get($url);
}
- while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
+ while (!$result->is_success && $result->status_line =~ /302[\s\w]+/);
$result->content; # Return
}
@@ -69,10 +65,7 @@
sub postPage
{
# Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $post = $_[2];
- my $get = $_[3];
+ my ($title, $action, $post, $get) = @_;
my $url = buildPageURL($title, $action, $get);
@@ -81,7 +74,7 @@
{
$result = $browser->post($url, $post);
}
- while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
+ while (!$result->is_success && !$result->status_line =~ /302[\s\w]+/);
$result->content; # Return
}
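One detail worth flagging in the retry loops above: in Perl, ! binds more tightly than =~, so the postPage condition !$result->status_line =~ /302[\s\w]+/ negates the status line first and then pattern-matches the resulting boolean, which can never contain "302". (The getPage loop drops the negation entirely, so it retries while the status is a 302 rather than stopping on one.) A hedged sketch of a precedence-safe form of the presumed intent, retrying until the request succeeds or redirects:

    #!/usr/bin/perl
    use strict;
    use warnings;
    use LWP;

    my $browser = LWP::UserAgent->new;
    my $url = "http://www.server.com/w/index.php";    # hypothetical URL

    my $result;
    do
    {
        $result = $browser->get($url);
    }
    # !~ keeps the negation inside the match: keep retrying only while
    # the request failed AND the status line is not a 302 redirect.
    while (!$result->is_success && $result->status_line !~ /302/);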
From: <rot...@us...> - 2006-09-16 10:54:55
|
Revision: 17
http://svn.sourceforge.net/perlwikibot/?rev=17&view=rev
Author: rotemliss
Date: 2006-09-16 03:54:41 -0700 (Sat, 16 Sep 2006)
Log Message:
-----------
Use namespaces, and directly access the variables instead of using many functions.
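The shape of this refactoring, as a minimal self-contained sketch (package and variable names mirror config/configure.sample in the diff below): a variable declared with our in a named package is readable anywhere under its fully qualified name, so the one-accessor-sub-per-value layer can be dropped.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # What config/configure.sample now does: declare a package and hold
    # the settings in package ("our") variables rather than lexical
    # ("my") variables hidden behind accessor subs.
    package configure;
    our %servers = (
        "server" => "http://www.server.com",
    );

    # What bot.pl and the libraries now do: read the hash directly,
    # fully qualified, instead of calling an accessor such as server().
    package main;
    my $key = "server";    # the real code uses $ARGV[0] here
    print "$configure::servers{$key}\n";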
Modified Paths:
--------------
trunk/bot.pl
trunk/config/configure.sample
trunk/config/runtime.sample
trunk/includes/dump.pm
trunk/includes/functions.pm
trunk/includes/http.pm
Modified: trunk/bot.pl
===================================================================
--- trunk/bot.pl 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/bot.pl 2006-09-16 10:54:41 UTC (rev 17)
@@ -6,6 +6,8 @@
# Libraries
use includes::functions;
+use config::configure;
+use config::runtime;
# Counters
my ($i, $j, $k, $l, $m);
@@ -17,34 +19,34 @@
}
# Show notes about the sending pages configuration
-if (sendPages() == 0)
+if ($configure::sendPages == 0)
{
print "Note: There will be no changes in the real server. Please set sendPages to 1 to do them.\n";
}
-elsif (sendPages() == 2)
+elsif ($configure::sendPages == 2)
{
print "Note: A prefix will be used when editing pages. Please set sendPages to 1 to disable that.\n";
}
# Log in to the server only if you have to do some stuff in the server
-if (connectToServer())
+if ($configure::sendPages == 1 || $configure::sendPages == 2)
{
- login();
+ functions::login();
}
# All the matching pages in the XML file
-my @pages = getMatchingPages();
+my @pages = functions::getMatchingPages();
# Execute actions of all the pages
for ($i = 0; ($i <= $#pages); $i += 2)
{
# Get title and action
my $title = $pages[$i];
- my $action = action($pages[$i + 1]);
+ my $action = $configure::actions{$pages[$i + 1]};
# Execute the action (TODO: add the actions which were supported in the previous bot)
if ($action eq "replace")
{
- replaceInPage($title);
+ functions::replaceInPage($title);
}
}
Modified: trunk/config/configure.sample
===================================================================
--- trunk/config/configure.sample 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/config/configure.sample 2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package configure;
+
# Code style
use warnings;
use strict;
@@ -9,165 +12,56 @@
my ($i, $j, $k, $l, $m);
# Available servers and details about them, the bot should work on
-my %servers = (
+our %servers = (
"server" => "http://www.server.com",
); # The MediaWiki Server
-my %scriptPaths = (
+our %scriptPaths = (
"server" => "/w/index.php",
); # The path to the script index.php (or wiki.phtml in old versions)
-my %userNames = (
+our %userNames = (
"server" => "My-bot",
); # The bot user name in every server
-my %passwords = (
+our %passwords = (
"server" => "password"
); # The bot password in every server
-my %sysopUserNames = (
+our %sysopUserNames = (
"server" => "My-sysop",
); # The sysop bot user name in every server, blank if no
-my %sysopPasswords = (
+our %sysopPasswords = (
"server" => "strong_password",
); # The sysop bot password in every server, blank if no
-my %xmlFiles = (
+our %xmlFiles = (
"server" => "site.xml",
); # The XML file of the database
-my %initialEditSummaries = (
+our %initialEditSummaries = (
"server" => "[[Project:Replace bot|Replace bot]] – "
); # Initial edit summary, when several replaces are done and several edit summaries are integrated, using the replace script
-my %serverPrefixes = (
+our %serverPrefixes = (
"server" => "User:My-bot/Tests/",
); # Prefixes for the saved pages for tests; won't be used when really running the script
# Avilable actions and details about them
-my %actions = (
+our %actions = (
"action" => "move"
); # The global actions done
-my %actionServers = (
+our %actionServers = (
"action" => "server"
); # Which server should it be executed on? TODO - allow using multiple servers like that, currently ignored
-my %titles = (
+our %titles = (
"action" => "^שיחת משתמש:[\\d]+\\.[\\d]+\\.[\\d]+\\.[\\d]+\$"
); # Restrict the actions to these titles
-my %texts = (
+our %texts = (
"action" => "^אנונימי\$"
); # Restrict the actions to the pages
-my %news = (
+our %news = (
"action" => "שיחת משתמש:אנונימי"
); # New title (move) or text (replace), leave blank otherwise
-my %reasons = (
+our %reasons = (
"action" => "מחליף שיחות אנונימיות"
); # Summary/reason sent about the action
-my %minimumTimes = (
+our %minimumTimes = (
"action" => timegm(0, 0, 0, 1, 1 - 1, 1970 - 1900)
); # The minimum diff between the current time and the last edit time
-sub server
-{
- $servers{$ARGV[0]}; # Return
-}
-
-sub scriptPath
-{
- $scriptPaths{$ARGV[0]}; # Return
-}
-
-sub userName
-{
- # Return the appropriate user name, if the sysop is needed or not,
- # using the parameter
- if ($_[0] == 1)
- {
- $sysopUserNames{$ARGV[0]}; # Return
- }
- else
- {
- $userNames{$ARGV[0]}; # Return
- }
-}
-
-sub password
-{
- # Return the appropriate user name, if the sysop is needed or not,
- # using the parameter
- if ($_[0] == 1)
- {
- $sysopPasswords{$ARGV[0]}; # Return
- }
- else
- {
- $passwords{$ARGV[0]}; # Return
- }
-}
-
-sub xmlFile
-{
- $xmlFiles{$ARGV[0]}; # Return
-}
-
-sub initialEditSummary
-{
- $initialEditSummaries{$ARGV[0]}; # Return
-}
-
-sub serverPrefix
-{
- $serverPrefixes{$ARGV[0]}; # Return
-}
-
-sub action
-{
- # Get parameter
- my $actionName = $_[0];
-
- $actions{$actionName}; # Return
-}
-
-sub actionServer
-{
- # Get parameter
- my $actionName = $_[0];
-
- $actionServers{$actionName}; # Return
-}
-
-sub title
-{
- # Get parameter
- my $actionName = $_[0];
-
- $titles{$actionName}; # Return
-}
-
-sub text
-{
- # Get parameter
- my $actionName = $_[0];
-
- $texts{$actionName}; # Return
-}
-
-sub new
-{
- # Get parameter
- my $actionName = $_[0];
-
- $news{$actionName}; # Return
-}
-
-sub reason
-{
- # Get parameter
- my $actionName = $_[0];
-
- $reasons{$actionName}; # Return
-}
-
-sub minimumTime
-{
- # Get parameter
- my $actionName = $_[0];
-
- $minimumTimes{$actionName}; # Return
-}
-
# Return a true value
1;
Modified: trunk/config/runtime.sample
===================================================================
--- trunk/config/runtime.sample 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/config/runtime.sample 2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package configure;
+
# Code style
use warnings;
use strict;
@@ -6,12 +9,12 @@
my ($i, $j, $k, $l, $m);
# Actions to execute
-my @executedActions = (
+our @executedActions = (
"action"
);
# Actions to execute by the way, if you are already editing the page (replace only)
-my @bywayActions = (
+our @bywayActions = (
"actionB"
);
@@ -22,35 +25,7 @@
# * 2 - send to the server with prefix when editing (if not, don't send at all).
# It's strongly recommended that you use 0 before making tests, and 2 before more
# advanced tests, before you use 1 for the real changes.
-my $sendPages = 0;
+our $sendPages = 0;
-sub executedActions
-{
- @executedActions; # Return
-}
-
-sub bywayActions
-{
- @bywayActions; # Return
-}
-
-sub sendPages
-{
- $sendPages; # Return
-}
-
-# Check if we have to connect to server
-sub connectToServer
-{
- if ($sendPages == 1 || $sendPages == 2)
- {
- 1; # Return
- }
- else
- {
- 0; # Return
- }
-}
-
# Return a true value
1;
Modified: trunk/includes/dump.pm
===================================================================
--- trunk/includes/dump.pm 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/dump.pm 2006-09-16 10:54:41 UTC (rev 17)
@@ -4,7 +4,7 @@
#and shifting values for subroutine input should be avoided in
#any subroutines that get called often, like the handlers
-package Parse::MediaWikiDump::Pages;
+package mwdump;
#This parser works by placing all of the start, text, and end events into
#a buffer as they come out of XML::Parser. On each call to page() the function
Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/functions.pm 2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package functions;
+
# Code style
use warnings;
use strict;
@@ -15,7 +18,7 @@
sub getMatchingPages
{
# Get the pages
- my $pages = Parse::MediaWikiDump::Pages->new(xmlFile());
+ my $pages = mwdump->new("dumps/".$configure::xmlFiles{$ARGV[0]});
# Can use only "first-letter", TODO: enable also "case-sensitive" for Wiktionary
if ($pages->case ne "first-letter")
@@ -24,7 +27,7 @@
}
# Get the executed actions
- my @possibleActions = executedActions();
+ my @possibleActions = @configure::executedActions;
# An array contains all the pages
my @matchingPages;
@@ -49,7 +52,7 @@
$tempCounter = 0;
for ($i = 0; ($i <= $#pagePossibleActions); $i++)
{
- my $wantedTitle = title($pagePossibleActions[$i]);
+ my $wantedTitle = $configure::titles{$pagePossibleActions[$i]};
if ($title =~ /$wantedTitle/)
{
$tempTitleArray[$tempCounter++] = $pagePossibleActions[$i];
@@ -70,7 +73,7 @@
$tempCounter = 0;
for ($i = 0; ($i <= $#pagePossibleActions); $i++)
{
- my $wantedText = text($pagePossibleActions[$i]);
+ my $wantedText = $configure::texts{$pagePossibleActions[$i]};
if ($$text =~ /$wantedText/)
{
$tempTextArray[$tempCounter++] = $pagePossibleActions[$i];
@@ -131,16 +134,13 @@
# Log in
sub login
{
- # Get all the executed actions
- my @executedActions = executedActions();
-
- # Sysop finally required?
+ # Is sysop finally required?
my $sysop = 0;
# Go through the array, and check if sysop permission is needed.
- for ($i = 0; ($i <= $#executedActions); $i++)
+ for ($i = 0; ($i <= $#configure::executedActions); $i++)
{
- my $action = action($executedActions[$i]);
+ my $action = $configure::actions{$configure::executedActions[$i]};
if ($action eq "replace")
{
# Continue
@@ -159,12 +159,25 @@
}
}
+ # Get user name and password
+ my ($username, $password);
+ if ($sysop == 1)
+ {
+ $username = $configure::sysopUserNames{$ARGV[0]};
+ $password = $configure::sysopPasswords{$ARGV[0]};
+ }
+ else
+ {
+ $username = $configure::userNames{$ARGV[0]};
+ $password = $configure::passwords{$ARGV[0]};
+ }
+
# Log in
print "Logging in...\n";
- postPage("Special:Userlogin", "submitlogin",
+ http::postPage("Special:Userlogin", "submitlogin",
[
- wpName => userName($sysop),
- wpPassword => password($sysop),
+ wpName => $username,
+ wpPassword => $password,
wpRemember => 0
],
, "type=login");
@@ -177,7 +190,7 @@
# Get parameters
my $title = $_[0];
- getPage($title, "raw"); # Return
+ http::getPage($title, "raw"); # Return
}
# Edit page
@@ -190,13 +203,14 @@
print "Editing page $title...\n";
- if (sendPages() == 2)
+ if ($configure::sendPages == 2)
{
- $title = serverPrefix().$title;
+ my $serverPrefix = $configure::serverPrefixes{$ARGV[0]};
+ $title = $serverPrefix.$title;
}
# Get the edit page contents
- my $editPage = getPage($title, "edit");
+ my $editPage = http::getPage($title, "edit");
# Get the start time
$editPage =~ /<input type='hidden' value="([0-9]{14})" name="wpStarttime" \/>/;
@@ -211,11 +225,11 @@
my $editToken = $1;
# Send page
- if ((defined($editToken)) && ($editToken =~ /[0-9a-f]{32}/))
+ if (defined($editToken) && $editToken =~ /[0-9a-f]{32}/)
{
- if ((sendPages() == 1) || (sendPages() == 2))
+ if ($configure::sendPages == 1 || $configure::sendPages == 2)
{
- postPage($title, "submit",
+ http::postPage($title, "submit",
[
wpSection => "",
wpStarttime => $startTime,
@@ -241,16 +255,16 @@
# Get parameters
my $title = $_[0];
- if (connectToServer())
+ if ($configure::sendPages == 1 || $configure::sendPages == 2)
{
# Get the edited page contents
my $editedPage = getPageContents($title);
# Set initial edit summary
- my $editSummary = initialEditSummary();
+ my $editSummary = $configure::initialEditSummaries{$ARGV[0]};
# Groups array
- my @actions = (executedActions(), bywayActions());
+ my @actions = (@configure::executedActions, @configure::bywayActions);
# Replaced something at all? Flag to check
my $replaced = 0;
@@ -259,16 +273,16 @@
for ($i = 0; ($i <= $#actions); $i++)
{
my $action = $actions[$i];
- my $search = text($action);
+ my $search = $configure::texts{$action};
if ($editedPage =~ /$search/)
{
- my $replace = new($action);
+ my $replace = $configure::news{$action};
$editedPage =~ s/$search/$replace/g;
if ($replaced == 1)
{
$editSummary = $editSummary.", ";
}
- $editSummary = $editSummary.reason($action);
+ $editSummary = $editSummary.$configure::reasons{$action};
$replaced = 1;
}
}
@@ -307,7 +321,7 @@
print "Moving page $title to $newTitle...\n";
# Get the delete page contents
- my $movePage = getPage("Special:Movepage/$title");
+ my $movePage = http::getPage("Special:Movepage/$title");
# Get the edit token
if ($movePage =~ /<input type='hidden' name='wpEditToken' value="([0-9,a-f]{32})" \/>/)
@@ -315,9 +329,9 @@
my $editToken = $1;
# Send page
- if (sendPages() == 1)
+ if ($configure::sendPages == 1)
{
- postPage("Special:Movepage", "submit",
+ http::postPage("Special:Movepage", "submit",
[
wpOldTitle => $title,
wpNewTitle => $newTitle,
@@ -345,7 +359,7 @@
print "Deleting page $title...\n";
# Get the delete page contents
- my $deletePage = getPage($title, "delete");
+ my $deletePage = http::getPage($title, "delete");
# Get the edit token
if ($deletePage =~ /<input type='hidden' name='wpEditToken' value="([0-9,a-f]{32})" \/>/)
@@ -353,9 +367,9 @@
my $editToken = $1;
# Send page
- if (sendPages() == 1)
+ if ($configure::sendPages == 1)
{
- postPage($title, "delete",
+ http::postPage($title, "delete",
[
wpReason => $deleteReason,
wpEditToken => $editToken
Modified: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm 2006-09-01 16:04:29 UTC (rev 16)
+++ trunk/includes/http.pm 2006-09-16 10:54:41 UTC (rev 17)
@@ -1,3 +1,6 @@
+# Package name
+package http;
+
# Code style
use warnings;
use strict;
@@ -4,6 +7,7 @@
# Libraries
use LWP;
+use config::configure;
# Counters
my ($i, $j, $k, $l, $m);
@@ -22,7 +26,9 @@
my $get = $_[2];
# Initial URL: server, script path and title
- my $url = server().scriptPath()."?title=$title";
+ my $server = $configure::servers{$ARGV[0]};
+ my $scriptPath = $configure::scriptPaths{$ARGV[0]};
+ my $url = "$server$scriptPath?title=$title";
# Action
if (defined($action))
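A usage sketch implied by the $ARGV[0] lookups introduced throughout this revision (the invocation itself is an assumption; the key "server" and the values come from configure.sample above): the bot takes a server key as its first command-line argument, and that key selects the matching entry in every per-server hash, as buildPageURL now does.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Stand-in for config::configure, using the configure.sample values:
    package configure;
    our %servers     = ("server" => "http://www.server.com");
    our %scriptPaths = ("server" => "/w/index.php");

    package main;
    # Presumed invocation: perl bot.pl server
    my $key        = defined($ARGV[0]) ? $ARGV[0] : "server";
    my $server     = $configure::servers{$key};
    my $scriptPath = $configure::scriptPaths{$key};
    print "$server$scriptPath?title=Main_Page\n";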
From: <rot...@us...> - 2006-09-01 16:04:39
Revision: 16
http://svn.sourceforge.net/perlwikibot/?rev=16&view=rev
Author: rotemliss
Date: 2006-09-01 09:04:29 -0700 (Fri, 01 Sep 2006)
Log Message:
-----------
Moving the dump file to a better name, and moving the HTTP-related functions into a new HTTP library.
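For orientation, a sketch of the module layout this commit produces (file names as in the diff; the getPage call is taken from functions.pm below). One caveat: functions.pm and http.pm do not declare packages of their own yet (that arrives in r17), so their subs share the caller's namespace and getPage is called unqualified.

    # Layout after this commit:
    #
    #   trunk/bot.pl                  - entry point
    #   trunk/includes/functions.pm   - wiki-level actions (login, edit, ...)
    #   trunk/includes/dump.pm        - the dump parser (renamed from mwdump.pm)
    #   trunk/includes/http.pm        - new home of buildPageURL/getPage/postPage
    #
    # includes/functions.pm now begins with:
    use includes::dump;
    use includes::http;

    # ...and calls the HTTP helpers unqualified, e.g. to fetch raw wikitext:
    my $contents = getPage("Main Page", "raw");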
Modified Paths:
--------------
trunk/includes/functions.pm
Added Paths:
-----------
trunk/includes/dump.pm
trunk/includes/http.pm
Removed Paths:
-------------
trunk/includes/mwdump.pm
Copied: trunk/includes/dump.pm (from rev 6, trunk/includes/mwdump.pm)
===================================================================
--- trunk/includes/dump.pm (rev 0)
+++ trunk/includes/dump.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -0,0 +1,1394 @@
+our $VERSION = '0.31';
+#the POD is at the end of this file
+#avoid shift() - it is computationally more expensive than pop
+#and shifting values for subroutine input should be avoided in
+#any subroutines that get called often, like the handlers
+
+package Parse::MediaWikiDump::Pages;
+
+#This parser works by placing all of the start, text, and end events into
+#a buffer as they come out of XML::Parser. On each call to page() the function
+#checks for a complete article in the buffer and calls for XML::Parser to add
+#more tokens if a complete article is not found. Once a complete article is
+#found it is removed from the buffer, parsed, and an instance of the page
+#object is returned.
+
+use strict;
+use warnings;
+use XML::Parser;
+
+#tokens in the buffer are an array ref with the 0th element specifying
+#its type; these are the constants for those types.
+use constant T_START => 1;
+use constant T_END => 2;
+use constant T_TEXT => 3;
+
+sub new {
+ my $class = shift;
+ my $source = shift;
+ my $self = {};
+
+ bless($self, $class);
+
+ $$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8');
+ $$self{PARSER}->setHandlers('Start', \&start_handler,
+ 'End', \&end_handler,
+ 'Char', \&char_handler);
+ $$self{EXPAT} = $$self{PARSER}->parse_start(state => $self);
+ $$self{BUFFER} = [];
+ $$self{CHUNK_SIZE} = 32768;
+ $$self{BUF_LIMIT} = 10000;
+ $$self{BYTE} = 0;
+
+ $self->open($source);
+ $self->init;
+
+ return $self;
+}
+
+sub next {
+ my $self = shift;
+ my $buffer = $$self{BUFFER};
+ my $offset;
+ my @page;
+
+ #look through the contents of our buffer for a complete article; fill
+ #the buffer with more data if an entire article is not there
+ while(1) {
+ $offset = $self->search_buffer([T_END, 'page']);
+ last if $offset != -1;
+
+ #indicates EOF
+ return undef unless $self->parse_more;
+ }
+
+ #remove the entire page from the buffer
+ @page = splice(@$buffer, 0, $offset + 1);
+
+ if (! token_compare($page[0], [T_START, 'page'])) {
+ $self->dump($buffer);
+ die "expected <page>; got " . token2text($page[0]);
+ }
+
+ my $data = $self->parse_page(\@page);
+
+ return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR});
+}
+
+#outputs a nicely formated representation of the tokens on the buffer specified
+sub dump {
+ my $self = shift;
+ my $buffer = shift || $$self{BUFFER};
+ my $offset = 0;
+
+ foreach my $i (0 .. $#$buffer) {
+ my $token = $$buffer[$i];
+
+ print STDERR "$i ";
+
+ if ($$token[0] == T_START) {
+ my $attr = $$token[2];
+ print STDERR " " x $offset;
+ print STDERR "START $$token[1] ";
+
+ foreach my $key (sort(keys(%$attr))) {
+ print STDERR "$key=\"$$attr{$key}\" ";
+ }
+
+ print STDERR "\n";
+ $offset++;
+ } elsif ($$token[0] == T_END) {
+ $offset--;
+ print STDERR " " x $offset;
+ print STDERR "END $$token[1]\n";
+ } elsif ($$token[0] == T_TEXT) {
+ my $ref = $$token[1];
+ print STDERR " " x $offset;
+ print STDERR "TEXT ";
+
+ my $len = length($$ref);
+
+ if ($len < 50) {
+ print STDERR "'$$ref'\n";
+ } else {
+ print STDERR "$len characters\n";
+ }
+ }
+ }
+
+ return 1;
+}
+
+sub sitename {
+ my $self = shift;
+ return $$self{HEAD}{sitename};
+}
+
+sub base {
+ my $self = shift;
+ return $$self{HEAD}{base};
+}
+
+sub generator {
+ my $self = shift;
+ return $$self{HEAD}{generator};
+}
+
+sub case {
+ my $self = shift;
+ return $$self{HEAD}{case};
+}
+
+sub namespaces {
+ my $self = shift;
+ return $$self{HEAD}{namespaces};
+}
+
+sub current_byte {
+ my $self = shift;
+ return $$self{BYTE};
+}
+
+#depreciated backwards compatibility methods
+
+#replaced by next()
+sub page {
+ my $self = shift;
+ return $self->next(@_);
+}
+
+#private functions with OO interface
+sub open {
+ my $self = shift;
+ my $source = shift;
+
+ if (ref($source) eq 'GLOB') {
+ $$self{SOURCE} = $source;
+ } elsif (! open($$self{SOURCE}, $source)) {
+ die "could not open $source: $!";
+ }
+
+ binmode($$self{SOURCE}, ':utf8');
+
+ return 1;
+}
+
+sub init {
+ my $self = shift;
+ my $offset;
+ my @head;
+
+ #parse more XML until the entire siteinfo section is in the buffer
+ while(1) {
+ die "could not init" unless $self->parse_more;
+
+ $offset = $self->search_buffer([T_END, 'siteinfo']);
+
+ last if $offset != -1;
+ }
+
+ #pull the siteinfo section out of the buffer
+ @head = splice(@{$$self{BUFFER}}, 0, $offset + 1);
+
+ $self->parse_head(\@head);
+
+ return 1;
+}
+
+#feed data into expat and have it put more tokens onto the buffer
+sub parse_more {
+ my ($self) = @_;
+ my $buf;
+
+ my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE});
+
+ if (! defined($read)) {
+ die "error during read: $!";
+ } elsif ($read == 0) {
+ $$self{FINISHED} = 1;
+ $$self{EXPAT}->parse_done();
+ return 0;
+ }
+
+ $$self{BYTE} += $read;
+ $$self{EXPAT}->parse_more($buf);
+
+ my $buflen = scalar(@{$$self{BUFFER}});
+
+ die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless
+ $buflen < $$self{BUF_LIMIT};
+
+ return 1;
+}
+
+#searches through a buffer for a specified token
+sub search_buffer {
+ my ($self, $search, $list) = @_;
+
+ $list = $$self{BUFFER} unless defined $list;
+
+ return -1 if scalar(@$list) == 0;
+
+ foreach my $i (0 .. $#$list) {
+ return $i if token_compare($$list[$i], $search);
+ }
+
+ return -1;
+}
+
+#this function is very frightning =)
+sub parse_head {
+ my $self = shift;
+ my $buffer = shift;
+ my $state = 'start';
+ my %data = (namespaces => []);
+
+ for (my $i = 0; $i <= $#$buffer; $i++) {
+ my $token = $$buffer[$i];
+
+ if ($state eq 'start') {
+ my $version;
+ die "$i: expected <mediawiki> got " . token2text($token) unless
+ token_compare($token, [T_START, 'mediawiki']);
+
+ die "$i: version is a required attribute" unless
+ defined($version = $$token[2]->{version});
+
+ die "$i: version $version unsupported" unless $version eq '0.3';
+
+ $token = $$buffer[++$i];
+
+ die "$i: expected <siteinfo> got " . token2text($token) unless
+ token_compare($token, [T_START, 'siteinfo']);
+
+ $state = 'in_siteinfo';
+ } elsif ($state eq 'in_siteinfo') {
+ if (token_compare($token, [T_START, 'namespaces'])) {
+ $state = 'in_namespaces';
+ next;
+ } elsif (token_compare($token, [T_END, 'siteinfo'])) {
+ last;
+ } elsif (token_compare($token, [T_START, 'sitename'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{sitename} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'sitename'])) {
+ die "$i: expected </sitename> but got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'base'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{base} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'base'])) {
+ $self->dump($buffer);
+ die "$i: expected </base> but got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'generator'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{generator} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'generator'])) {
+ $self->dump($buffer);
+ die "$i: expected </generator> but got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'case'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected </case> but got " . token2text($token);
+ }
+
+ $data{case} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'case'])) {
+ $self->dump($buffer);
+ die "$i: expected </case> but got " . token2text($token);
+ }
+ }
+
+ } elsif ($state eq 'in_namespaces') {
+ my $key;
+ my $name;
+
+ if (token_compare($token, [T_END, 'namespaces'])) {
+ $state = 'in_siteinfo';
+ next;
+ }
+
+ if (! token_compare($token, [T_START, 'namespace'])) {
+ die "$i: expected <namespace> or </namespaces>; got " . token2text($token);
+ }
+
+ die "$i: key is a required attribute" unless
+ defined($key = $$token[2]->{key});
+
+ $token = $$buffer[++$i];
+
+ #the default namespace has no text associated with it
+ if ($$token[0] == T_TEXT) {
+ $name = ${$$token[1]};
+ } elsif (token_compare($token, [T_END, 'namespace'])) {
+ $name = '';
+ $i--; #move back one for below
+ } else {
+ die "$i: should never happen";
+ }
+
+ push(@{$data{namespaces}}, [$key, $name]);
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'namespace'])) {
+ $self->dump($buffer);
+ die "$i: expected </namespace> but got " . token2text($token);
+ }
+
+ } else {
+ die "$i: unknown state '$state'";
+ }
+ }
+
+ $$self{HEAD} = \%data;
+
+ #locate the anchor that indicates what looks like a link is really a
+ #category assignment ([[foo]] vs [[Category:foo]])
+ #fix for bug #16616
+ foreach my $ns (@{$data{namespaces}}) {
+ #namespace 14 is the category namespace
+ if ($$ns[0] == 14) {
+ $$self{CATEGORY_ANCHOR} = $$ns[1];
+ last;
+ }
+ }
+
+ if (! defined($$self{CATEGORY_ANCHOR})) {
+ die "Could not locate category indicator in namespace definitions";
+ }
+
+ return 1;
+}
+
+#this function is very frightning =)
+sub parse_page {
+ my $self = shift;
+ my $buffer = shift;
+ my %data;
+ my $state = 'start';
+
+ for (my $i = 0; $i <= $#$buffer; $i++) {
+ my $token = $$buffer[$i];
+
+ if ($state eq 'start') {
+ if (! token_compare($token, [T_START, 'page'])) {
+ $self->dump($buffer);
+ die "$i: expected <page>; got " . token2text($token);
+ }
+
+ $state = 'in_page';
+ } elsif ($state eq 'in_page') {
+ if (token_compare($token, [T_START, 'revision'])) {
+ $state = 'in_revision';
+ next;
+ } elsif (token_compare($token, [T_END, 'page'])) {
+ last;
+ } elsif (token_compare($token, [T_START, 'title'])) {
+ $token = $$buffer[++$i];
+
+ if (token_compare($token, [T_END, 'title'])) {
+ $data{title} = '';
+ next;
+ }
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{title} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'title'])) {
+ $self->dump($buffer);
+ die "$i: expected </title>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{id} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expected </id>; got " . token2text($token);
+ }
+ }
+ } elsif ($state eq 'in_revision') {
+ if (token_compare($token, [T_END, 'revision'])) {
+ #If a comprehensive dump file is parsed
+ #it can cause uncontrolled stack growth and the
+ #parser only returns one revision out of
+ #all revisions - if we run into a
+ #comprehensive dump file, indicated by more
+ #than one <revision> section inside a <page>
+ #section then die with a message
+
+ #just peeking ahead, don't want to update
+ #the index
+ $token = $$buffer[$i + 1];
+
+ if (token_compare($token, [T_START, 'revision'])) {
+ die "unable to properly parse comprehensive dump files";
+ }
+
+ $state = 'in_page';
+ next;
+ } elsif (token_compare($token, [T_START, 'contributor'])) {
+ $state = 'in_contributor';
+ next;
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{revision_id} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expected </id>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'timestamp'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{timestamp} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'timestamp'])) {
+ $self->dump($buffer);
+ die "$i: expected </timestamp>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'minor'])) {
+ $data{minor} = 1;
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'minor'])) {
+ $self->dump($buffer);
+ die "$i: expected </minor>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'comment'])) {
+ $token = $$buffer[++$i];
+
+ #account for possible null-text
+ if (token_compare($token, [T_END, 'comment'])) {
+ $data{comment} = '';
+ next;
+ }
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{comment} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'comment'])) {
+ $self->dump($buffer);
+ die "$i: expected </comment>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'text'])) {
+ my $token = $$buffer[++$i];
+
+ if (token_compare($token, [T_END, 'text'])) {
+ ${$data{text}} = '';
+ next;
+ } elsif ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{text} = $$token[1];
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'text'])) {
+ $self->dump($buffer);
+ die "$i: expected </text>; got " . token2text($token);
+ }
+
+ }
+
+ } elsif ($state eq 'in_contributor') {
+ if (token_compare($token, [T_END, 'contributor'])) {
+ $state = 'in_revision';
+ next;
+ } elsif (token_compare($token, [T_START, 'username'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expecting TEXT; got " . token2text($token);
+ }
+
+ $data{username} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'username'])) {
+ $self->dump($buffer);
+ die "$i: expected </username>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expecting TEXT; got " . token2text($token);
+ }
+
+ $data{userid} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expecting </id>; got " . token2text($token);
+ }
+ }
+ } else {
+ die "unknown state: $state";
+ }
+ }
+
+ $data{minor} = 0 unless defined($data{minor});
+
+ return \%data;
+}
+
+#private functions with out OO interface
+sub token2text {
+ my $token = shift;
+
+ if ($$token[0] == T_START) {
+ return "<$$token[1]>";
+ } elsif ($$token[0] == T_END) {
+ return "</$$token[1]>";
+ } elsif ($$token[0] == T_TEXT) {
+ return "!text_token!";
+ } else {
+ return "!unknown!";
+ }
+}
+
+#this function is where the majority of time is spent in this software
+sub token_compare {
+ my ($toke1, $toke2) = @_;
+
+ foreach my $i (0 .. $#$toke2) {
+ if ($$toke1[$i] ne $$toke2[$i]) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+sub start_handler {
+ my ($p, $tag, %atts) = @_;
+ my $self = $p->{state};
+
+ push(@{$$self{BUFFER}}, [T_START, $tag, \%atts]);
+
+ return 1;
+}
+
+sub end_handler {
+ my ($p, $tag) = @_;
+ my $self = $p->{state};
+
+ push(@{$$self{BUFFER}}, [T_END, $tag]);
+
+ return 1;
+}
+
+sub char_handler {
+ my ($p, $chars) = @_;
+ my $self = $p->{state};
+ my $buffer = $$self{BUFFER};
+ my $curent = $$buffer[$#$buffer];
+
+ if (! defined($curent)) {
+ #skip any text not inside a container
+ return 1;
+ } elsif ($$curent[0] == T_TEXT) {
+ ${$$curent[1]} .= $chars;
+ } elsif ($$curent[0] == T_START) {
+ my $ignore_ws_only = 1;
+
+ #work around for bug #16583 - All spaces is a possible username
+ # at least if done via unicode. Force white space preservation
+ #for now.
+ if ($$curent[1] eq 'username') {
+ $ignore_ws_only = 0;
+ }
+
+ if (defined($$curent[2]->{'xml:space'}) &&
+ ($$curent[2]->{'xml:space'} eq 'preserve')) {
+ $ignore_ws_only = 0;
+ }
+
+ if ($ignore_ws_only) {
+ #non-breaking spaces are not whitespace in XML
+ return 1 if $chars =~ m/^[ \t\r\n]+$/m;
+ }
+
+ push(@$buffer, [T_TEXT, \$chars]);
+ }
+
+ return 1;
+}
+
+package Parse::MediaWikiDump::page;
+
+use strict;
+use warnings;
+
+sub new {
+ my ($class, $data, $category_anchor, $case_setting) = @_;
+ my $self = {};
+
+ bless($self, $class);
+
+ $$self{DATA} = $data;
+ $$self{CACHE} = {};
+ $$self{CATEGORY_ANCHOR} = $category_anchor;
+
+ return $self;
+}
+
+sub namespace {
+ my $self = shift;
+
+ return $$self{CACHE}{namespace} if defined($$self{CACHE}{namespace});
+
+ my $title = $$self{DATA}{title};
+
+ if ($title =~ m/^([^:]+)\:/) {
+ $$self{CACHE}{namespace} = $1;
+ return $1;
+ } else {
+ $$self{CACHE}{namespace} = '';
+ return '';
+ }
+}
+
+sub categories {
+ my $self = shift;
+ my $anchor = $$self{CATEGORY_ANCHOR};
+
+ return $$self{CACHE}{categories} if defined($$self{CACHE}{categories});
+
+ my $text = $$self{DATA}{text};
+ my @cats;
+
+ while($$text =~ m/\[\[$anchor:\s*([^\]]+)\]\]/gi) {
+ my $buf = $1;
+
+ #deal with the pipe trick
+ $buf =~ s/\|.*$//;
+ push(@cats, $buf);
+ }
+
+ return undef if scalar(@cats) == 0;
+
+ $$self{CACHE}{categories} = \@cats;
+
+ return \@cats;
+}
+
+sub redirect {
+ my $self = shift;
+ my $text = $$self{DATA}{text};
+
+ return $$self{CACHE}{redirect} if exists($$self{CACHE}{redirect});
+
+ if ($$text =~ m/^#redirect\s*:?\s*\[\[([^\]]*)\]\]/i) {
+ $$self{CACHE}{redirect} = $1;
+ return $1;
+ } else {
+ $$self{CACHE}{redirect} = undef;
+ return undef;
+ }
+}
+
+sub title {
+ my $self = shift;
+ return $$self{DATA}{title};
+}
+
+sub id {
+ my $self = shift;
+ return $$self{DATA}{id};
+}
+
+sub revision_id {
+ my $self = shift;
+ return $$self{DATA}{revision_id};
+}
+
+sub timestamp {
+ my $self = shift;
+ return $$self{DATA}{timestamp};
+}
+
+sub username {
+ my $self = shift;
+ return $$self{DATA}{username};
+}
+
+sub userid {
+ my $self = shift;
+ return $$self{DATA}{userid};
+}
+
+sub minor {
+ my $self = shift;
+ return $$self{DATA}{minor};
+}
+
+sub text {
+ my $self = shift;
+ return $$self{DATA}{text};
+}
+
+package Parse::MediaWikiDump::Links;
+
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ my $source = shift;
+ my $self = {};
+ $$self{BUFFER} = [];
+
+ bless($self, $class);
+
+ $self->open($source);
+ $self->init;
+
+ return $self;
+}
+
+sub next {
+ my $self = shift;
+ my $buffer = $$self{BUFFER};
+ my $link;
+
+ while(1) {
+ if (defined($link = pop(@$buffer))) {
+ last;
+ }
+
+ #signals end of input
+ return undef unless $self->parse_more;
+ }
+
+ return Parse::MediaWikiDump::link->new($link);
+}
+
+#private functions with OO interface
+sub parse_more {
+ my $self = shift;
+ my $source = $$self{SOURCE};
+ my $need_data = 1;
+
+ while($need_data) {
+ my $line = <$source>;
+
+ last unless defined($line);
+
+ while($line =~ m/\((\d+),(-?\d+),'(.*?)'\)[;,]/g) {
+ push(@{$$self{BUFFER}}, [$1, $2, $3]);
+ $need_data = 0;
+ }
+ }
+
+ #if we still need data and we are here it means we ran out of input
+ if ($need_data) {
+ return 0;
+ }
+
+ return 1;
+}
+
+sub open {
+ my $self = shift;
+ my $source = shift;
+
+ if (ref($source) ne 'GLOB') {
+ die "could not open $source: $!" unless
+ open($$self{SOURCE}, $source);
+ } else {
+ $$self{SOURCE} = $source;
+ }
+
+ binmode($$self{SOURCE}, ':utf8');
+
+ return 1;
+}
+
+sub init {
+ my $self = shift;
+ my $source = $$self{SOURCE};
+ my $found = 0;
+
+ while(<$source>) {
+ if (m/^LOCK TABLES `pagelinks` WRITE;/) {
+ $found = 1;
+ last;
+ }
+ }
+
+ die "not a MediaWiki link dump file" unless $found;
+}
+
+#depreciated backwards compatibility methods
+
+#replaced by next()
+sub link {
+ my $self = shift;
+ $self->next(@_);
+}
+
+package Parse::MediaWikiDump::link;
+
+#you must pass in a fully populated link array reference
+sub new {
+ my $class = shift;
+ my $self = shift;
+
+ bless($self, $class);
+
+ return $self;
+}
+
+sub from {
+ my $self = shift;
+ return $$self[0];
+}
+
+sub namespace {
+ my $self = shift;
+ return $$self[1];
+}
+
+sub to {
+ my $self = shift;
+ return $$self[2];
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Parse::MediaWikiDump - Tools to process MediaWiki dump files
+
+=head1 SYNOPSIS
+
+ use Parse::MediaWikiDump;
+
+ $source = 'dump_filename.ext';
+ $source = \*FILEHANDLE;
+
+ $pages = Parse::MediaWikiDump::Pages->new($source);
+ $links = Parse::MediaWikiDump::Links->new($source);
+
+ #get all the records from the dump files, one record at a time
+ while(defined($page = $pages->next)) {
+ print "title '", $page->title, "' id ", $page->id, "\n";
+ }
+
+ while(defined($link = $links->next)) {
+ print "link from ", $link->from, " to ", $link->to, "\n";
+ }
+
+ #information about the page dump file
+ $pages->sitename;
+ $pages->base;
+ $pages->generator;
+ $pages->case;
+ $pages->namespaces;
+
+ #information about a page record
+ $page->redirect;
+ $page->categories;
+ $page->title;
+ $page->namespace;
+ $page->id;
+ $page->revision_id;
+ $page->timestamp;
+ $page->username;
+ $page->userid;
+ $page->minor;
+ $page->text;
+
+ #information about a link
+ $link->from;
+ $link->to;
+ $link->namespace;
+
+=head1 DESCRIPTION
+
+This module provides the tools needed to process the contents of various
+MediaWiki dump files.
+
+=head1 USAGE
+
+To use this module you must create an instance of a parser for the type of
+dump file you are trying to parse. The current parsers are:
+
+=over 4
+
+=item Parse::MediaWikiDump::Pages
+
+Parse the contents of the page archive.
+
+=item Parse::MediaWikiDump::Links
+
+Parse the contents of the links dump file.
+
+=back
+
+=head2 General
+
+Both parsers require an argument to new that is a location of source data
+to parse; this argument can be either a filename or a reference to an already
+open filehandle. This entire software suite will die() upon errors in the file,
+inconsistencies on the stack, etc. If this concerns you then you can wrap
+the portion of your code that uses these calls with eval().
+
+=head2 Parse::MediaWikiDump::Pages
+
+It is possible to create a Parse::MediaWikiDump::Pages object two ways:
+
+=over 4
+
+=item $pages = Parse::MediaWikiDump::Pages->new($filename);
+
+=item $pages = Parse::MediaWikiDump::Pages->new(\*FH);
+
+=back
+
+After creation the folowing methods are avalable:
+
+=over 4
+
+=item $pages->next
+
+Returns the next available record from the dump file if it is available,
+otherwise returns undef. Records returned are instances of
+Parse::MediaWikiDump::page; see below for information on those objects.
+
+=item $pages->sitename
+
+Returns the plain-text name of the instance the dump is from.
+
+=item $pages->base
+
+Returns the base url to the website of the instance.
+
+=item $pages->generator
+
+Returns the version of the software that generated the file.
+
+=item $pages->case
+
+Returns the case-sensitivity configuration of the instance.
+
+=item $pages->namespaces
+
+Returns an array reference to the list of namespaces in the instance. Each
+namespace is stored as an array reference which has two items; the first is the
+namespace number and the second is the namespace name. In the case of namespace
+0 the text stored for the name is ''
+
+=back
+
+=head3 Parse::MediaWikiDump::page
+
+The Parse::MediaWikiDump::page object represents a distinct MediaWiki page,
+article, module, what have you. These objects are returned by the next() method
+of a Parse::MediaWikiDump::Pages instance. The scalar returned is a reference
+to a hash that contains all the data of the page in a straightforward manor.
+While it is possible to access this hash directly, and it involves less overhead
+than using the methods below, it is beyond the scope of the interface and is
+undocumented.
+
+Some of the methods below require additional processing, such as namespaces,
+redirect, and categories, to name a few. In these cases the returned result
+is cached and stored inside the object so the processing does not have to be
+redone. This is transparent to you; just know that you don't have to worry about
+optimizing calls to these functions to limit processing overhead.
+
+The following methods are available:
+
+=over 4
+
+=item $page->id
+
+=item $page->title
+
+=item $page->namespace
+
+Returns an empty string (such as '') for the main namespace or a string
+containing the name of the namespace.
+
+=item $page->text
+
+A reference to a scalar containing the plaintext of the page.
+
+=item $page->redirect
+
+The plain text name of the article redirected to or undef if the page is not
+a redirect.
+
+=item $page->categories
+
+Returns a reference to an array that contains a list of categories or undef
+if there are no categories. This method does not understand templates and may
+not return all the categories the article actually belongs in.
+
+=item $page->revision_id
+
+=item $page->timestamp
+
+=item $page->username
+
+=item $page->userid
+
+=item $page->minor
+
+=back
+
+=head2 Parse::MediaWikiDump::Links
+
+This module also takes either a filename or a reference to an already open
+filehandle. For example:
+
+ $links = Parse::MediaWikiDump::Links->new($filename);
+ $links = Parse::MediaWikiDump::Links->new(\*FH);
+
+It is then possible to extract the links a single link at a time using the
+next method, which returns an instance of Parse::MediaWikiDump::link or undef
+when there is no more data. For instance:
+
+ while(defined($link = $links->next)) {
+ print 'from ', $link->from, ' to ', $link->to, "\n";
+ }
+
+=head3 Parse::MediaWikiDump::link
+
+Instances of this class are returned by the link method of a
+Parse::MediaWikiDump::Links instance. The following methods are available:
+
+=over 4
+
+=item $link->from
+
+The numerical id the link was in.
+
+=item $link->to
+
+The plain text name the link is to, minus the namespace.
+
+=item $link->namespace
+
+The numerical id of the namespace the link points to.
+
+=back
+
+=head1 EXAMPLES
+
+=head2 Extract the article text for a given title
+
+ #!/usr/bin/perl
+
+ use strict;
+ use warnings;
+ use Parse::MediaWikiDump;
+
+ my $file = shift(@ARGV) or die "must specify a MediaWiki dump of the current pages";
+ my $title = shift(@ARGV) or die "must specify an article title";
+ my $dump = Parse::MediaWikiDump::Pages->new($file);
+
+ binmode(STDOUT, ':utf8');
+ binmode(STDERR, ':utf8');
+
+ #this is the only currently known value but there could be more in the future
+ if ($dump->case ne 'first-letter') {
+ die "unable to handle any case setting besides 'first-letter'";
+ }
+
+ $title = case_fixer($title);
+
+ while(my $page = $dump->next) {
+ if ($page->title eq $title) {
+ print STDERR "Located text for $title\n";
+ my $text = $page->text;
+ print $$text;
+ exit 0;
+ }
+ }
+
+ print STDERR "Unable to find article text for $title\n";
+ exit 1;
+
+ #removes any case sensativity from the very first letter of the title
+ #but not from the optional namespace name
+ sub case_fixer {
+ my $title = shift;
+
+ #check for namespace
+ if ($title =~ /^(.+?):(.+)/) {
+ $title = $1 . ':' . ucfirst($2);
+ } else {
+ $title = ucfirst($title);
+ }
+
+ return $title;
+ }
+
+=head2 Scan the dump file for double redirects
+
+ #!/usr/bin/perl
+
+ #progress information goes to STDERR, a list of double redirects found
+ #goes to STDOUT
+
+ binmode(STDOUT, ":utf8");
+ binmode(STDERR, ":utf8");
+
+ use strict;
+ use warnings;
+ use Parse::MediaWikiDump;
+
+ my $file = shift(@ARGV);
+ my $pages;
+ my $page;
+ my %redirs;
+ my $artcount = 0;
+ my $file_size;
+ my $start = time;
+
+ if (defined($file)) {
+ $file_size = (stat($file))[7];
+ $pages = Parse::MediaWikiDump::Pages->new($file);
+ } else {
+ print STDERR "No file specified, using standard input\n";
+ $pages = Parse::MediaWikiDump::Pages->new(\*STDIN);
+ }
+
+ #the case of the first letter of titles is ignored - force this option
+ #because the other values of the case setting are unknown
+ die 'this program only supports the first-letter case setting' unless
+ $pages->case eq 'first-letter';
+
+ print STDERR "Analyzing articles:\n";
+
+ while(defined($page = $pages->next)) {
+ update_ui() if ++$artcount % 500 == 0;
+
+ #main namespace only
+ next unless $page->namespace eq '';
+ next unless defined($page->redirect);
+
+ my $title = case_fixer($page->title);
+ #create a list of redirects indexed by their original name
+ $redirs{$title} = case_fixer($page->redirect);
+ }
+
+ my $redir_count = scalar(keys(%redirs));
+ print STDERR "done; searching $redir_count redirects:\n";
+
+ my $count = 0;
+
+ #if a redirect location is also a key to the index we have a double redirect
+ foreach my $key (keys(%redirs)) {
+ my $redirect = $redirs{$key};
+
+ if (defined($redirs{$redirect})) {
+ print "$key\n";
+ $count++;
+ }
+ }
+
+ print STDERR "discovered $count double redirects\n";
+
+ #removes any case sensativity from the very first letter of the title
+ #but not from the optional namespace name
+ sub case_fixer {
+ my $title = shift;
+
+ #check for namespace
+ if ($title =~ /^(.+?):(.+)/) {
+ $title = $1 . ':' . ucfirst($2);
+ } else {
+ $title = ucfirst($title);
+ }
+
+ return $title;
+ }
+
+ sub pretty_bytes {
+ my $bytes = shift;
+ my $pretty = int($bytes) . ' bytes';
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = int($bytes) . ' kilobytes';
+ }
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = sprintf("%0.2f", $bytes) . ' megabytes';
+ }
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = sprintf("%0.4f", $bytes) . ' gigabytes';
+ }
+
+ return $pretty;
+ }
+
+ sub pretty_number {
+ my $number = reverse(shift);
+ $number =~ s/(...)/$1,/g;
+ $number = reverse($number);
+ $number =~ s/^,//;
+
+ return $number;
+ }
+
+ sub update_ui {
+ my $seconds = time - $start;
+ my $bytes = $pages->current_byte;
+
+ print STDERR " ", pretty_number($artcount), " articles; ";
+ print STDERR pretty_bytes($bytes), " processed; ";
+
+ if (defined($file_size)) {
+ my $percent = int($bytes / $file_size * 100);
+
+ print STDERR "$percent% completed\n";
+ } else {
+ my $bytes_per_second = int($bytes / $seconds);
+ print STDERR pretty_bytes($bytes_per_second), " per second\n";
+ }
+ }
+
+=head1 TODO
+
+=over 4
+
+=item Support comprehensive dump files
+
+Currently the full page dump files (such as 20050909_pages_full.xml.gz)
+are not supported.
+
+=item Optimization
+
+It would be nice to increase the processing speed of the XML files. Current
+ideas:
+
+=over 4
+
+=item Move to arrays instead of hashes for base objects
+
+Currently the base types for the majority of the classes are hashes. The
+majority of these could be changed to arrays and numerical constants instead
+of using hashes.
+
+=item Stackless parsing
+
+placing each XML token on the stack is probably quite time consuming. It may be
+beter to move to a stackless system where the XML parser is given a new set
+of callbacks to use when it encounters each specific token.
+
+=back
+
+=back
+
+=head1 AUTHOR
+
+This module was created and documented by Tyler Riddle E<lt>tr...@gm...<gt>.
+
+=head1 BUGS
+
+Please report any bugs or feature requests to
+C<bug...@rt...>, or through the web interface at
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-MediaWikiDump>.
+I will be notified, and then you'll automatically be notified of progress on
+your bug as I make changes.
+
+=head2 Known Bugs
+
+No known bugs at this time.
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Tyler Riddle, all rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm 2006-09-01 11:27:14 UTC (rev 15)
+++ trunk/includes/functions.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -3,20 +3,14 @@
use strict;
# Libraries
-use LWP;
-use HTTP::Cookies;
-use includes::mwdump;
+use includes::dump;
+use includes::http;
use config::configure;
use config::runtime;
# Counters
my ($i, $j, $k, $l, $m);
-# The browser we use to surf; enable cookies, and use a specific user agent
-my $browser = LWP::UserAgent->new;
-$browser->cookie_jar({});
-$browser->agent("Mozilla/5.0 (compatible; Perl Wikibot)");
-
# Get all the pages matching specific restrictions
sub getMatchingPages
{
@@ -134,73 +128,6 @@
timegm($6, $5, $4, $3, $2 - 1, $1 - 1900); # Return
}
-# Build the URL of a wiki page
-sub buildPageURL
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
-
- # Initial URL: server, script path and title
- my $url = server().scriptPath()."?title=$title";
-
- # Action
- if (defined($action))
- {
- $url = "$url&action=$action";
- }
-
- # Parameters
- if (defined($get))
- {
- $url = "$url&$get";
- }
-
- $url; # Return
-}
-
-# Get a wiki page, try again and again if error
-sub getPage
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
-
- my $url = buildPageURL($title, $action, $get);
-
- my $result;
- do
- {
- $result = $browser->get($url);
- }
- while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
-
- $result->content; # Return
-}
-
-# Post a wiki page, try again and again if error
-sub postPage
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $post = $_[2];
- my $get = $_[3];
-
- my $url = buildPageURL($title, $action, $get);
-
- my $result;
- do
- {
- $result = $browser->post($url, $post);
- }
- while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
-
- $result->content; # Return
-}
-
# Log in
sub login
{
Added: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm (rev 0)
+++ trunk/includes/http.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -0,0 +1,81 @@
+# Code style
+use warnings;
+use strict;
+
+# Libraries
+use LWP;
+
+# Counters
+my ($i, $j, $k, $l, $m);
+
+# The browser we use to surf; enable cookies, and use a specific user agent
+my $browser = LWP::UserAgent->new;
+$browser->cookie_jar({});
+$browser->agent("Mozilla/5.0 (compatible; Perl MediaWiki Robot)");
+
+# Build the URL of a wiki page
+sub buildPageURL
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $get = $_[2];
+
+ # Initial URL: server, script path and title
+ my $url = server().scriptPath()."?title=$title";
+
+ # Action
+ if (defined($action))
+ {
+ $url = "$url&action=$action";
+ }
+
+ # Parameters
+ if (defined($get))
+ {
+ $url = "$url&$get";
+ }
+
+ $url; # Return
+}
+
+# Get a wiki page, try again and again if error
+sub getPage
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $get = $_[2];
+
+ my $url = buildPageURL($title, $action, $get);
+
+ my $result;
+ do
+ {
+ $result = $browser->get($url);
+ }
+ while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
+
+ $result->content; # Return
+}
+
+# Post a wiki page, try again and again if error
+sub postPage
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $post = $_[2];
+ my $get = $_[3];
+
+ my $url = buildPageURL($title, $action, $get);
+
+ my $result;
+ do
+ {
+ $result = $browser->post($url, $post);
+ }
+ while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
+
+ $result->content; # Return
+}
Property changes on: trunk/includes/http.pm
___________________________________________________________________
Name: svn:eol-style
+ native
Deleted: trunk/includes/mwdump.pm
===================================================================
--- trunk/includes/mwdump.pm 2006-09-01 11:27:14 UTC (rev 15)
+++ trunk/includes/mwdump.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -1,1394 +0,0 @@
-our $VERSION = '0.31';
-#the POD is at the end of this file
-#avoid shift() - it is computationally more expensive than pop
-#and shifting values for subroutine input should be avoided in
-#any subroutines that get called often, like the handlers
-
-package Parse::MediaWikiDump::Pages;
-
-#This parser works by placing all of the start, text, and end events into
-#a buffer as they come out of XML::Parser. On each call to page() the function
-#checks for a complete article in the buffer and calls for XML::Parser to add
-#more tokens if a complete article is not found. Once a complete article is
-#found it is removed from the buffer, parsed, and an instance of the page
-#object is returned.
-
-use strict;
-use warnings;
-use XML::Parser;
-
-#tokens in the buffer are an array ref with the 0th element specifying
-#its type; these are the constants for those types.
-use constant T_START => 1;
-use constant T_END => 2;
-use constant T_TEXT => 3;
-
-sub new {
- my $class = shift;
- my $source = shift;
- my $self = {};
-
- bless($self, $class);
-
- $$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8');
- $$self{PARSER}->setHandlers('Start', \&start_handler,
- 'End', \&end_handler,
- 'Char', \&char_handler);
- $$self{EXPAT} = $$self{PARSER}->parse_start(state => $self);
- $$self{BUFFER} = [];
- $$self{CHUNK_SIZE} = 32768;
- $$self{BUF_LIMIT} = 10000;
- $$self{BYTE} = 0;
-
- $self->open($source);
- $self->init;
-
- return $self;
-}
-
-sub next {
- my $self = shift;
- my $buffer = $$self{BUFFER};
- my $offset;
- my @page;
-
- #look through the contents of our buffer for a complete article; fill
- #the buffer with more data if an entire article is not there
- while(1) {
- $offset = $self->search_buffer([T_END, 'page']);
- last if $offset != -1;
-
- #indicates EOF
- return undef unless $self->parse_more;
- }
-
- #remove the entire page from the buffer
- @page = splice(@$buffer, 0, $offset + 1);
-
- if (! token_compare($page[0], [T_START, 'page'])) {
- $self->dump($buffer);
- die "expected <page>; got " . token2text($page[0]);
- }
-
- my $data = $self->parse_page(\@page);
-
- return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR});
-}
-
-#outputs a nicely formatted representation of the tokens in the specified buffer
-sub dump {
- my $self = shift;
- my $buffer = shift || $$self{BUFFER};
- my $offset = 0;
-
- foreach my $i (0 .. $#$buffer) {
- my $token = $$buffer[$i];
-
- print STDERR "$i ";
-
- if ($$token[0] == T_START) {
- my $attr = $$token[2];
- print STDERR " " x $offset;
- print STDERR "START $$token[1] ";
-
- foreach my $key (sort(keys(%$attr))) {
- print STDERR "$key=\"$$attr{$key}\" ";
- }
-
- print STDERR "\n";
- $offset++;
- } elsif ($$token[0] == T_END) {
- $offset--;
- print STDERR " " x $offset;
- print STDERR "END $$token[1]\n";
- } elsif ($$token[0] == T_TEXT) {
- my $ref = $$token[1];
- print STDERR " " x $offset;
- print STDERR "TEXT ";
-
- my $len = length($$ref);
-
- if ($len < 50) {
- print STDERR "'$$ref'\n";
- } else {
- print STDERR "$len characters\n";
- }
- }
- }
-
- return 1;
-}
-
-sub sitename {
- my $self = shift;
- return $$self{HEAD}{sitename};
-}
-
-sub base {
- my $self = shift;
- return $$self{HEAD}{base};
-}
-
-sub generator {
- my $self = shift;
- return $$self{HEAD}{generator};
-}
-
-sub case {
- my $self = shift;
- return $$self{HEAD}{case};
-}
-
-sub namespaces {
- my $self = shift;
- return $$self{HEAD}{namespaces};
-}
-
-sub current_byte {
- my $self = shift;
- return $$self{BYTE};
-}
-
-#deprecated backwards-compatibility methods
-
-#replaced by next()
-sub page {
- my $self = shift;
- return $self->next(@_);
-}
-
-#private functions with OO interface
-sub open {
- my $self = shift;
- my $source = shift;
-
- if (ref($source) eq 'GLOB') {
- $$self{SOURCE} = $source;
- } elsif (! open($$self{SOURCE}, $source)) {
- die "could not open $source: $!";
- }
-
- binmode($$self{SOURCE}, ':utf8');
-
- return 1;
-}
-
-sub init {
- my $self = shift;
- my $offset;
- my @head;
-
- #parse more XML until the entire siteinfo section is in the buffer
- while(1) {
- die "could not init" unless $self->parse_more;
-
- $offset = $self->search_buffer([T_END, 'siteinfo']);
-
- last if $offset != -1;
- }
-
- #pull the siteinfo section out of the buffer
- @head = splice(@{$$self{BUFFER}}, 0, $offset + 1);
-
- $self->parse_head(\@head);
-
- return 1;
-}
-
-#feed data into expat and have it put more tokens onto the buffer
-sub parse_more {
- my ($self) = @_;
- my $buf;
-
- my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE});
-
- if (! defined($read)) {
- die "error during read: $!";
- } elsif ($read == 0) {
- $$self{FINISHED} = 1;
- $$self{EXPAT}->parse_done();
- return 0;
- }
-
- $$self{BYTE} += $read;
- $$self{EXPAT}->parse_more($buf);
-
- my $buflen = scalar(@{$$self{BUFFER}});
-
- die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless
- $buflen < $$self{BUF_LIMIT};
-
- return 1;
-}
-
-#searches through a buffer for a specified token
-sub search_buffer {
- my ($self, $search, $list) = @_;
-
- $list = $$self{BUFFER} unless defined $list;
-
- return -1 if scalar(@$list) == 0;
-
- foreach my $i (0 .. $#$list) {
- return $i if token_compare($$list[$i], $search);
- }
-
- return -1;
-}
-
-#this function is very frightening =)
-sub parse_head {
- my $self = shift;
- my $buffer = shift;
- my $state = 'start';
- my %data = (namespaces => []);
-
- for (my $i = 0; $i <= $#$buffer; $i++) {
- my $token = $$buffer[$i];
-
- if ($state eq 'start') {
- my $version;
- die "$i: expected <mediawiki> got " . token2text($token) unless
- token_compare($token, [T_START, 'mediawiki']);
-
- die "$i: version is a required attribute" unless
- defined($version = $$token[2]->{version});
-
- die "$i: version $version unsupported" unless $version eq '0.3';
-
- $token = $$buffer[++$i];
-
- die "$i: expected <siteinfo> got " . token2text($token) unless
- token_compare($token, [T_START, 'siteinfo']);
-
- $state = 'in_siteinfo';
- } elsif ($state eq 'in_siteinfo') {
- if (token_compare($token, [T_START, 'namespaces'])) {
- $state = 'in_namespaces';
- next;
- } elsif (token_compare($token, [T_END, 'siteinfo'])) {
- last;
- } elsif (token_compare($token, [T_START, 'sitename'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{sitename} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'sitename'])) {
- die "$i: expected </sitename> but got " . token2text($token);
- }
- } elsif (token_compare($token, [T_START, 'base'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{base} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'base'])) {
- $self->dump($buffer);
- die "$i: expected </base> but got " . token2text($token);
- }
-
- } elsif (token_compare($token, [T_START, 'generator'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{generator} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'generator'])) {
- $self->dump($buffer);
- die "$i: expected </generator> but got " . token2text($token);
- }
-
- } elsif (token_compare($token, [T_START, 'case'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
-					die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{case} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'case'])) {
- $self->dump($buffer);
- die "$i: expected </case> but got " . token2text($token);
- }
- }
-
- } elsif ($state eq 'in_namespaces') {
- my $key;
- my $name;
-
- if (token_compare($token, [T_END, 'namespaces'])) {
- $state = 'in_siteinfo';
- next;
- }
-
- if (! token_compare($token, [T_START, 'namespace'])) {
- die "$i: expected <namespace> or </namespaces>; got " . token2text($token);
- }
-
- die "$i: key is a required attribute" unless
- defined($key = $$token[2]->{key});
-
- $token = $$buffer[++$i];
-
- #the default namespace has no text associated with it
- if ($$token[0] == T_TEXT) {
- $name = ${$$token[1]};
- } elsif (token_compare($token, [T_END, 'namespace'])) {
- $name = '';
- $i--; #move back one for below
- } else {
- die "$i: should never happen";
- }
-
- push(@{$data{namespaces}}, [$key, $name]);
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'namespace'])) {
- $self->dump($buffer);
- die "$i: expected </namespace> but got " . token2text($token);
- }
-
- } else {
- die "$i: unknown state '$state'";
- }
- }
-
- $$self{HEAD} = \%data;
-
- #locate the anchor that indicates what looks like a link is really a
- #category assignment ([[foo]] vs [[Category:foo]])
- #fix for bug #16616
- foreach my $ns (@{$data{namespaces}}) {
- #namespace 14 is the category namespace
- if ($$ns[0] == 14) {
- $$self{CATEGORY_ANCHOR} = $$ns[1];
- last;
- }
- }
-
- if (! defined($$self{CATEGORY_ANCHOR})) {
- die "Could not locate category indicator in namespace definitions";
- }
-
- return 1;
-}
-
-#this function is very frightening =)
-sub parse_page {
- my $self = shift;
- my $buffer = shift;
- my %data;
- my $state = 'start';
-
- for (my $i = 0; $i <= $#$buffer; $i++) {
- my $token = $$buffer[$i];
-
- if ($state eq 'start') {
- if (! token_compare($token, [T_START, 'page'])) {
- $self->dump($buffer);
- die "$i: expected <page>; got " . token2text($token);
- }
-
- $state = 'in_page';
- } elsif ($state eq 'in_page') {
- if (token_compare($token, [T_START, 'revision'])) {
- $state = 'in_revision';
- next;
- } elsif (token_compare($token, [T_END, 'page'])) {
- last;
- } elsif (token_compare($token, [T_START, 'title'])) {
- $token = $$buffer[++$i];
-
- if (token_compare($token, [T_END, 'title'])) {
- $data{title} = '';
- next;
- }
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{title} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'title'])) {
- $self->dump($buffer);
- die "$i: expected </title>; got " . token2text($token);
- }
- } elsif (token_compare($token, [T_START, 'id'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{id} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'id'])) {
- $self->dump($buffer);
- die "$i: expected </id>; got " . token2text($token);
- }
- }
- } elsif ($state eq 'in_revision') {
- if (token_compare($token, [T_END, 'revision'])) {
-					#A comprehensive dump file has more than one
-					#<revision> section inside a <page> section;
-					#parsing one can cause uncontrolled stack
-					#growth, and only a single revision would be
-					#returned anyway - so when we detect one,
-					#die with a message
-
- #just peeking ahead, don't want to update
- #the index
- $token = $$buffer[$i + 1];
-
- if (token_compare($token, [T_START, 'revision'])) {
- die "unable to properly parse comprehensive dump files";
- }
-
- $state = 'in_page';
- next;
- } elsif (token_compare($token, [T_START, 'contributor'])) {
- $state = 'in_contributor';
- next;
- } elsif (token_compare($token, [T_START, 'id'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{revision_id} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'id'])) {
- $self->dump($buffer);
- die "$i: expected </id>; got " . token2text($token);
- }
-
- } elsif (token_compare($token, [T_START, 'timestamp'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{timestamp} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'timestamp'])) {
- $self->dump($buffer);
- die "$i: expected </timestamp>; got " . token2text($token);
- }
- } elsif (token_compare($token, [T_START, 'minor'])) {
- $data{minor} = 1;
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'minor'])) {
- $self->dump($buffer);
- die "$i: expected </minor>; got " . token2text($token);
- }
- } elsif (token_compare($token, [T_START, 'comment'])) {
- $token = $$buffer[++$i];
-
- #account for possible null-text
- if (token_compare($token, [T_END, 'comment'])) {
- $data{comment} = '';
- next;
- }
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{comment} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'comment'])) {
- $self->dump($buffer);
- die "$i: expected </comment>; got " . token2text($token);
- }
-
- } elsif (token_compare($token, [T_START, 'text'])) {
- my $token = $$buffer[++$i];
-
- if (token_compare($token, [T_END, 'text'])) {
- ${$data{text}} = '';
- next;
- } elsif ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{text} = $$token[1];
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'text'])) {
- $self->dump($buffer);
- die "$i: expected </text>; got " . token2text($token);
- }
-
- }
-
- } elsif ($state eq 'in_contributor') {
- if (token_compare($token, [T_END, 'contributor'])) {
- $state = 'in_revision';
- next;
- } elsif (token_compare($token, [T_START, 'username'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expecting TEXT; got " . token2text($token);
- }
-
- $data{username} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'username'])) {
- $self->dump($buffer);
- die "$i: expected </username>; got " . token2text($token);
- }
-
- } elsif (token_compare($token, [T_START, 'id'])) {
- $token = $$buffer[++$i];
-
- if ($$token[0] != T_TEXT) {
- $self->dump($buffer);
- die "$i: expecting TEXT; got " . token2text($token);
- }
-
- $data{userid} = ${$$token[1]};
-
- $token = $$buffer[++$i];
-
- if (! token_compare($token, [T_END, 'id'])) {
- $self->dump($buffer);
- die "$i: expecting </id>; got " . token2text($token);
- }
- }
- } else {
- die "unknown state: $state";
- }
- }
-
- $data{minor} = 0 unless defined($data{minor});
-
- return \%data;
-}
-
-#private functions without an OO interface
-sub token2text {
- my $token = shift;
-
- if ($$token[0] == T_START) {
- return "<$$token[1]>";
- } elsif ($$token[0] == T_END) {
- return "</$$token[1]>";
- } elsif ($$token[0] == T_TEXT) {
- return "!text_token!";
- } else {
- return "!unknown!";
- }
-}
-
-#this function is where the majority of time is spent in this software
-sub token_compare {
- my ($toke1, $toke2) = @_;
-
- foreach my $i (0 .. $#$toke2) {
- if ($$toke1[$i] ne $$toke2[$i]) {
- return 0;
- }
- }
-
- return 1;
-}
-
-sub start_handler {
- my ($p, $tag, %atts) = @_;
- my $self = $p->{state};
-
- push(@{$$self{BUFFER}}, [T_START, $tag, \%atts]);
-
- return 1;
-}
-
-sub end_handler {
- my ($p, $tag) = @_;
- my $self = $p->{state};
-
- push(@{$$self{BUFFER}}, [T_END, $tag]);
-
- return 1;
-}
-
-sub char_handler {
- my ($p, $chars) = @_;
- my $self = $p->{state};
- my $buffer = $$self{BUFFER};
-	my $current = $$buffer[$#$buffer];
-
-	if (! defined($current)) {
-		#skip any text not inside a container
-		return 1;
-	} elsif ($$current[0] == T_TEXT) {
-		${$$current[1]} .= $chars;
-	} elsif ($$current[0] == T_START) {
-		my $ignore_ws_only = 1;
-
-		#workaround for bug #16583 - a username consisting entirely
-		#of spaces is possible, at least via unicode, so force
-		#whitespace preservation for now
-		if ($$current[1] eq 'username') {
-			$ignore_ws_only = 0;
-		}
-
-		if (defined($$current[2]->{'xml:space'}) &&
-			($$current[2]->{'xml:space'} eq 'preserve')) {
- $ignore_ws_only = 0;
- }
-
- if ($ignore_ws_only) {
- #non-breaking spaces are not whitespace in XML
- return 1 if $chars =~ m/^[ \t\r\n]+$/m;
- }
-
- push(@$buffer, [T_TEXT, \$chars]);
- }
-
- return 1;
-}
-
-package Parse::MediaWikiDump::page;
-
-use strict;
-use warnings;
-
-sub new {
- my ($class, $data, $category_anchor, $case_setting) = @_;
- my $self = {};
-
- bless($self, $class);
-
- $$self{DATA} = $data;
- $$self{CACHE} = {};
- $$self{CATEGORY_ANCHOR} = $category_anchor;
-
- return $self;
-}
-
-sub namespace {
- my $self = shift;
-
- return $$self{CACHE}{namespace} if defined($$self{CACHE}{namespace});
-
- my $title = $$self{DATA}{title};
-
- if ($title =~ m/^([^:]+)\:/) {
- $$self{CACHE}{namespace} = $1;
- return $1;
- } else {
- $$self{CACHE}{namespace} = '';
- return '';
- }
-}
-
-sub categories {
- my $self = shift;
- my $anchor = $$self{CATEGORY_ANCHOR};
-
- return $$self{CACHE}{categories} if defined($$self{CACHE}{categories});
-
- my $text = $$self{DATA}{text};
- my @cats;
-
- while($$text =~ m/\[\[$anchor:\s*([^\]]+)\]\]/gi) {
- my $buf = $1;
-
- #deal with the pipe trick
- $buf =~ s/\|.*$//;
- push(@cats, $buf);
- }
-
- return undef if scalar(@cats) == 0;
-
- $$self{CACHE}{categories} = \@cats;
-
- return \@cats;
-}
-
-sub redirect {
- my $self = shift;
- my $text = $$self{DATA}{text};
-
- return $$self{CACHE}{redirect} if exists($$self{CACHE}{redirect});
-
- if ($$text =~ m/^#redirect\s*:?\s*\[\[([^\]]*)\]\]/i) {
- $$self{CACHE}{redirect} = $1;
- return $1;
- } else {
- $$self{CACHE}{redirect} = undef;
- return undef;
- }
-}
-
-sub title {
- my $self = shift;
- return $$self{DATA}{title};
-}
-
-sub id {
- my $self = shift;
- return $$self{DATA}{id};
-}
-
-sub revision_id {
- my $self = shift;
- return $$self{DATA}{revision_id};
-}
-
-sub timestamp {
- my $self = shift;
- return $$self{DATA}{timestamp};
-}
-
-sub username {
- my $self = shift;
- return $$self{DATA}{username};
-}
-
-sub userid {
- my $self = shift;
- return $$self{DATA}{userid};
-}
-
-sub minor {
- my $self = shift;
- return $$self{DATA}{minor};
-}
-
-sub text {
- my $self = shift;
- return $$self{DATA}{text};
-}
-
-package Parse::MediaWikiDump::Links;
-
-use strict;
-use warnings;
-
-sub new {
- my $class = shift;
- my $source = shift;
- my $self = {};
- $$self{BUFFER} = [];
-
- bless($self, $class);
-
- $self->open($source);
- $self->init;
-
- return $self;
-}
-
-sub next {
- my $self = shift;
- my $buffer = $$self{BUFFER};
- my $link;
-
- while(1) {
- if (defined($link = pop(@$buffer))) {
- last;
- }
-
- #signals end of input
- return undef unless $self->parse_more;
- }
-
- return Parse::MediaWikiDump::link->new($link);
-}
-
-#private functions with OO interface
-sub parse_more {
- my $self = shift;
- my $source = $$self{SOURCE};
- my $need_data = 1;
-
- while($need_data) {
- my $line = <$source>;
-
- last unless defined($line);
-
- while($line =~ m/\((\d+),(-?\d+),'(.*?)'\)[;,]/g) {
- push(@{$$self{BUFFER}}, [$1, $2, $3]);
- $need_data = 0;
- }
- }
-
-	#if we get here still needing data, we ran out of input
- if ($need_data) {
- return 0;
- }
-
- return 1;
-}
-
-sub open {
- my $self = shift;
- my $source = shift;
-
- if (ref($source) ne 'GLOB') {
- die "could not open $source: $!" unless
- open($$self{SOURCE}, $source);
- } else {
- $$self{SOURCE} = $source;
- }
-
- binmode($$self{SOURCE}, ':utf8');
-
- return 1;
-}
-
-sub init {
- my $self = shift;
- my $source = $$self{SOURCE};
- my $found = 0;
-
- while(<$source>) {
- if (m/^LOCK TABLES `pagelinks` WRITE;/) {
- $found = 1;
- last;
- }
- }
-
- die "not a MediaWiki link dump file" unless $found;
-}
-
-#deprecated backwards-compatibility methods
-
-#replaced by next()
-sub link {
- my $self = shift;
- $self->next(@_);
-}
-
-package Parse::MediaWikiDump::link;
-
-#you must pass in a fully populated link array reference
-sub new {
- my $class = shift;
- my $self = shift;
-
- bless($self, $class);
-
- return $self;
-}
-
-sub from {
- my $self = shift;
- return $$self[0];
-}
-
-sub namespace {
- my $self = shift;
- return $$self[1];
-}
-
-sub to {
- my $self = shift;
- return $$self[2];
-}
-
-
-1;
-
-__END__
-
-=head1 NAME
-
-Parse::MediaWikiDump - Tools to process MediaWiki dump files
-
-=head1 SYNOPSIS
-
- use Parse::MediaWikiDump;
-
- $source = 'dump_filename.ext';
- $source = \*FILEHANDLE;
-
- $pages = Parse::MediaWikiDump::Pages->new($source);
- $links = Parse::MediaWikiDump::Links->new($source);
-
- #get all the records from the dump files, one record at a time
- while(defined($page = $pages->next)) {
- print "title '", $page->title, "' id ", $page->id, "\n";
- }
-
- while(defined($link = $links->next)) {
- print "link from ", $link->from, " to ", $link->to, "\n";
- }
-
- #information about the page dump file
- $pages->sitename;
- $pages->base;
- $pages->generator;
- $pages->case;
- $pages->namespaces;
-
- #information about a page record
- $page->redirect;
- $page->categories;
- $page->title;
- $page->namespace;
- $page->id;
- $page->revision_id;
- $page->timestamp;
- $page->username;
- $page->userid;
- $page->minor;
- $page->text;
-
- #information about a link
- $link->from;
- $link->to;
- $link->namespace;
-
-=head1 DESCRIPTION
-
-This module provides the tools needed to process the contents of various
-MediaWiki dump files.
-
-=head1 USAGE
-
-To use this module you must create an instance of a parser for the type of
-dump file you are trying to parse. The current parsers are:
-
-=over 4
-
-=item Parse::MediaWikiDump::Pages
-
-Parse the contents of the page archive.
-
-=item Parse::MediaWikiDump::Links
-
-Parse the contents of the links dump file.
-
-=back
-
-=head2 General
-
-Both parsers require an argument to new that is a location of source data
-to parse; this argument can be either a filename or a reference to an already
-open filehandle. This entire software suite will die() upon errors in the file,
-inconsistencies on the stack, etc. If...
[truncated message content] |
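
The removed Parse::MediaWikiDump code above revolves around the token buffer described in its opening comments: every XML event is stored as a small array reference whose first element is one of the T_START/T_END/T_TEXT type constants. A minimal trace of that layout for a fragment such as <title>Foo</title>, using only definitions from the removed file:

# Buffer contents for the fragment "<title>Foo</title>", with the
# constants from the removed code (T_START == 1, T_END == 2, T_TEXT == 3):
my @buffer = (
	[1, 'title', {}],   # start handler pushes: type, tag name, attribute hash ref
	[3, \'Foo'],        # char handler pushes: type, reference to the accumulated text
	[2, 'title'],       # end handler pushes: type, tag name
);
# search_buffer() then scans for a token such as [2, 'page'] (i.e. </page>)
# to decide that a complete article is sitting in the buffer.

next() splices everything up to that closing token out of the buffer and hands it to parse_page(), which is what the state machine above walks through.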
|
From: <rot...@us...> - 2006-09-01 11:27:29
|
Revision: 15
http://svn.sourceforge.net/perlwikibot/?rev=15&view=rev
Author: rotemliss
Date: 2006-09-01 04:27:14 -0700 (Fri, 01 Sep 2006)
Log Message:
-----------
Fixing typo.
Modified Paths:
--------------
trunk/bot.pl
trunk/includes/functions.pm
trunk/old/delete.pl
trunk/old/move.pl
trunk/old/refresh.pl
trunk/old/replace.pl
Modified: trunk/bot.pl
===================================================================
--- trunk/bot.pl 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/bot.pl 2006-09-01 11:27:14 UTC (rev 15)
@@ -4,7 +4,7 @@
use warnings;
use strict;
-# Libaries
+# Libraries
use includes::functions;
# Counters
Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/includes/functions.pm 2006-09-01 11:27:14 UTC (rev 15)
@@ -2,7 +2,7 @@
use warnings;
use strict;
-# Libaries
+# Libraries
use LWP;
use HTTP::Cookies;
use includes::mwdump;
Modified: trunk/old/delete.pl
===================================================================
--- trunk/old/delete.pl 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/old/delete.pl 2006-09-01 11:27:14 UTC (rev 15)
@@ -3,7 +3,7 @@
# Code style
use warnings;
-# Libaries
+# Libraries
use Time::Local;
use functions;
use configure;
Modified: trunk/old/move.pl
===================================================================
--- trunk/old/move.pl 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/old/move.pl 2006-09-01 11:27:14 UTC (rev 15)
@@ -3,7 +3,7 @@
# Code style
use warnings;
-# Libaries
+# Libraries
use functions;
use configure;
Modified: trunk/old/refresh.pl
===================================================================
--- trunk/old/refresh.pl 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/old/refresh.pl 2006-09-01 11:27:14 UTC (rev 15)
@@ -3,7 +3,7 @@
# Code style
use warnings;
-# Libaries
+# Libraries
use mwdump;
use functions;
use configure;
Modified: trunk/old/replace.pl
===================================================================
--- trunk/old/replace.pl 2006-08-31 14:09:30 UTC (rev 14)
+++ trunk/old/replace.pl 2006-09-01 11:27:14 UTC (rev 15)
@@ -3,7 +3,7 @@
# Code style
use warnings;
-# Libaries
+# Libraries
use functions;
use configure;
|