[PerlWikiBot] SF.net SVN: perlwikibot: [16] trunk/includes
From: <rot...@us...> - 2006-09-01 16:04:39
Revision: 16
http://svn.sourceforge.net/perlwikibot/?rev=16&view=rev
Author: rotemliss
Date: 2006-09-01 09:04:29 -0700 (Fri, 01 Sep 2006)
Log Message:
-----------
Moving the dump module to a better name, and moving the HTTP-related functions to a new HTTP library.
Modified Paths:
--------------
trunk/includes/functions.pm
Added Paths:
-----------
trunk/includes/dump.pm
trunk/includes/http.pm
Removed Paths:
-------------
trunk/includes/mwdump.pm
Copied: trunk/includes/dump.pm (from rev 6, trunk/includes/mwdump.pm)
===================================================================
--- trunk/includes/dump.pm (rev 0)
+++ trunk/includes/dump.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -0,0 +1,1394 @@
+our $VERSION = '0.31';
+#the POD is at the end of this file
+#avoid shift() - it is computationally more expensive than pop
+#and shifting values for subroutine input should be avoided in
+#any subroutines that get called often, like the handlers
+
+package Parse::MediaWikiDump::Pages;
+
+#This parser works by placing all of the start, text, and end events into
+#a buffer as they come out of XML::Parser. On each call to page() the function
+#checks for a complete article in the buffer and calls for XML::Parser to add
+#more tokens if a complete article is not found. Once a complete article is
+#found it is removed from the buffer, parsed, and an instance of the page
+#object is returned.
+
+use strict;
+use warnings;
+use XML::Parser;
+
+#tokens in the buffer are an array ref with the 0th element specifying
+#its type; these are the constants for those types.
+use constant T_START => 1;
+use constant T_END => 2;
+use constant T_TEXT => 3;
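+#e.g. [T_START, 'page', \%attributes], [T_END, 'page'] or [T_TEXT, \$chars]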
+
+sub new {
+ my $class = shift;
+ my $source = shift;
+ my $self = {};
+
+ bless($self, $class);
+
+ $$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8');
+ $$self{PARSER}->setHandlers('Start', \&start_handler,
+ 'End', \&end_handler,
+ 'Char', \&char_handler);
+ $$self{EXPAT} = $$self{PARSER}->parse_start(state => $self);
+ $$self{BUFFER} = [];
+ $$self{CHUNK_SIZE} = 32768;
+ $$self{BUF_LIMIT} = 10000;
+ $$self{BYTE} = 0;
+
+ $self->open($source);
+ $self->init;
+
+ return $self;
+}
+
+sub next {
+ my $self = shift;
+ my $buffer = $$self{BUFFER};
+ my $offset;
+ my @page;
+
+ #look through the contents of our buffer for a complete article; fill
+ #the buffer with more data if an entire article is not there
+ while(1) {
+ $offset = $self->search_buffer([T_END, 'page']);
+ last if $offset != -1;
+
+ #indicates EOF
+ return undef unless $self->parse_more;
+ }
+
+ #remove the entire page from the buffer
+ @page = splice(@$buffer, 0, $offset + 1);
+
+ if (! token_compare($page[0], [T_START, 'page'])) {
+ $self->dump($buffer);
+ die "expected <page>; got " . token2text($page[0]);
+ }
+
+ my $data = $self->parse_page(\@page);
+
+ return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR});
+}
+
+#outputs a nicely formatted representation of the tokens on the buffer specified
+sub dump {
+ my $self = shift;
+ my $buffer = shift || $$self{BUFFER};
+ my $offset = 0;
+
+ foreach my $i (0 .. $#$buffer) {
+ my $token = $$buffer[$i];
+
+ print STDERR "$i ";
+
+ if ($$token[0] == T_START) {
+ my $attr = $$token[2];
+ print STDERR " " x $offset;
+ print STDERR "START $$token[1] ";
+
+ foreach my $key (sort(keys(%$attr))) {
+ print STDERR "$key=\"$$attr{$key}\" ";
+ }
+
+ print STDERR "\n";
+ $offset++;
+ } elsif ($$token[0] == T_END) {
+ $offset--;
+ print STDERR " " x $offset;
+ print STDERR "END $$token[1]\n";
+ } elsif ($$token[0] == T_TEXT) {
+ my $ref = $$token[1];
+ print STDERR " " x $offset;
+ print STDERR "TEXT ";
+
+ my $len = length($$ref);
+
+ if ($len < 50) {
+ print STDERR "'$$ref'\n";
+ } else {
+ print STDERR "$len characters\n";
+ }
+ }
+ }
+
+ return 1;
+}
+
+sub sitename {
+ my $self = shift;
+ return $$self{HEAD}{sitename};
+}
+
+sub base {
+ my $self = shift;
+ return $$self{HEAD}{base};
+}
+
+sub generator {
+ my $self = shift;
+ return $$self{HEAD}{generator};
+}
+
+sub case {
+ my $self = shift;
+ return $$self{HEAD}{case};
+}
+
+sub namespaces {
+ my $self = shift;
+ return $$self{HEAD}{namespaces};
+}
+
+sub current_byte {
+ my $self = shift;
+ return $$self{BYTE};
+}
+
+#deprecated backwards-compatibility methods
+
+#replaced by next()
+sub page {
+ my $self = shift;
+ return $self->next(@_);
+}
+
+#private functions with OO interface
+sub open {
+ my $self = shift;
+ my $source = shift;
+
+ if (ref($source) eq 'GLOB') {
+ $$self{SOURCE} = $source;
+ } elsif (! open($$self{SOURCE}, $source)) {
+ die "could not open $source: $!";
+ }
+
+ binmode($$self{SOURCE}, ':utf8');
+
+ return 1;
+}
+
+sub init {
+ my $self = shift;
+ my $offset;
+ my @head;
+
+ #parse more XML until the entire siteinfo section is in the buffer
+ while(1) {
+ die "could not init" unless $self->parse_more;
+
+ $offset = $self->search_buffer([T_END, 'siteinfo']);
+
+ last if $offset != -1;
+ }
+
+ #pull the siteinfo section out of the buffer
+ @head = splice(@{$$self{BUFFER}}, 0, $offset + 1);
+
+ $self->parse_head(\@head);
+
+ return 1;
+}
+
+#feed data into expat and have it put more tokens onto the buffer
+sub parse_more {
+ my ($self) = @_;
+ my $buf;
+
+ my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE});
+
+ if (! defined($read)) {
+ die "error during read: $!";
+ } elsif ($read == 0) {
+ $$self{FINISHED} = 1;
+ $$self{EXPAT}->parse_done();
+ return 0;
+ }
+
+ $$self{BYTE} += $read;
+ $$self{EXPAT}->parse_more($buf);
+
+ my $buflen = scalar(@{$$self{BUFFER}});
+
+ die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless
+ $buflen < $$self{BUF_LIMIT};
+
+ return 1;
+}
+
+#searches through a buffer for a specified token
+sub search_buffer {
+ my ($self, $search, $list) = @_;
+
+ $list = $$self{BUFFER} unless defined $list;
+
+ return -1 if scalar(@$list) == 0;
+
+ foreach my $i (0 .. $#$list) {
+ return $i if token_compare($$list[$i], $search);
+ }
+
+ return -1;
+}
+
+#this function is very frightening =)
+sub parse_head {
+ my $self = shift;
+ my $buffer = shift;
+ my $state = 'start';
+ my %data = (namespaces => []);
+
+ for (my $i = 0; $i <= $#$buffer; $i++) {
+ my $token = $$buffer[$i];
+
+ if ($state eq 'start') {
+ my $version;
+ die "$i: expected <mediawiki> got " . token2text($token) unless
+ token_compare($token, [T_START, 'mediawiki']);
+
+ die "$i: version is a required attribute" unless
+ defined($version = $$token[2]->{version});
+
+ die "$i: version $version unsupported" unless $version eq '0.3';
+
+ $token = $$buffer[++$i];
+
+ die "$i: expected <siteinfo> got " . token2text($token) unless
+ token_compare($token, [T_START, 'siteinfo']);
+
+ $state = 'in_siteinfo';
+ } elsif ($state eq 'in_siteinfo') {
+ if (token_compare($token, [T_START, 'namespaces'])) {
+ $state = 'in_namespaces';
+ next;
+ } elsif (token_compare($token, [T_END, 'siteinfo'])) {
+ last;
+ } elsif (token_compare($token, [T_START, 'sitename'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{sitename} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'sitename'])) {
+ die "$i: expected </sitename> but got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'base'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{base} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'base'])) {
+ $self->dump($buffer);
+ die "$i: expected </base> but got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'generator'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT but got " . token2text($token);
+ }
+
+ $data{generator} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'generator'])) {
+ $self->dump($buffer);
+ die "$i: expected </generator> but got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'case'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected </case> but got " . token2text($token);
+ }
+
+ $data{case} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'case'])) {
+ $self->dump($buffer);
+ die "$i: expected </case> but got " . token2text($token);
+ }
+ }
+
+ } elsif ($state eq 'in_namespaces') {
+ my $key;
+ my $name;
+
+ if (token_compare($token, [T_END, 'namespaces'])) {
+ $state = 'in_siteinfo';
+ next;
+ }
+
+ if (! token_compare($token, [T_START, 'namespace'])) {
+ die "$i: expected <namespace> or </namespaces>; got " . token2text($token);
+ }
+
+ die "$i: key is a required attribute" unless
+ defined($key = $$token[2]->{key});
+
+ $token = $$buffer[++$i];
+
+ #the default namespace has no text associated with it
+ if ($$token[0] == T_TEXT) {
+ $name = ${$$token[1]};
+ } elsif (token_compare($token, [T_END, 'namespace'])) {
+ $name = '';
+ $i--; #move back one for below
+ } else {
+ die "$i: should never happen";
+ }
+
+ push(@{$data{namespaces}}, [$key, $name]);
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'namespace'])) {
+ $self->dump($buffer);
+ die "$i: expected </namespace> but got " . token2text($token);
+ }
+
+ } else {
+ die "$i: unknown state '$state'";
+ }
+ }
+
+ $$self{HEAD} = \%data;
+
+ #locate the anchor that indicates what looks like a link is really a
+ #category assignment ([[foo]] vs [[Category:foo]])
+ #fix for bug #16616
+ foreach my $ns (@{$data{namespaces}}) {
+ #namespace 14 is the category namespace
+ if ($$ns[0] == 14) {
+ $$self{CATEGORY_ANCHOR} = $$ns[1];
+ last;
+ }
+ }
+
+ if (! defined($$self{CATEGORY_ANCHOR})) {
+ die "Could not locate category indicator in namespace definitions";
+ }
+
+ return 1;
+}
+
+#this function is very frightening =)
+sub parse_page {
+ my $self = shift;
+ my $buffer = shift;
+ my %data;
+ my $state = 'start';
+
+ for (my $i = 0; $i <= $#$buffer; $i++) {
+ my $token = $$buffer[$i];
+
+ if ($state eq 'start') {
+ if (! token_compare($token, [T_START, 'page'])) {
+ $self->dump($buffer);
+ die "$i: expected <page>; got " . token2text($token);
+ }
+
+ $state = 'in_page';
+ } elsif ($state eq 'in_page') {
+ if (token_compare($token, [T_START, 'revision'])) {
+ $state = 'in_revision';
+ next;
+ } elsif (token_compare($token, [T_END, 'page'])) {
+ last;
+ } elsif (token_compare($token, [T_START, 'title'])) {
+ $token = $$buffer[++$i];
+
+ if (token_compare($token, [T_END, 'title'])) {
+ $data{title} = '';
+ next;
+ }
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{title} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'title'])) {
+ $self->dump($buffer);
+ die "$i: expected </title>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{id} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expected </id>; got " . token2text($token);
+ }
+ }
+ } elsif ($state eq 'in_revision') {
+ if (token_compare($token, [T_END, 'revision'])) {
+ #If a comprehensive dump file is parsed
+ #it can cause uncontrolled stack growth and the
+ #parser only returns one revision out of
+ #all revisions - if we run into a
+ #comprehensive dump file, indicated by more
+ #than one <revision> section inside a <page>
+ #section then die with a message
+
+ #just peeking ahead, don't want to update
+ #the index
+ $token = $$buffer[$i + 1];
+
+ if (token_compare($token, [T_START, 'revision'])) {
+ die "unable to properly parse comprehensive dump files";
+ }
+
+ $state = 'in_page';
+ next;
+ } elsif (token_compare($token, [T_START, 'contributor'])) {
+ $state = 'in_contributor';
+ next;
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{revision_id} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expected </id>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'timestamp'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{timestamp} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'timestamp'])) {
+ $self->dump($buffer);
+ die "$i: expected </timestamp>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'minor'])) {
+ $data{minor} = 1;
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'minor'])) {
+ $self->dump($buffer);
+ die "$i: expected </minor>; got " . token2text($token);
+ }
+ } elsif (token_compare($token, [T_START, 'comment'])) {
+ $token = $$buffer[++$i];
+
+ #account for possible null-text
+ if (token_compare($token, [T_END, 'comment'])) {
+ $data{comment} = '';
+ next;
+ }
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{comment} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'comment'])) {
+ $self->dump($buffer);
+ die "$i: expected </comment>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'text'])) {
+ my $token = $$buffer[++$i];
+
+ if (token_compare($token, [T_END, 'text'])) {
+ ${$data{text}} = '';
+ next;
+ } elsif ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expected TEXT; got " . token2text($token);
+ }
+
+ $data{text} = $$token[1];
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'text'])) {
+ $self->dump($buffer);
+ die "$i: expected </text>; got " . token2text($token);
+ }
+
+ }
+
+ } elsif ($state eq 'in_contributor') {
+ if (token_compare($token, [T_END, 'contributor'])) {
+ $state = 'in_revision';
+ next;
+ } elsif (token_compare($token, [T_START, 'username'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expecting TEXT; got " . token2text($token);
+ }
+
+ $data{username} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'username'])) {
+ $self->dump($buffer);
+ die "$i: expected </username>; got " . token2text($token);
+ }
+
+ } elsif (token_compare($token, [T_START, 'id'])) {
+ $token = $$buffer[++$i];
+
+ if ($$token[0] != T_TEXT) {
+ $self->dump($buffer);
+ die "$i: expecting TEXT; got " . token2text($token);
+ }
+
+ $data{userid} = ${$$token[1]};
+
+ $token = $$buffer[++$i];
+
+ if (! token_compare($token, [T_END, 'id'])) {
+ $self->dump($buffer);
+ die "$i: expecting </id>; got " . token2text($token);
+ }
+ }
+ } else {
+ die "unknown state: $state";
+ }
+ }
+
+ $data{minor} = 0 unless defined($data{minor});
+
+ return \%data;
+}
+
+#private functions without an OO interface
+sub token2text {
+ my $token = shift;
+
+ if ($$token[0] == T_START) {
+ return "<$$token[1]>";
+ } elsif ($$token[0] == T_END) {
+ return "</$$token[1]>";
+ } elsif ($$token[0] == T_TEXT) {
+ return "!text_token!";
+ } else {
+ return "!unknown!";
+ }
+}
+
+#this function is where the majority of time is spent in this software
+sub token_compare {
+ my ($toke1, $toke2) = @_;
+
+ foreach my $i (0 .. $#$toke2) {
+ if ($$toke1[$i] ne $$toke2[$i]) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+sub start_handler {
+ my ($p, $tag, %atts) = @_;
+ my $self = $p->{state};
+
+ push(@{$$self{BUFFER}}, [T_START, $tag, \%atts]);
+
+ return 1;
+}
+
+sub end_handler {
+ my ($p, $tag) = @_;
+ my $self = $p->{state};
+
+ push(@{$$self{BUFFER}}, [T_END, $tag]);
+
+ return 1;
+}
+
+sub char_handler {
+ my ($p, $chars) = @_;
+ my $self = $p->{state};
+ my $buffer = $$self{BUFFER};
+ my $current = $$buffer[$#$buffer];
+
+ if (! defined($current)) {
+ #skip any text not inside a container
+ return 1;
+ } elsif ($$current[0] == T_TEXT) {
+ ${$$current[1]} .= $chars;
+ } elsif ($$current[0] == T_START) {
+ my $ignore_ws_only = 1;
+
+ #workaround for bug #16583 - a username may consist entirely of
+ #spaces, at least via Unicode; force whitespace preservation
+ #for now
+ if ($$current[1] eq 'username') {
+ $ignore_ws_only = 0;
+ }
+
+ if (defined($$current[2]->{'xml:space'}) &&
+ ($$current[2]->{'xml:space'} eq 'preserve')) {
+ $ignore_ws_only = 0;
+ }
+
+ if ($ignore_ws_only) {
+ #non-breaking spaces are not whitespace in XML
+ return 1 if $chars =~ m/^[ \t\r\n]+$/m;
+ }
+
+ push(@$buffer, [T_TEXT, \$chars]);
+ }
+
+ return 1;
+}
+
+package Parse::MediaWikiDump::page;
+
+use strict;
+use warnings;
+
+sub new {
+ my ($class, $data, $category_anchor, $case_setting) = @_;
+ my $self = {};
+
+ bless($self, $class);
+
+ $$self{DATA} = $data;
+ $$self{CACHE} = {};
+ $$self{CATEGORY_ANCHOR} = $category_anchor;
+
+ return $self;
+}
+
+sub namespace {
+ my $self = shift;
+
+ return $$self{CACHE}{namespace} if defined($$self{CACHE}{namespace});
+
+ my $title = $$self{DATA}{title};
+
+ if ($title =~ m/^([^:]+)\:/) {
+ $$self{CACHE}{namespace} = $1;
+ return $1;
+ } else {
+ $$self{CACHE}{namespace} = '';
+ return '';
+ }
+}
+
+sub categories {
+ my $self = shift;
+ my $anchor = $$self{CATEGORY_ANCHOR};
+
+ return $$self{CACHE}{categories} if defined($$self{CACHE}{categories});
+
+ my $text = $$self{DATA}{text};
+ my @cats;
+
+ while($$text =~ m/\[\[$anchor:\s*([^\]]+)\]\]/gi) {
+ my $buf = $1;
+
+ #deal with the pipe trick
+ $buf =~ s/\|.*$//;
+ push(@cats, $buf);
+ }
+
+ return undef if scalar(@cats) == 0;
+
+ $$self{CACHE}{categories} = \@cats;
+
+ return \@cats;
+}
+
+sub redirect {
+ my $self = shift;
+ my $text = $$self{DATA}{text};
+
+ return $$self{CACHE}{redirect} if exists($$self{CACHE}{redirect});
+
+ if ($$text =~ m/^#redirect\s*:?\s*\[\[([^\]]*)\]\]/i) {
+ $$self{CACHE}{redirect} = $1;
+ return $1;
+ } else {
+ $$self{CACHE}{redirect} = undef;
+ return undef;
+ }
+}
+
+sub title {
+ my $self = shift;
+ return $$self{DATA}{title};
+}
+
+sub id {
+ my $self = shift;
+ return $$self{DATA}{id};
+}
+
+sub revision_id {
+ my $self = shift;
+ return $$self{DATA}{revision_id};
+}
+
+sub timestamp {
+ my $self = shift;
+ return $$self{DATA}{timestamp};
+}
+
+sub username {
+ my $self = shift;
+ return $$self{DATA}{username};
+}
+
+sub userid {
+ my $self = shift;
+ return $$self{DATA}{userid};
+}
+
+sub minor {
+ my $self = shift;
+ return $$self{DATA}{minor};
+}
+
+sub text {
+ my $self = shift;
+ return $$self{DATA}{text};
+}
+
+package Parse::MediaWikiDump::Links;
+
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ my $source = shift;
+ my $self = {};
+ $$self{BUFFER} = [];
+
+ bless($self, $class);
+
+ $self->open($source);
+ $self->init;
+
+ return $self;
+}
+
+sub next {
+ my $self = shift;
+ my $buffer = $$self{BUFFER};
+ my $link;
+
+ while(1) {
+ if (defined($link = pop(@$buffer))) {
+ last;
+ }
+
+ #signals end of input
+ return undef unless $self->parse_more;
+ }
+
+ return Parse::MediaWikiDump::link->new($link);
+}
+
+#private functions with OO interface
+sub parse_more {
+ my $self = shift;
+ my $source = $$self{SOURCE};
+ my $need_data = 1;
+
+ while($need_data) {
+ my $line = <$source>;
+
+ last unless defined($line);
+
+ while($line =~ m/\((\d+),(-?\d+),'(.*?)'\)[;,]/g) {
+ push(@{$$self{BUFFER}}, [$1, $2, $3]);
+ $need_data = 0;
+ }
+ }
+
+ #if we still need data and we are here it means we ran out of input
+ if ($need_data) {
+ return 0;
+ }
+
+ return 1;
+}
+
+sub open {
+ my $self = shift;
+ my $source = shift;
+
+ if (ref($source) ne 'GLOB') {
+ die "could not open $source: $!" unless
+ open($$self{SOURCE}, $source);
+ } else {
+ $$self{SOURCE} = $source;
+ }
+
+ binmode($$self{SOURCE}, ':utf8');
+
+ return 1;
+}
+
+sub init {
+ my $self = shift;
+ my $source = $$self{SOURCE};
+ my $found = 0;
+
+ while(<$source>) {
+ if (m/^LOCK TABLES `pagelinks` WRITE;/) {
+ $found = 1;
+ last;
+ }
+ }
+
+ die "not a MediaWiki link dump file" unless $found;
+}
+
+#deprecated backwards-compatibility methods
+
+#replaced by next()
+sub link {
+ my $self = shift;
+ $self->next(@_);
+}
+
+package Parse::MediaWikiDump::link;
+
+#you must pass in a fully populated link array reference
+sub new {
+ my $class = shift;
+ my $self = shift;
+
+ bless($self, $class);
+
+ return $self;
+}
+
+sub from {
+ my $self = shift;
+ return $$self[0];
+}
+
+sub namespace {
+ my $self = shift;
+ return $$self[1];
+}
+
+sub to {
+ my $self = shift;
+ return $$self[2];
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Parse::MediaWikiDump - Tools to process MediaWiki dump files
+
+=head1 SYNOPSIS
+
+ use Parse::MediaWikiDump;
+
+ $source = 'dump_filename.ext';
+ $source = \*FILEHANDLE;
+
+ $pages = Parse::MediaWikiDump::Pages->new($source);
+ $links = Parse::MediaWikiDump::Links->new($source);
+
+ #get all the records from the dump files, one record at a time
+ while(defined($page = $pages->next)) {
+ print "title '", $page->title, "' id ", $page->id, "\n";
+ }
+
+ while(defined($link = $links->next)) {
+ print "link from ", $link->from, " to ", $link->to, "\n";
+ }
+
+ #information about the page dump file
+ $pages->sitename;
+ $pages->base;
+ $pages->generator;
+ $pages->case;
+ $pages->namespaces;
+
+ #information about a page record
+ $page->redirect;
+ $page->categories;
+ $page->title;
+ $page->namespace;
+ $page->id;
+ $page->revision_id;
+ $page->timestamp;
+ $page->username;
+ $page->userid;
+ $page->minor;
+ $page->text;
+
+ #information about a link
+ $link->from;
+ $link->to;
+ $link->namespace;
+
+=head1 DESCRIPTION
+
+This module provides the tools needed to process the contents of various
+MediaWiki dump files.
+
+=head1 USAGE
+
+To use this module you must create an instance of a parser for the type of
+dump file you are trying to parse. The current parsers are:
+
+=over 4
+
+=item Parse::MediaWikiDump::Pages
+
+Parse the contents of the page archive.
+
+=item Parse::MediaWikiDump::Links
+
+Parse the contents of the links dump file.
+
+=back
+
+=head2 General
+
+Both parsers require an argument to new that specifies the source data
+to parse; this argument can be either a filename or a reference to an already
+open filehandle. This entire software suite will die() upon errors in the file,
+inconsistencies on the stack, etc. If this concerns you, wrap the portion of
+your code that uses these calls with eval().
+
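+For example, here is a minimal sketch that traps those errors with an eval
+block (the filename is only a placeholder):
+
+  use Parse::MediaWikiDump;
+
+  my $pages = eval { Parse::MediaWikiDump::Pages->new('pages.xml') };
+  die "could not create parser: $@" unless defined($pages);
+
+  while (1) {
+      my $page = eval { $pages->next };
+      if ($@) {
+          warn "parse error, stopping early: $@";
+          last;
+      }
+      last unless defined($page);
+      #...process $page here...
+  }
+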
+=head2 Parse::MediaWikiDump::Pages
+
+It is possible to create a Parse::MediaWikiDump::Pages object two ways:
+
+=over 4
+
+=item $pages = Parse::MediaWikiDump::Pages->new($filename);
+
+=item $pages = Parse::MediaWikiDump::Pages->new(\*FH);
+
+=back
+
+After creation the following methods are available:
+
+=over 4
+
+=item $pages->next
+
+Returns the next available record from the dump file if it is available,
+otherwise returns undef. Records returned are instances of
+Parse::MediaWikiDump::page; see below for information on those objects.
+
+=item $pages->sitename
+
+Returns the plain-text name of the instance the dump is from.
+
+=item $pages->base
+
+Returns the base url to the website of the instance.
+
+=item $pages->generator
+
+Returns the version of the software that generated the file.
+
+=item $pages->case
+
+Returns the case-sensitivity configuration of the instance.
+
+=item $pages->namespaces
+
+Returns an array reference to the list of namespaces in the instance. Each
+namespace is stored as an array reference which has two items: the first is the
+namespace number and the second is the namespace name. In the case of namespace
+0 the text stored for the name is ''.
+
+=back
+
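+For example, a short sketch that prints the namespace table of a dump
+(assuming a $pages object created as above):
+
+  foreach my $ns (@{$pages->namespaces}) {
+      my ($number, $name) = @$ns;
+      $name = '(main)' if $name eq '';
+      print "$number\t$name\n";
+  }
+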
+=head3 Parse::MediaWikiDump::page
+
+The Parse::MediaWikiDump::page object represents a distinct MediaWiki page,
+article, module, what have you. These objects are returned by the next() method
+of a Parse::MediaWikiDump::Pages instance. The scalar returned is a reference
+to a hash that contains all the data of the page in a straightforward manner.
+While it is possible to access this hash directly, and doing so involves less
+overhead than using the methods below, it is beyond the scope of the interface
+and is undocumented.
+
+Some of the methods below, such as namespace, redirect, and categories, require
+additional processing. In these cases the returned result is cached and stored
+inside the object so the processing does not have to be redone. This is
+transparent to you; you don't have to worry about optimizing calls to these
+methods to limit processing overhead.
+
+The following methods are available:
+
+=over 4
+
+=item $page->id
+
+=item $page->title
+
+=item $page->namespace
+
+Returns an empty string ('') for the main namespace or a string containing
+the name of the namespace.
+
+=item $page->text
+
+A reference to a scalar containing the plaintext of the page.
+
+=item $page->redirect
+
+The plain text name of the article redirected to or undef if the page is not
+a redirect.
+
+=item $page->categories
+
+Returns a reference to an array that contains a list of categories or undef
+if there are no categories. This method does not understand templates and may
+not return all the categories the article actually belongs in.
+
+=item $page->revision_id
+
+=item $page->timestamp
+
+=item $page->username
+
+=item $page->userid
+
+=item $page->minor
+
+=back
+
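+As a brief sketch of how the accessors above fit together (assuming a $page
+object returned by next()):
+
+  my $text = $page->text; #reference to a scalar
+  print "page '", $page->title, "' has ", length($$text), " characters\n";
+
+  if (defined(my $target = $page->redirect)) {
+      print "redirects to '$target'\n";
+  } elsif (defined(my $cats = $page->categories)) {
+      print "categories: ", join(', ', @$cats), "\n";
+  }
+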
+=head2 Parse::MediaWikiDump::Links
+
+This module also takes either a filename or a reference to an already open
+filehandle. For example:
+
+ $links = Parse::MediaWikiDump::Links->new($filename);
+ $links = Parse::MediaWikiDump::Links->new(\*FH);
+
+It is then possible to extract the links one at a time using the next
+method, which returns an instance of Parse::MediaWikiDump::link or undef
+when there is no more data. For instance:
+
+ while(defined($link = $links->next)) {
+ print 'from ', $link->from, ' to ', $link->to, "\n";
+ }
+
+=head3 Parse::MediaWikiDump::link
+
+Instances of this class are returned by the next method of a
+Parse::MediaWikiDump::Links instance. The following methods are available:
+
+=over 4
+
+=item $link->from
+
+The numerical id of the page the link was found on.
+
+=item $link->to
+
+The plain text name the link is to, minus the namespace.
+
+=item $link->namespace
+
+The numerical id of the namespace the link points to.
+
+=back
+
+=head1 EXAMPLES
+
+=head2 Extract the article text for a given title
+
+ #!/usr/bin/perl
+
+ use strict;
+ use warnings;
+ use Parse::MediaWikiDump;
+
+ my $file = shift(@ARGV) or die "must specify a MediaWiki dump of the current pages";
+ my $title = shift(@ARGV) or die "must specify an article title";
+ my $dump = Parse::MediaWikiDump::Pages->new($file);
+
+ binmode(STDOUT, ':utf8');
+ binmode(STDERR, ':utf8');
+
+ #this is the only currently known value but there could be more in the future
+ if ($dump->case ne 'first-letter') {
+ die "unable to handle any case setting besides 'first-letter'";
+ }
+
+ $title = case_fixer($title);
+
+ while(my $page = $dump->next) {
+ if ($page->title eq $title) {
+ print STDERR "Located text for $title\n";
+ my $text = $page->text;
+ print $$text;
+ exit 0;
+ }
+ }
+
+ print STDERR "Unable to find article text for $title\n";
+ exit 1;
+
+ #removes case sensitivity from the very first letter of the title
+ #but not from the optional namespace name
+ sub case_fixer {
+ my $title = shift;
+
+ #check for namespace
+ if ($title =~ /^(.+?):(.+)/) {
+ $title = $1 . ':' . ucfirst($2);
+ } else {
+ $title = ucfirst($title);
+ }
+
+ return $title;
+ }
+
+=head2 Scan the dump file for double redirects
+
+ #!/usr/bin/perl
+
+ #progress information goes to STDERR, a list of double redirects found
+ #goes to STDOUT
+
+ binmode(STDOUT, ":utf8");
+ binmode(STDERR, ":utf8");
+
+ use strict;
+ use warnings;
+ use Parse::MediaWikiDump;
+
+ my $file = shift(@ARGV);
+ my $pages;
+ my $page;
+ my %redirs;
+ my $artcount = 0;
+ my $file_size;
+ my $start = time;
+
+ if (defined($file)) {
+ $file_size = (stat($file))[7];
+ $pages = Parse::MediaWikiDump::Pages->new($file);
+ } else {
+ print STDERR "No file specified, using standard input\n";
+ $pages = Parse::MediaWikiDump::Pages->new(\*STDIN);
+ }
+
+ #the case of the first letter of titles is ignored - force this option
+ #because the other values of the case setting are unknown
+ die 'this program only supports the first-letter case setting' unless
+ $pages->case eq 'first-letter';
+
+ print STDERR "Analyzing articles:\n";
+
+ while(defined($page = $pages->next)) {
+ update_ui() if ++$artcount % 500 == 0;
+
+ #main namespace only
+ next unless $page->namespace eq '';
+ next unless defined($page->redirect);
+
+ my $title = case_fixer($page->title);
+ #create a list of redirects indexed by their original name
+ $redirs{$title} = case_fixer($page->redirect);
+ }
+
+ my $redir_count = scalar(keys(%redirs));
+ print STDERR "done; searching $redir_count redirects:\n";
+
+ my $count = 0;
+
+ #if a redirect location is also a key to the index we have a double redirect
+ foreach my $key (keys(%redirs)) {
+ my $redirect = $redirs{$key};
+
+ if (defined($redirs{$redirect})) {
+ print "$key\n";
+ $count++;
+ }
+ }
+
+ print STDERR "discovered $count double redirects\n";
+
+ #removes case sensitivity from the very first letter of the title
+ #but not from the optional namespace name
+ sub case_fixer {
+ my $title = shift;
+
+ #check for namespace
+ if ($title =~ /^(.+?):(.+)/) {
+ $title = $1 . ':' . ucfirst($2);
+ } else {
+ $title = ucfirst($title);
+ }
+
+ return $title;
+ }
+
+ sub pretty_bytes {
+ my $bytes = shift;
+ my $pretty = int($bytes) . ' bytes';
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = int($bytes) . ' kilobytes';
+ }
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = sprintf("%0.2f", $bytes) . ' megabytes';
+ }
+
+ if (($bytes = $bytes / 1024) > 1) {
+ $pretty = sprintf("%0.4f", $bytes) . ' gigabytes';
+ }
+
+ return $pretty;
+ }
+
+ sub pretty_number {
+ my $number = reverse(shift);
+ $number =~ s/(...)/$1,/g;
+ $number = reverse($number);
+ $number =~ s/^,//;
+
+ return $number;
+ }
+
+ sub update_ui {
+ my $seconds = time - $start;
+ my $bytes = $pages->current_byte;
+
+ print STDERR " ", pretty_number($artcount), " articles; ";
+ print STDERR pretty_bytes($bytes), " processed; ";
+
+ if (defined($file_size)) {
+ my $percent = int($bytes / $file_size * 100);
+
+ print STDERR "$percent% completed\n";
+ } else {
+ my $bytes_per_second = int($bytes / $seconds);
+ print STDERR pretty_bytes($bytes_per_second), " per second\n";
+ }
+ }
+
+=head1 TODO
+
+=over 4
+
+=item Support comprehensive dump files
+
+Currently the full page dump files (such as 20050909_pages_full.xml.gz)
+are not supported.
+
+=item Optimization
+
+It would be nice to increase the processing speed of the XML files. Current
+ideas:
+
+=over 4
+
+=item Move to arrays instead of hashes for base objects
+
+Currently the base types for the majority of the classes are hashes. Most of
+these could be changed to arrays indexed by numerical constants, which are
+cheaper to access; a rough sketch appears after this list.
+
+=item Stackless parsing
+
+Placing each XML token on the stack is probably quite time consuming. It may be
+better to move to a stackless system where the XML parser is given a new set
+of callbacks to use when it encounters each specific token.
+
+=back
+
+=back
+
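+For illustration, one possible shape for the array-based objects mentioned
+above; this is only a sketch of the idea, not code that exists in this module:
+
+  package Parse::MediaWikiDump::page_array; #hypothetical class
+
+  use constant DATA            => 0;
+  use constant CACHE           => 1;
+  use constant CATEGORY_ANCHOR => 2;
+
+  sub new {
+      my ($class, $data, $category_anchor) = @_;
+      return bless([$data, {}, $category_anchor], $class);
+  }
+
+  sub title {
+      my $self = pop; #pop, per the note at the top of this file
+      return $$self[DATA]{title};
+  }
+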
+=head1 AUTHOR
+
+This module was created and documented by Tyler Riddle E<lt>tr...@gm...<gt>.
+
+=head1 BUGS
+
+Please report any bugs or feature requests to
+C<bug...@rt...>, or through the web interface at
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-MediaWikiDump>.
+I will be notified, and then you'll automatically be notified of progress on
+your bug as I make changes.
+
+=head2 Known Bugs
+
+No known bugs at this time.
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Tyler Riddle, all rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
Modified: trunk/includes/functions.pm
===================================================================
--- trunk/includes/functions.pm 2006-09-01 11:27:14 UTC (rev 15)
+++ trunk/includes/functions.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -3,20 +3,14 @@
use strict;
# Libraries
-use LWP;
-use HTTP::Cookies;
-use includes::mwdump;
+use includes::dump;
+use includes::http;
use config::configure;
use config::runtime;
# Counters
my ($i, $j, $k, $l, $m);
-# The browser we use to surf; enable cookies, and use a specific user agent
-my $browser = LWP::UserAgent->new;
-$browser->cookie_jar({});
-$browser->agent("Mozilla/5.0 (compatible; Perl Wikibot)");
-
# Get all the pages matching specific restrictions
sub getMatchingPages
{
@@ -134,73 +128,6 @@
timegm($6, $5, $4, $3, $2 - 1, $1 - 1900); # Return
}
-# Build the URL of a wiki page
-sub buildPageURL
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
-
- # Initial URL: server, script path and title
- my $url = server().scriptPath()."?title=$title";
-
- # Action
- if (defined($action))
- {
- $url = "$url&action=$action";
- }
-
- # Parameters
- if (defined($get))
- {
- $url = "$url&$get";
- }
-
- $url; # Return
-}
-
-# Get a wiki page, try again and again if error
-sub getPage
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $get = $_[2];
-
- my $url = buildPageURL($title, $action, $get);
-
- my $result;
- do
- {
- $result = $browser->get($url);
- }
- while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
-
- $result->content; # Return
-}
-
-# Post a wiki page, try again and again if error
-sub postPage
-{
- # Get parameters
- my $title = $_[0];
- my $action = $_[1];
- my $post = $_[2];
- my $get = $_[3];
-
- my $url = buildPageURL($title, $action, $get);
-
- my $result;
- do
- {
- $result = $browser->post($url, $post);
- }
- while ((!$result->is_success) && (!$result->status_line =~ /302[\s\w]+/));
-
- $result->content; # Return
-}
-
# Log in
sub login
{
Added: trunk/includes/http.pm
===================================================================
--- trunk/includes/http.pm (rev 0)
+++ trunk/includes/http.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -0,0 +1,81 @@
+# Code style
+use warnings;
+use strict;
+
+# Libraries
+use LWP;
+
+# Counters
+my ($i, $j, $k, $l, $m);
+
+# The browser we use to surf; enable cookies, and use a specific user agent
+my $browser = LWP::UserAgent->new;
+$browser->cookie_jar({});
+$browser->agent("Mozilla/5.0 (compatible; Perl MediaWiki Robot)");
+
+# Build the URL of a wiki page
+sub buildPageURL
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $get = $_[2];
+
+ # Initial URL: server, script path and title
+ my $url = server().scriptPath()."?title=$title";
+
+ # Action
+ if (defined($action))
+ {
+ $url = "$url&action=$action";
+ }
+
+ # Parameters
+ if (defined($get))
+ {
+ $url = "$url&$get";
+ }
+
+ $url; # Return
+}
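+
+# Example (hypothetical values): buildPageURL("Main_Page", "edit", "section=1")
+# returns server().scriptPath()."?title=Main_Page&action=edit&section=1"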
+
+# Get a wiki page, try again and again if error
+sub getPage
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $get = $_[2];
+
+ my $url = buildPageURL($title, $action, $get);
+
+ my $result;
+ do
+ {
+ $result = $browser->get($url);
+ }
+ while ((!$result->is_success) && ($result->status_line ne "302 Moved Temporarily"));
+
+ $result->content; # Return
+}
+
+# Post a wiki page, try again and again if error
+sub postPage
+{
+ # Get parameters
+ my $title = $_[0];
+ my $action = $_[1];
+ my $post = $_[2];
+ my $get = $_[3];
+
+ my $url = buildPageURL($title, $action, $get);
+
+ my $result;
+ do
+ {
+ $result = $browser->post($url, $post);
+ }
+ while ((!$result->is_success) && ($result->status_line !~ /302[\s\w]+/));
+
+ $result->content; # Return
+}
Property changes on: trunk/includes/http.pm
___________________________________________________________________
Name: svn:eol-style
+ native
Deleted: trunk/includes/mwdump.pm
===================================================================
--- trunk/includes/mwdump.pm 2006-09-01 11:27:14 UTC (rev 15)
+++ trunk/includes/mwdump.pm 2006-09-01 16:04:29 UTC (rev 16)
@@ -1,1394 +0,0 @@
[truncated message content] |