Update of /cvsroot/docbook/xsl/fo
In directory usw-pr-cvs1:/tmp/cvs-serv15283
Added Files:
pdf2index
Log Message:
Extract indexes from PDF and merge page number ranges
--- NEW FILE ---
#!/usr/bin/perl -- # -*- Perl -*-
# this needs some cleanup...
my $PSTOTEXT = "pstotext";
my $pdf = shift @ARGV;
my $index = "";
my $inindex = 0;
open (F, "$PSTOTEXT $pdf |");
while (<F>) {
if (/^<\/index/) {
$index .= $_;
$inindex = 0;
}
$inindex = 1 if /^<index/;
if ($inindex) {
$index .= $_ if /^\s*</;
}
}
my $cindex = "";
while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
$cindex .= $1;
$_ = $2;
$index = $'; # '
my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
# Remove duplicates...
if ($#pages > 0) {
my @mpages = ();
my $current = "";
foreach my $page (@pages) {
my $pageno = &pageno($page);
if ($pageno ne $current) {
push (@mpages, $page);
$current = $pageno;
}
}
@pages = @mpages;
}
# Collapse ranges...
if ($#pages > 1) {
my @cpages = ();
while (@pages) {
my $count = 0;
my $len = &rangelen($count, @pages);
if ($len <= 2) {
my $page = shift @pages;
push (@cpages, $page);
} else {
my $fpage = shift @pages;
my $lpage = "";
while ($len > 1) {
$lpage = shift @pages;
$len--;
}
my $fpno = &pageno($fpage);
my $lpno = &pageno($lpage);
$fpage =~ s/>$fpno</>${fpno}-$lpno</s;
push (@cpages, $fpage);
}
}
@pages = @cpages;
}
my $page = shift @pages;
$page =~ s/\s*$//s;
$cindex .= $page;
while (@pages) {
$page = shift @pages;
$page =~ s/\s*$//s;
$cindex .= ", $page";
}
}
$cindex .= $index;
print "$cindex\n";
sub pageno {
my $page = shift;
$page =~ s/^<phrase.*?>//;
$page =~ s/^<link.*?>//;
return $1 if $page =~ /^([^<>]+)/;
return "?";
}
sub rangelen {
my $count = shift;
my @pages = @_;
my $len = 1;
my $inrange = 1;
my $current = &pageno($pages[$count]);
while ($count < $#pages && $inrange) {
$count++;
my $next = &pageno($pages[$count]);
if ($current + 1 eq $next) {
$current = $next;
$inrange = 1;
$len++;
} else {
$inrange = 0;
}
}
return $len;
}
|