Re: [phpodpworld-users] Suggestion on tools/extract.pl
Status: Beta
Brought to you by:
hansfn
|
From: Howard L. <hl...@gm...> - 2010-06-13 11:08:26
|
#!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#
use strict;
use warnings;
# Show the usage summary and stop unless both an RDF file and at least one
# category were supplied on the command line.
if (@ARGV < 2) {
    print
        "Usage: extract.pl rdffile category [...]\n\n",
        "rdffile Specifies the RDF file for parsing\n",
        " (can be text format or compressed by gzip)\n",
        "category Specifies the category to extract\n",
        " (separate multiple categories with space)\n\n",
        "e.g. extract.pl structure.rdf.u8.gz World/Norsk Regional/Europe/Norway\n";
    exit;
}
# The first argument names the RDF dump to read; refuse to continue if it
# is not there.
my $rdffile = $ARGV[0];
die("RDF file ($rdffile) doesn't exist\n") unless -e $rdffile;

# Determine type of RDF file (based on filename) - "structure" or "content".
# The result is only used to build the names of the output files.
my $type = $rdffile =~ /structure/i ? "structure"
         : $rdffile =~ /content/i   ? "content"
         :                            "unknown";
# Creating an array for the categories: every argument after the RDF file.
# A slice is used instead of testing each argument for truth, so that an
# argument such as "0" no longer cuts the list short.
my $i = 0;            # loop index, reused by the loops further down
my @categories = @ARGV[1 .. $#ARGV];
my $catsize = scalar @categories;
my @catfh = ();       # one output file handle per category
my @catmatch = ();    # 1 while the topic being read belongs to the category
# Open files for each category
for ($i = 0; $i < $catsize; $i++)
{
    $catmatch[$i] = 0;
    # Flatten the category path into a file name: World/Norsk -> World_Norsk
    my $safecategory = $categories[$i];
    $safecategory =~ s#/#_#g;
    my $outfile = "$safecategory-$type.rdf.u8";
    # Three-argument open: the mode is given separately, so nothing in the
    # category name can be interpreted as part of the open mode.
    open ($catfh[$i], '>', $outfile) or die "Can't open RDF file ($outfile) for writing: $!";
    # Every output file starts with the XML declaration and the RDF root tag
    print {$catfh[$i]} '<?xml version="1.0" encoding="UTF-8" ?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/"
xmlns:d="http://purl.org/dc/elements/1.0/"
xmlns="http://dmoz.org/rdf">
';
}
# The main parsing starts here
#
# The input is read line by line. Each <Topic r:id="..."> line decides which
# categories are "active"; that line and every following line are copied to
# the output file of each active category until the next Topic line.

# One anchored pattern per category, compiled once before the loop.
#  - \Q...\E makes the category match literally (e.g. "C++" or "." in a name).
#  - Unless the category was given with a trailing "/", the match must end at
#    a path boundary, so "World/Foo" selects Top/World/Foo and everything
#    below it, but not a sibling such as Top/World/Foo_Bar.
my @catpattern = map {
    my $boundary = m{/\z} ? '' : '(?:/|\z)';
    qr{^Top/\Q$_\E$boundary};
} @categories;

# Open the input. Files ending in .gz are streamed through "gzip -dc" using
# the list form of a piped open, so the file name is passed to gzip as a
# single argument and never interpreted by a shell.
my $is_gzip = ($rdffile =~ /\.gz$/) ? 1 : 0;
my $rdffh;
if ($is_gzip) {
    open($rdffh, '-|', 'gzip', '-dc', $rdffile)
        or die "Can't open RDF file ($rdffile) for reading: $!";
}
else {
    open($rdffh, '<', $rdffile)
        or die "Can't open RDF file ($rdffile) for reading: $!";
}

while (my $line = <$rdffh>)
{
    # Check for Topic lines and compare
    if ($line =~ m/<Topic r:id="/)
    {
        # Pull out just the id; an unparsable Topic line matches no category
        my $catread = ($line =~ m/^\s*<Topic r:id="(.*)">/) ? $1 : '';
        # print "Current category: $catread\n";
        for my $idx (0 .. $catsize - 1)
        {
            $catmatch[$idx] = ($catread =~ $catpattern[$idx]) ? 1 : 0;
        }
    }
    # Write line to file if matched
    for my $idx (0 .. $catsize - 1)
    {
        print {$catfh[$idx]} $line if $catmatch[$idx];
    }
}

# Closing a piped handle waits for gzip and returns false if it failed.
# Only warn, so the output files still get their closing tag written.
unless (close($rdffh)) {
    warn $is_gzip
        ? "gzip -dc reported a problem reading $rdffile (wait status $?)\n"
        : "Error closing RDF file ($rdffile): $!\n";
}
# Close files for each category
#
# Each output file gets the closing RDF tag and is then closed. The result of
# close is checked because buffered write errors (for example a full disk)
# are only reported at that point. All files are attempted before exiting;
# the exit status is 1 if any of them could not be completed, 0 otherwise.
my $closefailed = 0;
for my $idx (0 .. $catsize - 1)
{
    my $outfh = $catfh[$idx];
    print {$outfh} '</RDF>';
    unless (close($outfh)) {
        warn "Can't finish writing output for category ($categories[$idx]): $!\n";
        $closefailed = 1;
    }
}
exit($closefailed);
|