[phpodpworld-users] Suggestion on tools/extract.pl
Status: Beta
Brought to you by:
hansfn
|
From: Howard L. <hl...@gm...> - 2009-01-18 13:16:25
|
#!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#
use strict;
use warnings;

# Usage: extract.pl rdffile category...
#
# Copies the requested categories out of an uncompressed RDF dump. For every
# category given on the command line, each "<Topic r:id=...>" element whose id
# is "Top/<category>" or lies below it is written, together with all lines
# that follow it up to the next Topic element, to
#
#     <category with "/" replaced by "_">-<type>.rdf.u8
#
# in the current directory. <type> is "structure", "content" or "unknown",
# guessed from the name of the input file. A category that never occurs in
# the input produces no file.
#
# Topics are matched individually against the requested categories, so the
# result does not depend on the order of the Topic elements in the dump. When
# both a category and one of its sub-categories are requested, everything
# goes into the file of the parent category.

# Prints the command line help to STDOUT.
sub usage {
    print "Usage: extract.pl rdffile category...\n";
    print "(The RDF file should already be uncompressed\n";
    print "and the category name must NOT end with a slash.)\n";
    return;
}

# Returns the output file name for a category ("Top/Arts/Music") and an RDF
# type. Slashes are replaced because the category becomes part of the name.
sub outfile_name {
    my ($cat, $type) = @_;
    my $key_cat = $cat;
    $key_cat =~ s#\ATop/##;
    $key_cat =~ s#/#_#g;
    return "$key_cat-$type.rdf.u8";
}

# Creates (truncating) the output file of a category, writes the XML/RDF
# header to it and returns the open filehandle. Dies if the file cannot be
# created or written.
sub open_category_file {
    my ($cat, $type) = @_;
    my $outfile = outfile_name($cat, $type);
    open(my $fh, '>', $outfile)
        or die "Can't open extracted RDF file ($outfile) for writing: $!";
    print {$fh} <<'END_HEADER' or die "Can't write to extracted RDF file ($outfile): $!";
<?xml version="1.0" encoding="UTF-8" ?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/"
xmlns:d="http://purl.org/dc/elements/1.0/"
xmlns="http://dmoz.org/rdf">
END_HEADER
    return $fh;
}

my ($rdffile, @names) = @ARGV;

# Build the sorted list of full category ids. A trailing slash would prevent
# any Topic from matching, so it is removed; empty names are ignored.
my @categories = sort
                 map  { "Top/$_" }
                 grep { length }
                 map  { (my $name = $_) =~ s{/+\z}{}; $name }
                 @names;

if (!defined $rdffile || !@categories) {
    usage();
    exit;
}

if (! -e $rdffile) {
    die("RDF file ($rdffile) doesn't exist\n");
}

# Determine type of RDF file (based on filename) - "structure" or "content"
my $type = $rdffile =~ /structure/i ? "structure"
         : $rdffile =~ /content/i   ? "content"
         :                            "unknown";

# One anchored pattern for all categories. The names are escaped so that
# characters such as "+" or "(" are taken literally, and the match must end
# at a "/" or at the end of the id so that "Top/Arts" does not select
# "Top/Arts_Foo". Because @categories is sorted, a parent category precedes
# its sub-categories in the alternation and therefore wins.
my $alternatives = join '|', map { quotemeta } @categories;
my $wanted       = qr{\A($alternatives)(?:/|\z)};

# The main parsing starts here
open(my $rdf, '<', $rdffile)
    or die "Can't open RDF file ($rdffile) for reading: $!";

my %out_for;        # category => filehandle, opened when first needed
my $current_cat;    # category receiving the current lines; undef = skip

while (my $line = <$rdf>) {
    if ($line =~ /<Topic r:id="([^"]*)"/) {
        # Every Topic element decides anew whether the following lines are
        # copied, so lines of an unrelated Topic never leak into a file.
        my $topic = $1;
        if ($topic =~ $wanted) {
            $current_cat = $1;
            if (!$out_for{$current_cat}) {
                print "Parsing category: $current_cat\n";
                $out_for{$current_cat} = open_category_file($current_cat, $type);
            }
        }
        else {
            $current_cat = undef;
        }
    }
    elsif ($line =~ m{\A\s*</RDF>}) {
        # The closing tag of the dump is not copied: each output file gets
        # its own closing tag below.
        $current_cat = undef;
    }

    next if !defined $current_cat;

    # Write line to file
    print { $out_for{$current_cat} } $line
        or die "Can't write to extracted RDF file ("
             . outfile_name($current_cat, $type) . "): $!";
}
close($rdf);

# Terminate and close every file that was actually opened. Buffered write
# errors only show up at close time, hence the check.
for my $cat (sort keys %out_for) {
    my $fh      = $out_for{$cat};
    my $outfile = outfile_name($cat, $type);
    print {$fh} '</RDF>'
        or die "Can't write to extracted RDF file ($outfile): $!";
    close($fh)
        or die "Can't close extracted RDF file ($outfile): $!";
}
exit;
|