[phpodpworld-users] Suggestion on tools/extract.pl
Status: Beta
Brought to you by:
hansfn
From: Howard L. <hl...@gm...> - 2009-01-18 13:16:25
|
#!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#
# Extracts the requested categories from an uncompressed RDF file.
#
# Usage: extract.pl rdffile category...
#
# Every category argument is prefixed with "Top/" and the resulting list is
# sorted.  The RDF file is read line by line; whenever a <Topic r:id="...">
# line starts with the current category, that line and all following lines
# are copied to "<category>-<type>.rdf.u8" in the current directory, where
# slashes in the category are replaced by underscores and <type> is
# "structure", "content" or "unknown" depending on the RDF file name.
# Each output file is wrapped in an <RDF> root element.
#

use strict;
use warnings;

my ($rdffile, @requested) = @ARGV;

# Empty arguments are ignored: "Top/" would be a prefix of every topic.
my @categories = sort { $a cmp $b }
                 map  { "Top/$_" }
                 grep { length } @requested;

if (!defined $rdffile || !@categories) {
    print "Usage: extract.pl rdffile category...\n";
    print "(The RDF file should already be uncompressed\n";
    print "and the category name must NOT end with a slash.)\n";
    exit;
}

if (! -e $rdffile) {
    die("RDF file ($rdffile) doesn't exist\n");
}

# Determine type of RDF file (based on filename) - "structure" or "content"
my $type;
if ($rdffile =~ /structure/i) {
    $type = "structure";
}
elsif ($rdffile =~ /content/i) {
    $type = "content";
}
else {
    $type = "unknown";
}

# The main parsing starts here
my $cat     = shift(@categories);   # category currently being looked for
my $match   = 0;                    # 1 while lines are copied to $out_fh
my $out_fh;                         # handle of the output file, if any is open
my $outfile;                        # name of the file behind $out_fh

open(my $rdf_fh, '<', $rdffile)
    or die "Can't open RDF file ($rdffile) for reading: $!";

LINE:
while (my $line = <$rdf_fh>) {
    if ($line =~ m/<Topic r:id="(.*)">/) {
        my $topic = $1;

        # Move on to the next requested category once the topics in the
        # file have passed the current one.
        # NOTE(review): the prefix is one character shorter than $cat, so a
        # topic differing from $cat only in that last position never
        # triggers a switch.  Kept as in the original - confirm whether
        # length($cat) was intended.
        while (substr($topic, 0, length($cat) - 1) gt $cat) {
            $match = 0;
            $cat   = shift(@categories);

            # No categories left: nothing further can match, so stop
            # reading instead of looping forever on an undefined category.
            last LINE if !defined $cat;
        }

        # Literal prefix comparison; the category comes from the command
        # line and must not be interpreted as a regular expression.
        if (index($topic, $cat) == 0) {
            if ($match == 0) {
                print "Parsing category: $cat\n";

                # Finish the file of the previous category, if there is one.
                close_output($out_fh, $outfile) if $out_fh;

                # Removing slashes from category since it will be used in
                # the filename
                my $key_cat = $cat;
                $key_cat =~ s#Top/##;
                $key_cat =~ s#/#_#g;

                $outfile = "$key_cat-$type.rdf.u8";
                $out_fh  = open_output($outfile);
            }
            $match = 1;
        }
    }

    # Write line to file while inside a matching category, skip it otherwise
    print {$out_fh} $line if $match == 1;
}

close($rdf_fh);
close_output($out_fh, $outfile) if $out_fh;

exit;

# Opens $outfile for writing, writes the XML declaration and the opening
# <RDF> element, and returns the file handle.  Dies if the file cannot be
# opened.
sub open_output {
    my ($outfile) = @_;

    open(my $fh, '>', $outfile)
        or die "Can't open extracted RDF file ($outfile) for writing: $!";

    print {$fh} '<?xml version="1.0" encoding="UTF-8" ?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/"
     xmlns:d="http://purl.org/dc/elements/1.0/"
     xmlns="http://dmoz.org/rdf">
';

    return $fh;
}

# Writes the closing </RDF> element to $fh and closes it.  Dies if closing
# fails, since buffered write errors only surface at that point.
sub close_output {
    my ($fh, $outfile) = @_;

    print {$fh} '</RDF>';
    close($fh)
        or die "Can't close extracted RDF file ($outfile): $!";

    return;
}