Re: [phpodpworld-users] Suggestion on tools/extract.pl
Status: Beta
Brought to you by:
hansfn
From: Howard L. <hl...@gm...> - 2010-06-13 11:08:26
|
#!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#
# Splits an RDF dump into one output file per requested category.
#
# Usage: extract.pl rdffile category [...]
#
#   rdffile   RDF file to parse; a name ending in ".gz" is decompressed
#             on the fly by piping it through the external "gzip" program.
#   category  One or more category paths, given without the leading "Top/".
#
# For every category a file "<category with / replaced by _>-<type>.rdf.u8"
# is created in the current directory, where <type> is "structure",
# "content" or "unknown" depending on the name of the input file.
#
# The input is scanned line by line.  Each line containing
# '<Topic r:id="' switches the per-category "copying" flags: a category is
# switched on when the topic id starts with "Top/<category>" and off
# otherwise.  Every line (including the Topic line itself) is then copied
# to the output file of each category that is currently switched on.
#
# Note: the comparison is a plain prefix match, so "World/Norsk" also
# selects a topic such as "Top/World/Norsk2" should one exist.
#

use strict;
use warnings;

if ($#ARGV < 1) {
    print "Usage: extract.pl rdffile category [...]\n\n";
    print "rdffile Specifies the RDF file for parsing\n";
    print " (can be text format or compressed by gzip)\n";
    print "category Specifies the category to extract\n";
    print " (separate multiple categories with space)\n\n";
    print "e.g. extract.pl structure.rdf.u8.gz World/Norsk Regional/Europe/Norway\n";
    exit;
}

my $rdffile = $ARGV[0];
if (! -e $rdffile) {
    die("RDF file ($rdffile) doesn't exist\n");
}

# Determine type of RDF file (based on filename) - "structure" or "content"
my $type;
if ($rdffile =~ /structure/i) {
    $type = "structure";
}
elsif ($rdffile =~ /content/i) {
    $type = "content";
}
else {
    $type = "unknown";
}

# Collect the categories: every argument after the RDF file name.
# Empty arguments are skipped and repeated categories are kept only once,
# so that the same output file is never opened (and truncated) twice.
my %seen;
my @categories = grep { length($_) && !$seen{$_}++ } @ARGV[1 .. $#ARGV];
die "No category given\n" unless @categories;

# Precompile one anchored pattern per category.  \Q...\E makes the category
# a literal string, so names containing regex metacharacters such as
# "+", "(" or "." are matched as typed.
my @catpattern = map { qr{^Top/\Q$_\E} } @categories;

# Opening lines written at the top of every output file.
my $rdf_header = <<'END_HEADER';
<?xml version="1.0" encoding="UTF-8" ?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/"
     xmlns:d="http://purl.org/dc/elements/1.0/"
     xmlns="http://dmoz.org/rdf">
END_HEADER

my @catfh    = ();    # output file handle per category
my @catmatch = ();    # 1 while the current topic belongs to the category

# Open files for each category
for my $i (0 .. $#categories) {
    $catmatch[$i] = 0;

    my $safecategory = $categories[$i];
    $safecategory =~ s#/#_#g;
    my $outfile = "$safecategory-$type.rdf.u8";

    # Three-argument open: the file name can never be taken for a mode.
    open($catfh[$i], '>', $outfile)
        or die "Can't open RDF file ($outfile) for writing: $!";
    print {$catfh[$i]} $rdf_header;
}

# Open the input.  Compressed files are read through "gzip -dc"; the list
# form of the pipe open runs gzip directly without a shell, so characters
# in the file name are never interpreted as shell syntax.  "--" stops gzip
# from treating a file name that starts with "-" as an option.
my $rdffh;
if ($rdffile =~ /\.gz$/) {
    open($rdffh, '-|', 'gzip', '-dc', '--', $rdffile)
        or die "Can't open RDF file ($rdffile) for reading: $!";
}
else {
    open($rdffh, '<', $rdffile)
        or die "Can't open RDF file ($rdffile) for reading: $!";
}

# The main parsing starts here
while (my $line = <$rdffh>) {

    # Check for Topic lines and compare
    if ($line =~ m/<Topic r:id="/) {
        my $catread = $line;
        $catread =~ s/^\s+<Topic r:id="(.*)">/$1/;
        # print "Current category: $catread\n";
        for my $i (0 .. $#categories) {
            $catmatch[$i] = ($catread =~ $catpattern[$i]) ? 1 : 0;
        }
    }

    # Write line to file if matched
    for my $i (0 .. $#categories) {
        print {$catfh[$i]} $line if $catmatch[$i];
    }
}

# For a pipe, close() also reports a non-zero exit status of gzip.  Only
# warn here: the data read so far has already been written out.
if (!close($rdffh)) {
    my $reason = $! ? "$!" : "gzip exit status " . ($? >> 8);
    warn "Problem while reading RDF file ($rdffile): $reason\n";
}

# Close files for each category
for my $i (0 .. $#categories) {
    print {$catfh[$i]} '</RDF>';
    # Buffered write errors (e.g. disk full) only show up at close time.
    close($catfh[$i])
        or die "Can't close output file for category ($categories[$i]): $!";
}

exit;