Re: [phpodpworld-users] Suggestion on tools/extract.pl
Status: Beta
                
                Brought to you by:
                
                    hansfn
                    
                
            | 
      
      
      From: Howard L. <hl...@gm...> - 2010-06-13 11:08:26
      
     | 
| #!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#
use strict;
use warnings;
# At least two arguments are required: the RDF file and one category.
# With fewer, show the help text on standard output and stop.
unless (@ARGV >= 2)
{
  print
    "Usage: extract.pl rdffile category [...]\n\n",
    "rdffile   Specifies the RDF file for parsing\n",
    "          (can be text format or compressed by gzip)\n",
    "category  Specifies the category to extract\n",
    "          (separate multiple categories with space)\n\n",
    "e.g. extract.pl structure.rdf.u8.gz World/Norsk Regional/Europe/Norway\n";
  exit;
}
# The first argument names the RDF dump to read; give up right away if
# nothing exists at that path.
my $rdffile = $ARGV[0];
die("RDF file ($rdffile) doesn't exist\n") unless -e $rdffile;
# Classify the dump from its file name (case-insensitive). "structure" is
# tested first, so it wins when a name contains both words; anything else
# is labelled "unknown".
my $type = $rdffile =~ /structure/i ? "structure"
         : $rdffile =~ /content/i   ? "content"
         :                            "unknown";
# Collect the categories to extract: every command-line argument after the
# RDF file name. The list is taken with a slice instead of looping while the
# next argument is "true", so a category literally named "0" no longer ends
# the list early and drops the categories after it. Empty arguments are
# skipped because they cannot name a category.
my $i = 0;    # shared index variable; stays declared for code further down that may reuse it
my @categories = grep { length } @ARGV[ 1 .. $#ARGV ];
my $catsize = scalar @categories;    # number of categories requested
# Per-category state, indexed in parallel with @categories:
my @catfh = ();       # output file handles
my @catmatch = ();    # 1 while the topic currently being read belongs to the category
# Open one output file per category and write the RDF prologue to it.
# The file name is the category with every "/" replaced by "_", followed by
# "-<type>.rdf.u8" (for example "World_Norsk-structure.rdf.u8").
for my $idx (0 .. $catsize - 1)
{
  $catmatch[$idx] = 0;    # no topic has matched this category yet
  (my $safecategory = $categories[$idx]) =~ s#/#_#g;
  my $outfile = "$safecategory-$type.rdf.u8";
  # Three-argument open: the mode is given separately, so characters in the
  # category name (a leading ">" or "&", surrounding whitespace, ...) can
  # never be interpreted as part of the open mode.
  open ($catfh[$idx], '>', $outfile) or die "Can't open RDF file ($outfile) for writing: $!";
  print {$catfh[$idx]} '<?xml version="1.0" encoding="UTF-8" ?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/"
     xmlns:d="http://purl.org/dc/elements/1.0/"
     xmlns="http://dmoz.org/rdf">
';
}
# The main parsing starts here.
#
# The RDF dump is read line by line. Each <Topic r:id="..."> line decides,
# per category, whether the lines from there on belong to that category;
# every line is then copied to the output file of each category whose flag
# is currently set.
my $rdf_fh;
if ($rdffile =~ /\.gz$/)
{
  # Compressed dump: read it through "gzip -dc". The list form of pipe open
  # runs gzip directly, without a shell, so shell metacharacters in the file
  # name are never executed. "--" keeps a name starting with "-" from being
  # taken as a gzip option.
  open($rdf_fh, '-|', 'gzip', '-dc', '--', $rdffile)
    or die "Can't open RDF file ($rdffile) for reading: $!";
}
else
{
  open($rdf_fh, '<', $rdffile)
    or die "Can't open RDF file ($rdffile) for reading: $!";
}
# One pattern per category, compiled once. \Q..\E makes the category match
# literally (names such as "C++" contain regex metacharacters). The pattern
# is a prefix match on "Top/<category>", so sub-categories are included, as
# is any other topic whose name merely starts with the same text.
my @cat_re = map { qr{^Top/\Q$_\E} } @categories;
while (my $line = <$rdf_fh>)
{
  # Check for Topic lines and compare
  if ($line =~ m/<Topic r:id="/)
  {
    # Pull out the topic id; the tag is accepted with or without leading
    # indentation. If the line does not have the expected shape, no
    # category matches.
    my ($catread) = $line =~ m/^\s*<Topic r:id="(.*)">/;
    $catread = '' unless defined $catread;
    for my $idx (0 .. $catsize - 1)
    {
      $catmatch[$idx] = ($catread =~ $cat_re[$idx]) ? 1 : 0;
    }
  }
  # Write line to file if matched
  for my $idx (0 .. $catsize - 1)
  {
    print {$catfh[$idx]} $line if ($catmatch[$idx] == 1);
  }
}
# For the gzip pipe, close() also reports a non-zero exit status of gzip.
# Only warn, so that the output files can still be finished.
close($rdf_fh)
  or warn "Problem closing RDF file ($rdffile): " . ($! || "exit status $?") . "\n";
# Finish every category file: append the closing </RDF> tag and close the
# handle. Buffered write errors (for example a full disk) often surface only
# at close time, so both calls are checked. A failure is reported but the
# remaining files are still finished; the script then exits with a non-zero
# status.
my $failed = 0;
for my $idx (0 .. $catsize - 1)
{
  my $out = $catfh[$idx];
  my $ok = print {$out} '</RDF>';
  $ok = close ($out) && $ok;
  unless ($ok)
  {
    warn "Can't finish RDF output for category ($categories[$idx]): $!\n";
    $failed++;
  }
}
exit($failed ? 1 : 0);
 |