Thread: [phpodpworld-users] Suggestion on tools/extract.pl

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

#!/usr/bin/perl
#
# This file is part of phpODPWorld and released under GNU GPL.
#
# $Id: extract.pl,v 1.3 2006/03/24 21:49:59 hansfn Exp $
#

use strict;

# At least two arguments are needed: the RDF file and one or more categories.
# With fewer, print the usage text and leave.
unless (@ARGV >= 2) {
    print "Usage: extract.pl rdffile category...\n",
          "(The RDF file should already be uncompressed\n",
          "and the category name must NOT end with a slash.)\n";
    exit;
}

# The first argument names the RDF file to read; it has to exist.
my $rdffile = $ARGV[0];
die("RDF file ($rdffile) doesn't exist\n") unless -e $rdffile;

# Classify the RDF file by its name (case-insensitive): a name containing
# "structure" is checked first, then "content"; anything else is "unknown".
# The result becomes part of every output file name.
my $type = $rdffile =~ /structure/i ? "structure"
         : $rdffile =~ /content/i   ? "content"
         :                            "unknown";

# Turn category names into sorted "Top/<name>" paths.
#
# Parameters: the category names as given on the command line.
# Returns:    the list of "Top/<name>" strings in ascending string order.
#
# Every name is considered; the list is not cut short by a name that Perl
# treats as false (such as "0"). Empty names are skipped with a warning,
# because "Top/" on its own is not a usable category.
sub categories_from_args {
    my @names = @_;

    my @paths;
    for my $name (@names) {
        if (!defined $name || $name eq '') {
            warn "Ignoring empty category name\n";
            next;
        }
        push @paths, "Top/$name";
    }

    my @sorted = sort { $a cmp $b } @paths;
    return @sorted;
}

# Creating an array for the categories (every argument after the RDF file)
# and sort it, so the categories are tried in ascending string order.
my @array = categories_from_args(@ARGV[1 .. $#ARGV]);

# The main parsing starts here
#
# The RDF file is read once, line by line. Each '<Topic r:id="...">' line is
# compared with the category currently looked for ($cat, taken in order from
# the sorted @array). While a category is matched ($match == 1) every input
# line is copied to that category's output file "<category>-<type>.rdf.u8",
# which starts with an RDF header and ends with '</RDF>'.
my $cat   = shift(@array);  # category currently looked for (undef when none are left)
my $match = 0;              # 1 while input lines are being copied to the output file
my $out;                    # handle of the output file currently open, if any
my $outfile;                # name of that file, used in error messages

# Header written at the top of every extracted file.
my $rdf_header = qq{<?xml version="1.0" encoding="UTF-8" ?>\n}
               . qq{<RDF xmlns:r="http://www.w3.org/TR/RDF/"\n}
               . qq{     xmlns:d="http://purl.org/dc/elements/1.0/"\n}
               . qq{     xmlns="http://dmoz.org/rdf">\n}
               . qq{\n};

# Write the closing tag to the open output file (if there is one) and close
# it. Doing nothing when no file is open avoids printing to an unopened handle.
my $finish_output = sub {
  return unless defined $out;
  print {$out} '</RDF>';
  # Buffered write errors only show up at close, so check it.
  close($out) or die "Can't close extracted RDF file ($outfile): $!";
  undef $out;
};

open(my $rdf, '<', $rdffile) or die "Can't open RDF file ($rdffile) for reading: $!";
LINE:
while (defined(my $line = <$rdf>))
{
  if ($line =~ m/<Topic r:id="/)
  {
    # Reduce the line to the topic id.
    my $topic = $line;
    chomp $topic;
    $topic =~ s/<Topic r:id="(.*)">/$1/;

    # Move on to the next category once this topic sorts after the current one.
    # NOTE(review): the compared prefix is one character shorter than $cat,
    # exactly as before - confirm that this is intended.
    while (defined $cat && substr($topic, 0, (length($cat)-1)) gt $cat)
    {
      $match = 0;
      $cat = shift(@array);
    }

    # All categories have been passed, so nothing is left to extract. Without
    # this stop the comparison above would keep being made against an
    # undefined $cat and the loop would not end.
    last LINE unless defined $cat;

    # \Q...\E matches the category literally, so names containing regex
    # metacharacters (for example "C++") are handled.
    if ($topic =~ m/\Q$cat\E/)
    {
      if ($match == 0)
      {
        print "Parsing category: $cat\n";
        $match = 1;

        # Finish the file of the previously extracted category, if any.
        $finish_output->();

        # Removing slashes from category since it will be used in the filename
        my $key_cat = $cat;
        $key_cat =~ s#Top/##;
        $key_cat =~ s#/#_#g;
        $outfile = "$key_cat-$type.rdf.u8";

        open($out, '>', $outfile) or die "Can't open extracted RDF file ($outfile) for writing: $!";
        print {$out} $rdf_header;
      }
    }
  }

  # Copy the line while inside a wanted category; skip it otherwise.
  print {$out} $line if $match == 1;
}
close($rdf);

# Finish the last extracted file, if any category was found at all.
$finish_output->();

exit;