From: <bi...@us...> - 2008-05-14 00:20:16
Revision: 2265 http://archive-access.svn.sourceforge.net/archive-access/?rev=2265&view=rev Author: binzino Date: 2008-05-13 17:20:24 -0700 (Tue, 13 May 2008) Log Message: ----------- Initial checkin of NutchWAX 0.12, a.k.a Nutch Archive Tools (NAT). Added Paths: ----------- trunk/archive-access/projects/nat/ trunk/archive-access/projects/nat/archive/ trunk/archive-access/projects/nat/archive/INSTALL.txt trunk/archive-access/projects/nat/archive/README.txt trunk/archive-access/projects/nat/archive/bin/ trunk/archive-access/projects/nat/archive/bin/nutchwax trunk/archive-access/projects/nat/archive/build.xml trunk/archive-access/projects/nat/archive/conf/ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml trunk/archive-access/projects/nat/archive/conf/search-servers.txt trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml trunk/archive-access/projects/nat/archive/lib/ trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar trunk/archive-access/projects/nat/archive/src/ trunk/archive-access/projects/nat/archive/src/java/ trunk/archive-access/projects/nat/archive/src/java/org/ trunk/archive-access/projects/nat/archive/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/NutchWax.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java trunk/archive-access/projects/nat/archive/src/plugin/ trunk/archive-access/projects/nat/archive/src/plugin/build-plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/ 
trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java Added: trunk/archive-access/projects/nat/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nat/archive/INSTALL.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/INSTALL.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,236 @@ + +INSTALL.txt +2008-05-06 +Aaron Binns + + +NutchWAX 0.12 is built and installed as an "add-on" to an +existing Nutch 1.0-dev installation. + +NutchWAX 0.12 uses a simple 'ant' build script. The script compiles +the NutchWAX sources, using the libraries in the installed +Nutch-1.0-dev. + +We strongly recommend having *two* Nutch-1.0-dev installation +directories: one that you build NutchWAX against, and another into +which NutchWAX is deployed. + +NutchWAX is deployed by un-tar'ing the nutchwax-0.12.tar.gz file +*into* an existing Nutch-1.0-dev installation. Think of NutchWAX as +an add-on. We over-write a few Nutch config files, but the rest is +simply added to the existing Nutch-1.0-dev installation. + + +Nutch-1.0-dev +------------- + +As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. +Nutch doesn't have a 1.0 release package yet, so we have to use the +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12 is +built against is: + + 650739 + +To check out this revision of Nutch, use: + + $ mkdir nutch + $ cd nutch + $ svn checkout -r 650739 http://svn.apache.org/repos/asf/lucene/nutch/trunk + +To build the nutch-1.0-dev.tar.gz package, use 'ant' + + $ cd trunk + $ ant tar + +This produces + + build/nutch-1.0-dev.tar.gz + +which we then install *twice* + + $ mkdir -p ~/nutchwax-0.12/nutch-1.0-dev + $ tar xzf build/nutch-1.0-dev.tar.gz -C ~/nutchwax-0.12/nutch-1.0-dev + $ mkdir -p /opt/nutch-1.0-dev + $ tar xzf build/nutch-1.0-dev.tar.gz -C /opt/nutch-1.0-dev + +The idea is that we keep /opt/nutch-1.0-dev as our pristine copy which +we compile against; then, when we want to test NutchWAX, we deploy it +into ~/nutchwax-0.12/nutch-1.0-dev. + +Why can't we just use one installation of Nutch? Mainly to avoid +weirdness where we are compiling NutchWAX source against the same set +of libraries where we would be installing NutchWAX. Consider: when we +deploy NutchWAX, we copy the nutchwax.jar into the Nutch 'lib' +directory. If we use that same 'lib' directory for dependencies when +compiling the source, 'ant'/'javac' will likely get confused when +calculating dependencies. + +It's possible that you could successfully go through the +build/test/release cycle using one Nutch-1.0-dev directory, but these +instructions assume you will have two. + + +Build and install +----------------- + + 1. Install two Nutch-1.0-dev packages per the instructions above. + + 2. Edit build.xml to point to the "pristine" installation of Nutch-1.0-dev + + <!-- NOTE: Point this to your Nutch 1.0-dev directory --> + <property name="nutch.dir" value="/opt/nutch-1.0-dev" /> + + 3.
Build NutchWAX-0.12 + + $ ant + + The default build rule is "package" which will compile all the source + and build an installation tarball: nutchwax-0.12.tar.gz + + The "build.xml" file is pretty straightforward; grepping + for the targets (compile, clean, etc.) should make it easy to follow. + + 4. Install NutchWAX into the build/test Nutch installation + + $ tar xzf nutchwax-0.12.tar.gz -C ~/nutchwax-0.12/nutch-1.0-dev + +That's it! + +All we do is add our libraries (nutchwax.jar and dependencies), the +'nutchwax' helper script, plugins for indexing and querying, and a few +config files. + +Except for the config files, no files in the Nutch-1.0-dev +installation are over-written, only added. The "nutch-site.xml" file +is over-written, but that file is empty in a vanilla Nutch +installation, so there's little risk of over-writing something. + + +HOWTO run and test +------------------ + +The 'nutchwax' helper script is installed in the Nutch-1.0-dev 'bin' +directory next to the 'nutch' helper script. + +The 'nutchwax' script is used to run the NutchWAX-specific tools; use +the regular 'nutch' script for regular Nutch activities. + +The 'nutchwax' script runs two tools + + "import" Import a set of .arc/.warc files from a manifest, creating + a Nutch segment. + + "dumpindex" Debug tool that dumps a Lucene index, such as the ones + created by Nutch's "index" tool. + +The idea is that the NutchWAX "import" tool supplants the Nutch +generate and fetch cycle. Rather than generating and fetching +segments, we import the .arc/.warc files directly into a newly created +segment. Then, we process that segment just as we normally would with +Nutch. + +For example, + + $ cd nutch-test + $ cat > manifest + http://someserver/foo-bar-baz.arc mycollection + ^D + $ nutch-1.0-dev/bin/nutchwax import manifest + +This will import the ARC file listed in the manifest into a newly +created segment. The segment is created by default in a directory +hierarchy of the form: + + segments/[date-timestamp] + +This mirrors the way segments are created in vanilla Nutch by the +"generate" command. + +You can explicitly name the segment if you want, e.g. + + nutchwax import manifest mysegment + +Once the segment is created by importing ARC files with +NutchWAX, you can use Nutch to perform the rest of the steps. For +example: + + $ nutch-1.0-dev/bin/nutchwax import manifest + $ nutch-1.0-dev/bin/nutch updatedb crawldb -dir segments + $ nutch-1.0-dev/bin/nutch invertlinks linkdb -dir segments + $ nutch-1.0-dev/bin/nutch index indexes crawldb linkdb segments/* + $ nutch-1.0-dev/bin/nutch merge index indexes + +This is pretty much the minimal set of steps to import and index a set +of ARC files. The crawldb update and link inversion steps are pro +forma and don't have anything to do with NutchWAX specifically, but +are a part of regular Nutch processing. + +Now you have a Nutch "index" directory and are ready to search! + +Searching is done as in vanilla Nutch. Either launch the Nutch webapp +or use the command-line interface to NutchBean to run some test +searches. Nothing NutchWAX-specific here. + + +Miscellaneous notes +------------------- + +1. Plugins + +There are two plugins bundled with NutchWAX: + + index-nutchwax + query-nutchwax + +See the "plugin.includes" property in nutch-site.xml to see where +these plugins are added to the filter chain.
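+ +For reference, the plugin list set in the bundled conf/nutch-site.xml +(the full file appears later in this change) is: + + protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic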
+ +The index-nutchwax plugin ensures that WAX-specific metadata is +transferred from the Nutch Content object to the Lucene Document +object, which is placed in the Lucene index. + +The query-nutchwax plugin is used to process query requests against +those same meta-data fields. It also expands the capabilities for +searching the basic Nutch fields. + +2. URL filters + +Nutch's URL filter by default filters out many common URL oddities +that would normally trip up Nutch's crawler. However, when importing +content from ARC files, filtering out content probably doesn't make +sense. That is, whatever content made it into the ARC file should be +imported, no matter what the URL looks like. + +To change the URL filter, edit the Nutch file 'conf/regex-urlfilter.txt'. +To pass all content through the filter, remove all filter rules except +for the last one: + + # accept anything else + +. + +3. conf/tika-mimetypes.xml + +NutchWAX comes with a fixed copy of tika-mimetypes.xml. The version +in Nutch revision 650739 has a few bugs in it which cause parsing to +fail for many document types. The bugs are: + + o Move the + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + definition higher up in the file, before the reference to it. + + o Remove + + <mime-type type="application/x-ms-dos-executable"> + <alias type="application/x-dosexec;exe" /> + </mime-type> + + as the ';' character is illegal according to the comments in the + Nutch code. + +The copy of "conf/tika-mimetypes.xml" bundled with NutchWAX fixes +these two bugs. Added: trunk/archive-access/projects/nat/archive/README.txt =================================================================== --- trunk/archive-access/projects/nat/archive/README.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/README.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,105 @@ + +README.txt +2008-05-06 +Aaron Binns + + +This is the NutchWAX-0.12 source that John Lee handed off to me. It +is a work in progress. + +Compared to NutchWAX-0.10 (and earlier) it is *much* simpler. The +main WAX-specific code is in just a few files really: + +src/java/org/archive/nutchwax/ArcsToSegment.java + + This is the meat of the WAX logic for processing .arc files and + generating Nutch segments. Once we use this to generate a set of + segments for the .arc files, we can use the rest of vanilla + Nutch-1.0-dev to invert links and index the content with Lucene. + + This conversion code is heavily edited from: + + nutch-1.0-dev/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java + + taken from the Nutch SVN head (a.k.a. the "1.0-dev" in development). + + Ours differs in a few important ways: + + o Rather than taking a directory with .arc files as input, we take + a manifest file with URLs to .arc files. This way, the manifest + is split up among the distributed Hadoop jobs and the .arc files + are processed in whole by each worker. + + In Nutch-1.0-dev, the ArcSegmentCreator.java expects the + input directory to contain the .arc files and (AFAICT) splits + them up and distributes them across the Hadoop workers. This + seems really inefficient to me; I think our approach is much + better -- at least for us. + + o Related to the way input files are split and processed, we use + the standard Archive ARCReader class just like Heritrix and + Wayback. + + The ArcSegmentCreator.java in Nutch-1.0-dev doesn't use our + ARCReader because of licensing incompatibility.
Ours is under + GPL and Nutch-1.0-dev forbids the use of GPL code. + + We are in the process of re-licensing or dual-licensing under the + Apache License, but until then, our ARCReader code won't be included + in mainline Nutch. + + This isn't a problem per se, but worth noting in case anyone + looks at the Nutch-1.0-dev code and wonders why they built their + own (horribly inefficient) .arc reader. + + o We add metadata fields to the processed document for WAX-specific + purposes: + + content.getMetadata().set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + content.getMetadata().set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ) ; + content.getMetadata().set( NutchWax.COLLECTION_KEY, collection); + content.getMetadata().set( NutchWax.ARCHIVE_DATE_KEY, meta.getDate() ); + + The addition of the arcname and collection keys is pretty + obvious. I don't know why the content-type isn't added in the + vanilla Nutch-1.0-dev. + + Also, we should review the use of the ARCHIVE_DATE_KEY in that + John Lee mentioned to me that there were possibly duplicate date + fields put in the index: one that is a plain old Java date, and + one that is a 14-digit date string for use with Wayback. + +src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java +src/plugin/index-nutchwax/plugin.xml + + This filter is pretty straightforward. All it does is take the + metadata fields that were added to the document (as described above) + and place them in the Lucene index so that we can make use of them at + search time. + +src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java +src/plugin/query-nutchwax/plugin.xml + + This is a single query-filter implementation that can be used for + querying individual fields. It does *not* add the ability to query + multiple fields at once, as you can already do that via Nutch. + + What this filter does is allow one to more-or-less create query + filters in a data-driven manner rather than having to code up a new + class for each field. That is, before, one would have to create a + CollectionQueryFilter class to filter on the "collection" field. + With this class, you can specify that the + "collection" field is to be filterable via the plugin.xml file and + the "nutchwax.filter.query" configuration property. + +src/java/org/archive/nutchwax/NutchWax.java + + Just a simple enum used by the above two classes for the metadata + keys. + +src/java/org/archive/nutchwax/tools/DumpIndex.java + + A simple command-line utility to dump the contents of a Lucene + index. Used for debugging. + + Added: trunk/archive-access/projects/nat/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nat/archive/bin/nutchwax (rev 0) +++ trunk/archive-access/projects/nat/archive/bin/nutchwax 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License.
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +# The following is cribbed from the 'nutch' script to ascertain the +# location of Nutch so we can call its scripts. +# +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +THIS_DIR=`dirname "$THIS"` +NUTCH_HOME=`cd "$THIS_DIR/.." ; pwd` + +# Now that we have NUTCH_HOME, process the command-line. + +case "$1" in + import) + shift + if [ $# -eq 0 ]; then + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment + exit 1 + fi + if [ -z "$2" ]; then + segment=`date +"%Y%m%d%H%M%S"` + segment="segments/${segment}" + else + segment="$2" + fi + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment "$1" "${segment}" + ;; + dumpindex) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpIndex $@ + ;; + *) + echo "" + echo "Usage: nutchwax COMMAND" + echo "where COMMAND is one of:" + echo " import Import ARCs into a new Nutch segment" + echo " dumpindex Dump an index to the screen" + echo "" + exit 1 + ;; +esac + +exit 0 Property changes on: trunk/archive-access/projects/nat/archive/bin/nutchwax ___________________________________________________________________ Name: svn:executable + * Added: trunk/archive-access/projects/nat/archive/build.xml =================================================================== --- trunk/archive-access/projects/nat/archive/build.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/build.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,138 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="nutchwax" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. + --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/archive/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/archive/conf"/> + <copy todir="${dist.dir}/contrib/archive/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/archive"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nat/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/nutch-site.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,65 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<!-- Put site-specific property overrides in this file. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> + <!-- Also, add 'parse-pdf' --> + <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value> +</property> + +<property> + <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by ArcsToSegment are mapped to the Lucene documents during indexing. + The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key", + where the only required part is the "src-key"; the rest assume the following defaults: + lowercase = true + store = true + tokenize = false + dest-key = src-key + For example, "arcname:false" indexes the arcname value without lowercasing it and uses the defaults for the rest. + --> + <name>nutchwax.filter.index</name> + <value> + arcname:false + collection + date + type + </value> +</property> + +<property> + <!-- Configure the 'query-nutchwax' plugin. Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not. + The specification format is "raw:name:lowercase:boost" or "field:name:boost". Default values are + lowercase = true + boost = 1.0f + There is no "lowercase" property for the "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter. + AFAICT, the order isn't important. --> + <!-- We do *not* use this filter for handling "date" queries; there is a specific filter for that: DateQueryFilter --> + <name>nutchwax.filter.query</name> + <value> + raw:arcname:false + raw:collection + raw:type + field:anchor + field:content + field:host + field:title + </value> +</property> + +<!-- Over-ride setting in Nutch "nutch-default.xml" file. We do *not* want Content-Type detection via magic resolution because the implementation + in Nutch reads in the entire content body (which could be a 1GB MPG movie), then converts it to a String before examining the first dozen or + so bytes/characters for magic matching. Since we archive large files, this is bad, and OOMs occur. So, we disable this feature and keep + the Content-Type that is already in the (W)ARC file. --> +<property> + <name>mime.type.magic</name> + <value>false</value> + <description>Defines if the mime content type detector uses magic resolution.
+ </description> +</property> + +</configuration> Added: trunk/archive-access/projects/nat/archive/conf/search-servers.txt =================================================================== --- trunk/archive-access/projects/nat/archive/conf/search-servers.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/search-servers.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1 @@ +localhost 9000 Added: trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,364 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Description: This xml file defines the valid mime types used by Tika. + The mime types within this file are based on the types in the mime-types.xml + file available in Apache Nutch. +--> + +<mime-info> + + <mime-type type="text/plain"> + <magic priority="50"> + <match value="This is TeX," type="string" offset="0" /> + <match value="This is METAFONT," type="string" offset="0" /> + </magic> + <glob pattern="*.txt" /> + <glob pattern="*.asc" /> + </mime-type> + + <mime-type type="text/html"> + <magic priority="50"> + <match value="<!DOCTYPE HTML" type="string" + offset="0:64" /> + <match value="<!doctype html" type="string" + offset="0:64" /> + <match value="<HEAD" type="string" offset="0:64" /> + <match value="<head" type="string" offset="0:64" /> + <match value="<TITLE" type="string" offset="0:64" /> + <match value="<title" type="string" offset="0:64" /> + <match value="<html" type="string" offset="0:64" /> + <match value="<HTML" type="string" offset="0:64" /> + <match value="<BODY" type="string" offset="0" /> + <match value="<body" type="string" offset="0" /> + <match value="<TITLE" type="string" offset="0" /> + <match value="<title" type="string" offset="0" /> + <match value="<!--" type="string" offset="0" /> + <match value="<h1" type="string" offset="0" /> + <match value="<H1" type="string" offset="0" /> + <match value="<!doctype HTML" type="string" offset="0" /> + <match value="<!DOCTYPE html" type="string" offset="0" /> + </magic> + <glob pattern="*.html" /> + <glob pattern="*.htm" /> + </mime-type> + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <sub-class-of type="text/xml" /> + <glob pattern="*.xhtml" /> + <root-XML namespaceURI='http://www.w3.org/1999/xhtml' + localName='html' /> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint"> + <glob pattern="*.ppz" /> + <glob pattern="*.ppt" /> + <glob pattern="*.pps" /> + <glob 
pattern="*.pot" /> + <magic priority="50"> + <match value="0xcfd0e011" type="little32" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/vnd.ms-excel"> + <magic priority="50"> + <match value="Microsoft Excel 5.0 Worksheet" type="string" + offset="2080" /> + </magic> + <glob pattern="*.xls" /> + <glob pattern="*.xlc" /> + <glob pattern="*.xll" /> + <glob pattern="*.xlm" /> + <glob pattern="*.xlw" /> + <glob pattern="*.xla" /> + <glob pattern="*.xlt" /> + <glob pattern="*.xld" /> + <alias type="application/msexcel" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.odt" /> + </mime-type> + + + <mime-type type="application/zip"> + <alias type="application/x-zip-compressed" /> + <magic priority="40"> + <match value="PK\003\004" type="string" offset="0" /> + </magic> + <glob pattern="*.zip" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.oth" /> + </mime-type> + + <mime-type type="application/msword"> + <magic priority="50"> + <match value="\x31\xbe\x00\x00" type="string" offset="0" /> + <match value="PO^Q`" type="string" offset="0" /> + <match value="\376\067\0\043" type="string" offset="0" /> + <match value="\333\245-\0\0\0" type="string" offset="0" /> + <match value="Microsoft Word 6.0 Document" type="string" + offset="2080" /> + <match value="Microsoft Word document data" type="string" + offset="2112" /> + </magic> + <glob pattern="*.doc" /> + <alias type="application/vnd.ms-word" /> + </mime-type> + + <mime-type type="application/octet-stream"> + <magic priority="50"> + <match value="\037\036" type="string" offset="0" /> + <match value="017437" type="host16" offset="0" /> + <match value="0x1fff" type="host16" offset="0" /> + <match value="\377\037" type="string" offset="0" /> + <match value="0145405" type="host16" offset="0" /> + </magic> + <glob pattern="*.bin" /> + </mime-type> + + <mime-type type="application/pdf"> + <magic priority="50"> + <match value="%PDF-" type="string" offset="0" /> + </magic> + <glob pattern="*.pdf" /> + <alias type="application/x-pdf" /> + </mime-type> + + <mime-type type="application/atom+xml"> + <root-XML localName="feed" + namespaceURI="http://purl.org/atom/ns#" /> + </mime-type> + + <mime-type type="application/mac-binhex40"> + <glob pattern="*.hqx" /> + </mime-type> + + <mime-type type="application/mac-compactpro"> + <glob pattern="*.cpt" /> + </mime-type> + + <mime-type type="application/rtf"> + <glob pattern="*.rtf"/> + <alias type="text/rtf" /> + </mime-type> + + <mime-type type="application/rss+xml"> + <alias type="text/rss" /> + <root-XML localName="rss" /> + <root-XML namespaceURI="http://purl.org/rss/1.0/" /> + <glob pattern="*.rss" /> + </mime-type> + + <!-- added in by mattmann --> + <mime-type type="application/x-mif"> + <alias type="application/vnd.mif" /> + </mime-type> + + <mime-type type="application/vnd.wap.wbxml"> + <glob pattern="*.wbxml" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlc"> + <_comment>Compiled WML Document</_comment> + <glob pattern="*.wmlc" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlscriptc"> + <_comment>Compiled WML Script</_comment> + <glob pattern="*.wmlsc" /> + </mime-type> + + <mime-type type="text/vnd.wap.wmlscript"> + <_comment>WML Script</_comment> + <glob pattern="*.wmls" /> + </mime-type> + + <mime-type type="application/x-bzip"> + <alias type="application/x-bzip2" /> + </mime-type> + + <mime-type type="application/x-bzip-compressed-tar"> + <glob 
pattern="*.tbz" /> + <glob pattern="*.tbz2" /> + </mime-type> + + <mime-type type="application/x-cdlink"> + <_comment>Virtual CD-ROM CD Image File</_comment> + <glob pattern="*.vcd" /> + </mime-type> + + <mime-type type="application/x-director"> + <_comment>Shockwave Movie</_comment> + <glob pattern="*.dcr" /> + <glob pattern="*.dir" /> + <glob pattern="*.dxr" /> + </mime-type> + + <mime-type type="application/x-futuresplash"> + <_comment>Macromedia FutureSplash File</_comment> + <glob pattern="*.spl" /> + </mime-type> + + <mime-type type="application/x-java"> + <alias type="application/java" /> + </mime-type> + + <mime-type type="application/x-koan"> + <_comment>SSEYO Koan File</_comment> + <glob pattern="*.skp" /> + <glob pattern="*.skd" /> + <glob pattern="*.skt" /> + <glob pattern="*.skm" /> + </mime-type> + + <mime-type type="application/x-latex"> + <_comment>LaTeX Source Document</_comment> + <glob pattern="*.latex" /> + </mime-type> + + <!-- JC CHANGED + <mime-type type="application/x-mif"> + <_comment>FrameMaker MIF document</_comment> + <glob pattern="*.mif"/> + </mime-type> --> + + <mime-type type="application/ogg"> + <alias type="application/x-ogg" /> + </mime-type> + + <mime-type type="application/x-rar"> + <alias type="application/x-rar-compressed" /> + </mime-type> + + <mime-type type="application/x-shellscript"> + <alias type="application/x-sh" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <glob pattern="*.xht" /> + </mime-type> + + <mime-type type="audio/midi"> + <glob pattern="*.kar" /> + </mime-type> + + <mime-type type="audio/x-pn-realaudio"> + <alias type="audio/x-realaudio" /> + </mime-type> + + <mime-type type="image/tiff"> + <magic priority="50"> + <match value="0x4d4d2a00" type="string" offset="0" /> + <match value="0x49492a00" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="message/rfc822"> + <magic priority="50"> + <match type="string" value="Relay-Version:" offset="0" /> + <match type="string" value="#! rnews" offset="0" /> + <match type="string" value="N#! 
rnews" offset="0" /> + <match type="string" value="Forward to" offset="0" /> + <match type="string" value="Pipe to" offset="0" /> + <match type="string" value="Return-Path:" offset="0" /> + <match type="string" value="From:" offset="0" /> + <match type="string" value="Message-ID:" offset="0" /> + <match type="string" value="Date:" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-javascript"> + <glob pattern="*.js" /> + </mime-type> + + + <mime-type type="image/vnd.wap.wbmp"> + <_comment>Wireless Bitmap File Format</_comment> + <glob pattern="*.wbmp" /> + </mime-type> + + <mime-type type="image/x-psd"> + <alias type="image/photoshop" /> + </mime-type> + + <mime-type type="image/x-xcf"> + <alias type="image/xcf" /> + <magic priority="50"> + <match type="string" value="gimp xcf " offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-shockwave-flash"> + <glob pattern="*.swf"/> + <magic priority="50"> + <match type="string" value="FWS" offset="0"/> + <match type="string" value="CWS" offset="0"/> + </magic> + </mime-type> + + <mime-type type="model/iges"> + <_comment> + Initial Graphics Exchange Specification Format + </_comment> + <glob pattern="*.igs" /> + <glob pattern="*.iges" /> + </mime-type> + + <mime-type type="model/mesh"> + <glob pattern="*.msh" /> + <glob pattern="*.mesh" /> + <glob pattern="*.silo" /> + </mime-type> + + <mime-type type="model/vrml"> + <glob pattern="*.vrml" /> + </mime-type> + + <mime-type type="text/x-tcl"> + <alias type="application/x-tcl" /> + </mime-type> + + <mime-type type="text/x-tex"> + <alias type="application/x-tex" /> + </mime-type> + + <mime-type type="text/x-texinfo"> + <alias type="application/x-texinfo" /> + </mime-type> + + <mime-type type="text/x-troff-me"> + <alias type="application/x-troff-me" /> + </mime-type> + + <mime-type type="video/vnd.mpegurl"> + <glob pattern="*.mxu" /> + </mime-type> + + <mime-type type="x-conference/x-cooltalk"> + <_comment>Cooltalk Audio</_comment> + <glob pattern="*.ice" /> + </mime-type> + +</mime-info> Added: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-05-14 00:20:24 
UTC (rev 2265) @@ -0,0 +1,273 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.util.Iterator; +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; + +import org.archive.io.arc.ARCConstants; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; + +import org.apache.commons.httpclient.Header; + + +/** + * <p> + * Reader of both ARC and WARC format archive files. This is not a + * general-purpose archive file reader, but is written specifically + * for NutchWAX. It's possible that this could become a + * general-purpose archive file reader, but for now, consider it + * custom-tailored to the needs of NutchWAX. + * </p> + * <p> + * <code>ArcReader</code> is a wrapper around the underlying + * <code>ArchiveReader</code> implementation + * (<code>ARCReader</code>/<code>WARCReader</code>) which converts + * <code>WARCRecord</code>s to <code>ARCRecord</code>s on the fly. + * </p> + * <p> + * If an <code>ARCReader</code> is being wrapped, then the + * underlying <code>ARCRecord</code>s are read and passed-through + * unmolested. + * </p> + * <p> + * If a <code>WARCReader</code> is being wrapped, then the + * <code>WARCRecord</code>s are converted to <code>ARCRecord</code>s + * on the fly. + * </p> + * <p> + * <strong>WARNING:</strong> We only convert WARC + * <code>response</code> records. All other WARC record types are + * returned as <code>null</code> by the iterator's + * <code>next()</code> method. So, when using the iterator, don't + * forget to check for a <code>null</code> value returned by + * <code>next()</code>. + * </p> + */ +public class ArcReader implements Iterable<ARCRecord> +{ + private ArchiveReader reader; + + /** + * Construct an <code>ArcReader<code> wrapping an + * <code>ArchiveReader</code> instance. + * + * @param reader the ArchiveReader instance to wrap + */ + public ArcReader( ArchiveReader reader ) + { + this.reader = reader; + } + + /** + * Returns an iterator over <code>ARCRecord</code>s in the wrapped + * <code>ArchiveReader</code>, converting <code>WARCRecords</code> + * to <code>ARCRecords</code> on-the-fly. 
+ * + * @return an iterator + */ + public Iterator<ARCRecord> iterator( ) + { + return new ArcIterator( ); + } + + /** + * + */ + private class ArcIterator implements Iterator<ARCRecord> + { + private Iterator<ArchiveRecord> i; + + /** + * Construct an <code>ArcIterator</code>, skipping the header + * record if the wrapped reader is an <code>ARCReader</code>. + */ + public ArcIterator( ) + { + this.i = ArcReader.this.reader.iterator( ); + + if ( ArcReader.this.reader instanceof ARCReader ) + { + // Skip the first record, which is a "filedesc://" + // record describing the ARC file. + if ( this.i.hasNext( ) ) this.i.next( ); + } + } + + /** + * Returns <code>true</code> if the iteration has more elements. + * Will return <code>true</code> even if the next call to + * <code>next()</code> returns <code>null</code>. + * + * @return <code>true</code> if the iterator has more elements. + */ + public boolean hasNext( ) + { + return this.i.hasNext( ); + } + + /** + * Returns the next element in the iteration. Calling this method + * repeatedly until the <code>hasNext()</code> method returns + * <code>false</code> will return each element in the underlying + * collection exactly once. + * + * @return the next element in the iteration, which can be <code>null</code> + */ + public ARCRecord next( ) + { + try + { + ArchiveRecord record = this.i.next( ); + + if ( record instanceof ARCRecord ) + { + // Just return the ARCRecord as-is. + ARCRecord arc = (ARCRecord) record; + + return arc; + } + + if ( record instanceof WARCRecord ) + { + WARCRecord warc = (WARCRecord) record; + + ARCRecord arc = convert( warc ); + + return arc; + } + + // If we get here then the record we read in was neither an ARC + // nor a WARC record. What is a good exception to throw? + throw new RuntimeException( "Record neither ARC nor WARC: " + record.getClass( ) ); + } + catch ( IOException ioe ) + { + throw new RuntimeException( ioe ); + } + } + + /** + * Unsupported optional operation. + * + * @throws UnsupportedOperationException + */ + public void remove( ) + { + throw new UnsupportedOperationException( ); + } + + /** + * Convert a WARCRecord to an ARCRecord. Only "response" + * WARCRecords are converted to meaningful ARCRecords. All other + * WARCRecord types are converted to <code>null</code>. + * + * @param warc the WARCRecord to convert + * @return the corresponding ARCRecord, or <code>null</code> if the WARCRecord is not a "response" record + */ + private ARCRecord convert( WARCRecord warc ) + throws IOException + { + ArchiveRecordHeader header = warc.getHeader( ); + + // We only care about "response" WARC records. + if ( ! WARCConstants.RESPONSE.equals( header.getHeaderValue( WARCConstants.HEADER_KEY_TYPE ) ) ) + { + return null; + } + + // Construct an ARCRecordMetaData object based on the info in + // the ArchiveRecordHeader.
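+ // Synthesizing ARC-style metadata here is what lets the rest of + // NutchWAX handle ARC and WARC records through the single + // ARCRecord API.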
+ Map arcMetadataFields = new HashMap( ); + arcMetadataFields.put( ARCConstants.URL_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_URI ) ); + arcMetadataFields.put( ARCConstants.IP_HEADER_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_IP ) ); + arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ) ); + arcMetadataFields.put( ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue( null ) ); // We don't know the MIME type of the *payload* in a WARC (yet) + arcMetadataFields.put( ARCConstants.LENGTH_FIELD_KEY, header.getHeaderValue( WARCConstants.CONTENT_LENGTH ) ); + arcMetadataFields.put( ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + arcMetadataFields.put( ARCConstants.ABSOLUTE_OFFSET_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + + ARCRecordMetaData metadata = new ARCRecordMetaData( header.getReaderIdentifier( ), arcMetadataFields ); + + // Then, create an ARCRecord using the WARCRecord and the + // ARCRecordMetaData object we just created. + ARCRecord arc = new ARCRecord( warc, + metadata, + 0, // offset + ArcReader.this.reader.isDigest( ), + ArcReader.this.reader.isStrict( ), + true // parse HTTP headers + ); + + // Now that we've created the ARCRecord, we get the HTTP headers + // from it. From these HTTP headers, we obtain the Content-Type + // of the ARCRecord's payload, then set value as the MIME-type + // of the ARCRecord itself. + + // If the response is something other than HTTP + // (like DNS) there are no HTTP headers. + if ( arc.getHttpHeaders( ) != null ) + { + for ( Header h : arc.getHttpHeaders( ) ) + { + if ( h.getName( ).equals( "Content-Type" ) ) + { + arc.getMetaData( ).getHeaderFields( ).put( ARCConstants.MIMETYPE_FIELD_KEY, h.getValue( ) ); + } + } + } + + return arc; + } + + } + + /** + * Simple test/debug driver to read an archive file and print out + * the header for each record. + */ + public static void main( String args[] ) throws Exception + { + if ( args.length != 1 ) + { + System.out.println( "ReaderTest <(w)arc file>" ); + System.exit( 1 ); + } + + String arcName = args[0]; + + ArchiveReader r = ArchiveReaderFactory.get( arcName ); + + ArcReader reader = new ArcReader( r ); + + for ( ARCRecord rec : reader ) + { + if ( rec != null ) System.out.println( rec.getHeader( ) ); + } + } +} Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,553 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Map.Entry; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.fetcher.FetcherOutputFormat; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.apache.nutch.util.StringUtil; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; + + +/** + * Convert Archive (.arc/.warc) files to a Nutch segment. This + * is sometimes called "importing", other times "converting"; the terms + * are equivalent. + * + * <code>ArcsToSegment</code> is coded as a Hadoop job and is intended + * to be run within the Hadoop framework, or at least started by the + * Hadoop launcher incorporated into Nutch. Although there is a + * <code>main</code> driver, the Nutch launcher script is strongly + * recommended. + * + * This class was initially adapted from the Nutch + * <code>Fetcher</code> class. The premise is that since the Nutch + * fetching process acquires external content and places it in a Nutch + * segment, we can perform a similar activity by taking content from + * the ARC files and placing that content in a Nutch segment in a + * similar fashion. Ideally, once <code>ArcsToSegment</code> is + * used to import a set of ARCs into a Nutch segment, the resulting + * segment should be more-or-less the same as one created by Nutch's + * own Fetcher. + * + * Since we are mimicking the Nutch Fetcher, we have to be careful + * about some implementation details that might not seem relevant + * to the importing of ARC files. I've noted those details with + * comments prefaced with "?:".
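+ * + * For orientation: the 'nutchwax' helper script added in this commit + * runs this class via the Nutch launcher, so "bin/nutchwax import manifest [segment]" + * expands to roughly + * + * bin/nutch org.archive.nutchwax.ArcsToSegment manifest segment + * + * with the segment name defaulting to segments/[date-timestamp].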
+ */ +public class ArcsToSegment extends Configured implements Tool, Mapper +{ + + public static final Log LOG = LogFactory.getLog( ArcsToSegment.class ); + + private JobConf jobConf; + private URLFilters urlFilters; + private ScoringFilters scfilters; + private ParseUtil parseUtil; + private URLNormalizers normalizers; + private int interval; + + private long numSkipped; + private long numImported; + private long bytesSkipped; + private long bytesImported; + + /** + * ?: Is this necessary? + */ + public ArcsToSegment() + { + + } + + /** + * <p>Constructor that sets the job configuration.</p> + * + * @param conf + */ + public ArcsToSegment( Configuration conf ) + { + setConf( conf ); + } + + /** + * <p>Configures the job. Sets the url filters, scoring filters, url normalizers + * and other relevant data.</p> + * + * @param job The job configuration. + */ + public void configure( JobConf job ) + { + // set the url filters, scoring filters, the parse util and the url + // normalizers + this.jobConf = job; + this.urlFilters = new URLFilters ( jobConf ); + this.scfilters = new ScoringFilters( jobConf ); + this.parseUtil = new ParseUtil ( jobConf ); + this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); + this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); + } + + /** + * In Mapper interface. + * {@inheritDoc} + */ + public void close() + { + + } + + /** + * <p>Runs the Map job to translate an arc file into output for Nutch + * segments.</p> + * + * @param key Line number in manifest corresponding to the <code>value</code> + * @param value A line from the manifest + * @param output The output collector. + * @param reporter The progress reporter. + */ + public void map( WritableComparable key, + Writable value, + OutputCollector output, + Reporter reporter ) + throws IOException + { + String arcUrl = ""; + String collection = ""; + String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY ); + + // Each line of the manifest is "<url> <collection>" where <collection> is optional + String[] line = value.toString().split( " " ); + arcUrl = line[0]; + + if ( line.length > 1 ) + { + collection = line[1]; + } + + if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); + + ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); + + ArcReader reader = new ArcReader( r ); + + try + { + for ( ARCRecord record : reader ) + { + // When reading WARC files, records of type other than + // "response" are returned as 'null' by the Iterator, so + // we skip them. + if ( record == null ) continue ; + + importRecord( record, segmentName, collection, output ); + + // FIXME: What does this do exactly? + reporter.progress(); + } + } + finally + { + r.close(); + + if ( LOG.isInfoEnabled() ) + { + LOG.info( "Completed ARC: " + arcUrl ); + LOG.info( "URLs skipped : " + this.numSkipped ); + LOG.info( "URLs imported: " + this.numImported ); + LOG.info( "URLs total : " + ( this.numSkipped + this.numImported ) ); + } + } + + } + + /** + * Import an ARCRecord. + * + * @param record + * @param segmentName + * @param collectionName + * @param output + * @return whether record was imported or not (i.e. filtered out due to URL filtering rules, etc.) + */ + private boolean importRecord( ARCRecord record, String segmentName, String collectionName, OutputCollector output ) + { + ARCRecordMetaData meta = record.getMetaData(); + + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); + + /* ?: On second thought, DON'T do this.
Even if we don't have a + parser registered for a content-type, we still want to index + its URL and possibly other meta-data. + */ + /* + // First, check to see if we have a parser registered for the + // URL's Content-Type, so we don't read in some huge video file + // only to discover we don't have a parser for it. + if ( ! this.hasRegisteredParser( meta.getMimetype() ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "No parser registered for: " + meta.getMimetype() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false ; + } + */ + + // ?: Arguably, we shouldn't be normalizing or filtering based + // on the URL. If the document made it into the (W)ARC file, then + // it should be indexed. But then again, the normalizers and + // filters can be disabled in the Nutch configuration files. + String url = this.normalizeAndFilterUrl( meta.getUrl() ); + + if ( url == null ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false; + } + + // URL is good, let's import the content. + if ( LOG.isInfoEnabled() ) LOG.info( "Import URL: " + meta.getUrl() ); + this.numImported++; + this.bytesImported += meta.getLength(); + + try + { + ... [truncated message content]
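For anyone who wants to exercise the ArcReader wrapper shown above
outside of Hadoop, here is a minimal sketch of a standalone driver.
It only uses classes that appear in this revision (ArcReader, plus the
ArchiveReaderFactory it already relies on); the archive path and class
name are illustrative.

    import org.archive.io.ArchiveReader;
    import org.archive.io.ArchiveReaderFactory;
    import org.archive.io.arc.ARCRecord;
    import org.archive.nutchwax.ArcReader;

    public class ListRecords
    {
      public static void main( String[] args ) throws Exception
      {
        // The factory picks the right underlying reader for .arc or .warc input.
        ArchiveReader r = ArchiveReaderFactory.get( "/tmp/foo-bar-baz.arc" );

        // Wrap it so WARC "response" records come back as ARCRecords.
        ArcReader reader = new ArcReader( r );

        for ( ARCRecord rec : reader )
        {
          // Non-"response" WARC records are returned as null -- skip them.
          if ( rec == null ) continue;

          System.out.println( rec.getMetaData().getUrl() + " " + rec.getMetaData().getMimetype() );
        }
      }
    }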