From: <bi...@us...> - 2008-05-14 00:20:16
Revision: 2265 http://archive-access.svn.sourceforge.net/archive-access/?rev=2265&view=rev Author: binzino Date: 2008-05-13 17:20:24 -0700 (Tue, 13 May 2008) Log Message: ----------- Initial checkin of NutchWAX 0.12, a.k.a Nutch Archive Tools (NAT). Added Paths: ----------- trunk/archive-access/projects/nat/ trunk/archive-access/projects/nat/archive/ trunk/archive-access/projects/nat/archive/INSTALL.txt trunk/archive-access/projects/nat/archive/README.txt trunk/archive-access/projects/nat/archive/bin/ trunk/archive-access/projects/nat/archive/bin/nutchwax trunk/archive-access/projects/nat/archive/build.xml trunk/archive-access/projects/nat/archive/conf/ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml trunk/archive-access/projects/nat/archive/conf/search-servers.txt trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml trunk/archive-access/projects/nat/archive/lib/ trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar trunk/archive-access/projects/nat/archive/src/ trunk/archive-access/projects/nat/archive/src/java/ trunk/archive-access/projects/nat/archive/src/java/org/ trunk/archive-access/projects/nat/archive/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/NutchWax.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java trunk/archive-access/projects/nat/archive/src/plugin/ trunk/archive-access/projects/nat/archive/src/plugin/build-plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/ 
trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java Added: trunk/archive-access/projects/nat/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nat/archive/INSTALL.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/INSTALL.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,236 @@ + +INSTALL.txt +2008-05-06 +Aaron Binns + + +NutchWAX 0.12 is built and installed as an "add-on" to an +existing Nutch 1.0-dev installation. + +NutchWAX 0.12 uses a simple 'ant' build script. The script compiles +the NutchWAX sources, using the libraries in the installed +Nutch-1.0-dev. + +We strongly recommend having *two* Nutch-1.0-dev installation +directories: one that you build NutchWAX against, and another into +which NutchWAX is deployed. + +NutchWAX is deployed by un-tar'ing the nutchwax-0.12.tar.gz file +*into* an existing Nutch-1.0-dev installation. Think of NutchWAX as +an add-on. We over-write a few Nutch config files, but the rest is +simply added to the existing Nutch-1.0-dev installation. + + +Nutch-1.0-dev +------------- + +As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. +Nutch doesn't have a 1.0 release package yet, so we have to use the +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12 is +built against is: + + 650739 + +To check out this revision of Nutch, use: + + $ mkdir nutch + $ cd nutch + $ svn checkout -r 650739 http://svn.apache.org/repos/asf/lucene/nutch/trunk + +To build the nutch-1.0-dev.tar.gz package, use 'ant' + + $ cd trunk + $ ant tar + +This produces + + build/nutch-1.0-dev.tar.gz + +which we then install *twice* + + $ mkdir -p ~/nutchwax-0.12/nutch-1.0-dev + $ tar xzf build/nutch-1.0-dev.tar.gz -C ~/nutchwax-0.12/nutch-1.0-dev + $ mkdir -p /opt/nutch-1.0-dev + $ tar xzf build/nutch-1.0-dev.tar.gz -C /opt/nutch-1.0-dev + +The idea is that we keep /opt/nutch-1.0-dev as our pristine copy which +we compile against; then, when we want to test NutchWAX, we deploy it +into ~/nutchwax-0.12/nutch-1.0-dev. + +Why can't we just use one installation of Nutch? Mainly to avoid +weirdness where we are compiling NutchWAX source against the same set +of libraries where we would be installing NutchWAX. Consider: when we +deploy NutchWAX, we copy the nutchwax.jar into the Nutch 'lib' +directory. If we use that same 'lib' directory for dependencies when +compiling the source, 'ant'/'javac' will likely get confused when +calculating dependencies. + +It's possible that you could successfully go through the +build/test/release cycle using one Nutch-1.0-dev directory, but these +instructions assume you will have two. + + +Build and install +----------------- + + 1. Install two Nutch-1.0-dev packages per the instructions above. + + 2. Edit build.xml to point to the "pristine" installation of Nutch-1.0-dev + + <!-- NOTE: Point this to your Nutch 1.0-dev directory --> + <property name="nutch.dir" value="/opt/nutch-1.0-dev" /> + + 3.
Build NutchWAX-0.12 + + $ ant + + The default build rule is "package" which will compile all the source + and build an installation tarball: nutchwax-0.12.tar.gz + + The "build.xml" file is pretty straightforward; grepping + for the targets (compile, clean, etc.) should make it easy to follow. + + 4. Install NutchWAX into the build/test Nutch installation + + $ tar xzf nutchwax-0.12.tar.gz -C ~/nutchwax-0.12/nutch-1.0-dev + +That's it! + +All we do is add our libraries (nutchwax.jar and dependencies), the +'nutchwax' helper script, plugins for indexing and querying, and a few +config files. + +Except for the config files, no files in the Nutch-1.0-dev +installation are over-written, only added. The "nutch-site.xml" file +is over-written, but that file is empty in a vanilla Nutch +installation, so there's little risk of over-writing something. + + +HOWTO run and test +------------------ + +The 'nutchwax' helper script is installed in the Nutch-1.0-dev 'bin' +directory next to the 'nutch' helper script. + +The 'nutchwax' script is used to run the NutchWAX-specific tools; use +the regular 'nutch' script for regular Nutch activities. + +The 'nutchwax' script runs two tools + + "import" Import a set of .arc/.warc files from a manifest, creating + a Nutch segment. + + "dumpindex" Debug tool that dumps a Lucene index, such as the ones + created by Nutch's "index" tool. + +The idea is that the NutchWAX "import" tool supplants the Nutch +generate and fetch cycle. Rather than generating and fetching +segments, we import the .arc/.warc files directly into a newly created +segment. Then, we process that segment just as we normally would with +Nutch. + +For example, + + $ cd nutch-test + $ cat > manifest + http://someserver/foo-bar-baz.arc mycollection + ^D + $ nutch-1.0-dev/bin/nutchwax import manifest + +This will import the ARC file listed in the manifest into a newly +created segment. The segment is created by default in a directory +hierarchy of the form: + + segments/[date-timestamp] + +This mirrors the way segments are created in vanilla Nutch by the +"generate" command. + +You can explicitly name the segment if you want, e.g. + + nutchwax import manifest mysegment + +Once the segment is created by importing ARC files with +NutchWAX, you can use Nutch to perform the rest of the steps. For +example: + + $ nutch-1.0-dev/bin/nutchwax import manifest + $ nutch-1.0-dev/bin/nutch updatedb crawldb -dir segments + $ nutch-1.0-dev/bin/nutch invertlinks linkdb -dir segments + $ nutch-1.0-dev/bin/nutch index indexes crawldb linkdb segments/* + $ nutch-1.0-dev/bin/nutch merge index indexes + +This is pretty much the minimal set of steps to import and index a set +of ARC files. The crawldb update and link inversion steps are pro +forma and don't have anything to do with NutchWAX specifically, but +are a part of regular Nutch processing. + +Now you have a Nutch "index" directory and are ready to search! + +Searching is done as in vanilla Nutch. Either launch the Nutch webapp +or use the command-line interface to NutchBean to run some test +searches. Nothing NutchWAX-specific here. + + +Miscellaneous notes +------------------- + +1. Plugins + +There are two plugins bundled with NutchWAX: + + index-nutchwax + query-nutchwax + +See the "plugin.includes" property in nutch-site.xml to see where +these plugins are added to the filter chain.
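+ +For reference, the plugin list set in the bundled conf/nutch-site.xml +(the full file appears later in this change) is: + + protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic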
+ +The index-nutchwax plugin ensures that WAX-specific metadata is +transferred from the Nutch Content object to the Lucene Document +object, which is placed in the Lucene index. + +The query-nutchwax plugin is used to process query requests against +those same meta-data fields. It also expands the capabilities for +searching the basic Nutch fields. + +2. URL filters + +Nutch's URL filter by default filters out many common URL oddities +that would normally trip up Nutch's crawler. However, when importing +content from ARC files, filtering out content probably doesn't make +sense. That is, whatever content made it into the ARC file should be +imported, no matter what the URL looks like. + +To change the URL filter, edit the Nutch file 'conf/regex-urlfilter.txt'. +To pass all content through the filter, remove all filter rules except +for the last one: + + # accept anything else + +. + +3. conf/tika-mimetypes.xml + +NutchWAX comes with a fixed copy of tika-mimetypes.xml. The version +in Nutch revision 650739 has a few bugs in it which cause parsing to +fail for many document types. The bugs are: + + o Move the + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + definition higher up in the file, before the reference to it. + + o Remove + + <mime-type type="application/x-ms-dos-executable"> + <alias type="application/x-dosexec;exe" /> + </mime-type> + + as the ';' character is illegal according to the comments in the + Nutch code. + +The copy of "conf/tika-mimetypes.xml" bundled with NutchWAX fixes +these two bugs. Added: trunk/archive-access/projects/nat/archive/README.txt =================================================================== --- trunk/archive-access/projects/nat/archive/README.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/README.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,105 @@ + +README.txt +2008-05-06 +Aaron Binns + + +This is the NutchWAX-0.12 source that John Lee handed off to me. It +is a work in progress. + +Compared to NutchWAX-0.10 (and earlier) it is *much* simpler. The +main WAX-specific code is in just a few files really: + +src/java/org/archive/nutchwax/ArcsToSegment.java + + This is the meat of the WAX logic for processing .arc files and + generating Nutch segments. Once we use this to generate a set of + segments for the .arc files, we can use the rest of vanilla + Nutch-1.0-dev to invert links and index the content with Lucene. + + This conversion code is heavily edited from: + + nutch-1.0-dev/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java + + taken from the Nutch SVN head (a.k.a. the "1.0-dev" in development). + + Ours differs in a few important ways: + + o Rather than taking a directory with .arc files as input, we take + a manifest file with URLs to .arc files. This way, the manifest + is split up among the distributed Hadoop jobs and the .arc files + are processed in whole by each worker. + + In Nutch-1.0-dev, the ArcSegmentCreator.java expects the + input directory to contain the .arc files and (AFAICT) splits + them up and distributes them across the Hadoop workers. This + seems really inefficient to me; I think our approach is much + better -- at least for us. + + o Related to the way input files are split and processed, we use + the standard Archive ARCReader class just like Heritrix and + Wayback. + + The ArcSegmentCreator.java in Nutch-1.0-dev doesn't use our + ARCReader because of licensing incompatibility.
Ours is under + GPL and Nutch-1.0-dev forbids the use of GPL code. + + We are in the process of re-licensing or dual-licensing under the + Apache License, but until then, our ARCReader code won't be included + in mainline Nutch. + + This isn't a problem per se, but worth noting in case anyone + looks at the Nutch-1.0-dev code and wonders why they built their + own (horribly inefficient) .arc reader. + + o We add metadata fields to the processed document for WAX-specific + purposes: + + content.getMetadata().set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + content.getMetadata().set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ) ; + content.getMetadata().set( NutchWax.COLLECTION_KEY, collection); + content.getMetadata().set( NutchWax.ARCHIVE_DATE_KEY, meta.getDate() ); + + The addition of the arcname and collection keys is pretty + obvious. I don't know why the content-type isn't added in the + vanilla Nutch-1.0-dev. + + Also, we should review the use of the ARCHIVE_DATE_KEY in that + John Lee mentioned to me that there were possibly duplicate date + fields put in the index: one that is a plain old Java date, and + one that is a 14-digit date string for use with Wayback. + +src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java +src/plugin/index-nutchwax/plugin.xml + + This filter is pretty straightforward. All it does is take the + metadata fields that were added to the document (as described above) + and place them in the Lucene index so that we can make use of them at + search time. + +src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java +src/plugin/query-nutchwax/plugin.xml + + This is a single query-filter implementation that can be used for + querying individual fields. It does *not* add the ability to query + multiple fields at once, as you can already do that via Nutch. + + What this filter does is allow one to more-or-less create query + filters in a data-driven manner rather than having to code up a new + class for each field. That is, before, one would have to create a + CollectionQueryFilter class to filter on the "collection" field. + With this class, you can specify that the + "collection" field is to be filterable via the plugin.xml file and + the "nutchwax.filter.query" configuration property. + +src/java/org/archive/nutchwax/NutchWax.java + + Just a simple enum used by the above two classes for the metadata + keys. + +src/java/org/archive/nutchwax/tools/DumpIndex.java + + A simple command-line utility to dump the contents of a Lucene + index. Used for debugging. + + Added: trunk/archive-access/projects/nat/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nat/archive/bin/nutchwax (rev 0) +++ trunk/archive-access/projects/nat/archive/bin/nutchwax 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License.
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +# The following is cribbed from the 'nutch' script to ascertain the +# location of Nutch so we can call its scripts. +# +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +THIS_DIR=`dirname "$THIS"` +NUTCH_HOME=`cd "$THIS_DIR/.." ; pwd` + +# Now that we have NUTCH_HOME, process the command-line. + +case "$1" in + import) + shift + if [ $# -eq 0 ]; then + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment + exit 1 + fi + if [ -z "$2" ]; then + segment=`date +"%Y%m%d%H%M%S"` + segment="segments/${segment}" + else + segment="$2" + fi + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment "$1" "${segment}" + ;; + dumpindex) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpIndex $@ + ;; + *) + echo "" + echo "Usage: nutchwax COMMAND" + echo "where COMMAND is one of:" + echo " import Import ARCs into a new Nutch segment" + echo " dumpindex Dump an index to the screen" + echo "" + exit 1 + ;; +esac + +exit 0 Property changes on: trunk/archive-access/projects/nat/archive/bin/nutchwax ___________________________________________________________________ Name: svn:executable + * Added: trunk/archive-access/projects/nat/archive/build.xml =================================================================== --- trunk/archive-access/projects/nat/archive/build.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/build.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,138 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="nutchwax" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. + --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/archive/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/archive/conf"/> + <copy todir="${dist.dir}/contrib/archive/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/archive"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nat/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/nutch-site.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,65 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<!-- Put site-specific property overrides in this file. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> + <!-- Also, add 'parse-pdf' --> + <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value> +</property> + +<property> + <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by ArcsToSegment are mapped to the Lucene documents during indexing. + The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key", + where the only required part is the "src-key"; the rest assume the following defaults: + lowercase = true + store = true + tokenize = false + dest-key = src-key + For example, "arcname:false" indexes the arcname value without lowercasing it and uses the defaults for the rest. + --> + <name>nutchwax.filter.index</name> + <value> + arcname:false + collection + date + type + </value> +</property> + +<property> + <!-- Configure the 'query-nutchwax' plugin. Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not. + The specification format is "raw:name:lowercase:boost" or "field:name:boost". Default values are + lowercase = true + boost = 1.0f + There is no "lowercase" property for the "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter. + AFAICT, the order isn't important. --> + <!-- We do *not* use this filter for handling "date" queries; there is a specific filter for that: DateQueryFilter --> + <name>nutchwax.filter.query</name> + <value> + raw:arcname:false + raw:collection + raw:type + field:anchor + field:content + field:host + field:title + </value> +</property> + +<!-- Over-ride setting in Nutch "nutch-default.xml" file. We do *not* want Content-Type detection via magic resolution because the implementation + in Nutch reads in the entire content body (which could be a 1GB MPG movie), then converts it to a String before examining the first dozen or + so bytes/characters for magic matching. Since we archive large files, this is bad, and OOMs occur. So, we disable this feature and keep + the Content-Type that is already in the (W)ARC file. --> +<property> + <name>mime.type.magic</name> + <value>false</value> + <description>Defines if the mime content type detector uses magic resolution.
+ </description> +</property> + +</configuration> Added: trunk/archive-access/projects/nat/archive/conf/search-servers.txt =================================================================== --- trunk/archive-access/projects/nat/archive/conf/search-servers.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/search-servers.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1 @@ +localhost 9000 Added: trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,364 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Description: This xml file defines the valid mime types used by Tika. + The mime types within this file are based on the types in the mime-types.xml + file available in Apache Nutch. +--> + +<mime-info> + + <mime-type type="text/plain"> + <magic priority="50"> + <match value="This is TeX," type="string" offset="0" /> + <match value="This is METAFONT," type="string" offset="0" /> + </magic> + <glob pattern="*.txt" /> + <glob pattern="*.asc" /> + </mime-type> + + <mime-type type="text/html"> + <magic priority="50"> + <match value="<!DOCTYPE HTML" type="string" + offset="0:64" /> + <match value="<!doctype html" type="string" + offset="0:64" /> + <match value="<HEAD" type="string" offset="0:64" /> + <match value="<head" type="string" offset="0:64" /> + <match value="<TITLE" type="string" offset="0:64" /> + <match value="<title" type="string" offset="0:64" /> + <match value="<html" type="string" offset="0:64" /> + <match value="<HTML" type="string" offset="0:64" /> + <match value="<BODY" type="string" offset="0" /> + <match value="<body" type="string" offset="0" /> + <match value="<TITLE" type="string" offset="0" /> + <match value="<title" type="string" offset="0" /> + <match value="<!--" type="string" offset="0" /> + <match value="<h1" type="string" offset="0" /> + <match value="<H1" type="string" offset="0" /> + <match value="<!doctype HTML" type="string" offset="0" /> + <match value="<!DOCTYPE html" type="string" offset="0" /> + </magic> + <glob pattern="*.html" /> + <glob pattern="*.htm" /> + </mime-type> + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <sub-class-of type="text/xml" /> + <glob pattern="*.xhtml" /> + <root-XML namespaceURI='http://www.w3.org/1999/xhtml' + localName='html' /> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint"> + <glob pattern="*.ppz" /> + <glob pattern="*.ppt" /> + <glob pattern="*.pps" /> + <glob 
pattern="*.pot" /> + <magic priority="50"> + <match value="0xcfd0e011" type="little32" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/vnd.ms-excel"> + <magic priority="50"> + <match value="Microsoft Excel 5.0 Worksheet" type="string" + offset="2080" /> + </magic> + <glob pattern="*.xls" /> + <glob pattern="*.xlc" /> + <glob pattern="*.xll" /> + <glob pattern="*.xlm" /> + <glob pattern="*.xlw" /> + <glob pattern="*.xla" /> + <glob pattern="*.xlt" /> + <glob pattern="*.xld" /> + <alias type="application/msexcel" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.odt" /> + </mime-type> + + + <mime-type type="application/zip"> + <alias type="application/x-zip-compressed" /> + <magic priority="40"> + <match value="PK\003\004" type="string" offset="0" /> + </magic> + <glob pattern="*.zip" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.oth" /> + </mime-type> + + <mime-type type="application/msword"> + <magic priority="50"> + <match value="\x31\xbe\x00\x00" type="string" offset="0" /> + <match value="PO^Q`" type="string" offset="0" /> + <match value="\376\067\0\043" type="string" offset="0" /> + <match value="\333\245-\0\0\0" type="string" offset="0" /> + <match value="Microsoft Word 6.0 Document" type="string" + offset="2080" /> + <match value="Microsoft Word document data" type="string" + offset="2112" /> + </magic> + <glob pattern="*.doc" /> + <alias type="application/vnd.ms-word" /> + </mime-type> + + <mime-type type="application/octet-stream"> + <magic priority="50"> + <match value="\037\036" type="string" offset="0" /> + <match value="017437" type="host16" offset="0" /> + <match value="0x1fff" type="host16" offset="0" /> + <match value="\377\037" type="string" offset="0" /> + <match value="0145405" type="host16" offset="0" /> + </magic> + <glob pattern="*.bin" /> + </mime-type> + + <mime-type type="application/pdf"> + <magic priority="50"> + <match value="%PDF-" type="string" offset="0" /> + </magic> + <glob pattern="*.pdf" /> + <alias type="application/x-pdf" /> + </mime-type> + + <mime-type type="application/atom+xml"> + <root-XML localName="feed" + namespaceURI="http://purl.org/atom/ns#" /> + </mime-type> + + <mime-type type="application/mac-binhex40"> + <glob pattern="*.hqx" /> + </mime-type> + + <mime-type type="application/mac-compactpro"> + <glob pattern="*.cpt" /> + </mime-type> + + <mime-type type="application/rtf"> + <glob pattern="*.rtf"/> + <alias type="text/rtf" /> + </mime-type> + + <mime-type type="application/rss+xml"> + <alias type="text/rss" /> + <root-XML localName="rss" /> + <root-XML namespaceURI="http://purl.org/rss/1.0/" /> + <glob pattern="*.rss" /> + </mime-type> + + <!-- added in by mattmann --> + <mime-type type="application/x-mif"> + <alias type="application/vnd.mif" /> + </mime-type> + + <mime-type type="application/vnd.wap.wbxml"> + <glob pattern="*.wbxml" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlc"> + <_comment>Compiled WML Document</_comment> + <glob pattern="*.wmlc" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlscriptc"> + <_comment>Compiled WML Script</_comment> + <glob pattern="*.wmlsc" /> + </mime-type> + + <mime-type type="text/vnd.wap.wmlscript"> + <_comment>WML Script</_comment> + <glob pattern="*.wmls" /> + </mime-type> + + <mime-type type="application/x-bzip"> + <alias type="application/x-bzip2" /> + </mime-type> + + <mime-type type="application/x-bzip-compressed-tar"> + <glob 
pattern="*.tbz" /> + <glob pattern="*.tbz2" /> + </mime-type> + + <mime-type type="application/x-cdlink"> + <_comment>Virtual CD-ROM CD Image File</_comment> + <glob pattern="*.vcd" /> + </mime-type> + + <mime-type type="application/x-director"> + <_comment>Shockwave Movie</_comment> + <glob pattern="*.dcr" /> + <glob pattern="*.dir" /> + <glob pattern="*.dxr" /> + </mime-type> + + <mime-type type="application/x-futuresplash"> + <_comment>Macromedia FutureSplash File</_comment> + <glob pattern="*.spl" /> + </mime-type> + + <mime-type type="application/x-java"> + <alias type="application/java" /> + </mime-type> + + <mime-type type="application/x-koan"> + <_comment>SSEYO Koan File</_comment> + <glob pattern="*.skp" /> + <glob pattern="*.skd" /> + <glob pattern="*.skt" /> + <glob pattern="*.skm" /> + </mime-type> + + <mime-type type="application/x-latex"> + <_comment>LaTeX Source Document</_comment> + <glob pattern="*.latex" /> + </mime-type> + + <!-- JC CHANGED + <mime-type type="application/x-mif"> + <_comment>FrameMaker MIF document</_comment> + <glob pattern="*.mif"/> + </mime-type> --> + + <mime-type type="application/ogg"> + <alias type="application/x-ogg" /> + </mime-type> + + <mime-type type="application/x-rar"> + <alias type="application/x-rar-compressed" /> + </mime-type> + + <mime-type type="application/x-shellscript"> + <alias type="application/x-sh" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <glob pattern="*.xht" /> + </mime-type> + + <mime-type type="audio/midi"> + <glob pattern="*.kar" /> + </mime-type> + + <mime-type type="audio/x-pn-realaudio"> + <alias type="audio/x-realaudio" /> + </mime-type> + + <mime-type type="image/tiff"> + <magic priority="50"> + <match value="0x4d4d2a00" type="string" offset="0" /> + <match value="0x49492a00" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="message/rfc822"> + <magic priority="50"> + <match type="string" value="Relay-Version:" offset="0" /> + <match type="string" value="#! rnews" offset="0" /> + <match type="string" value="N#! 
rnews" offset="0" /> + <match type="string" value="Forward to" offset="0" /> + <match type="string" value="Pipe to" offset="0" /> + <match type="string" value="Return-Path:" offset="0" /> + <match type="string" value="From:" offset="0" /> + <match type="string" value="Message-ID:" offset="0" /> + <match type="string" value="Date:" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-javascript"> + <glob pattern="*.js" /> + </mime-type> + + + <mime-type type="image/vnd.wap.wbmp"> + <_comment>Wireless Bitmap File Format</_comment> + <glob pattern="*.wbmp" /> + </mime-type> + + <mime-type type="image/x-psd"> + <alias type="image/photoshop" /> + </mime-type> + + <mime-type type="image/x-xcf"> + <alias type="image/xcf" /> + <magic priority="50"> + <match type="string" value="gimp xcf " offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-shockwave-flash"> + <glob pattern="*.swf"/> + <magic priority="50"> + <match type="string" value="FWS" offset="0"/> + <match type="string" value="CWS" offset="0"/> + </magic> + </mime-type> + + <mime-type type="model/iges"> + <_comment> + Initial Graphics Exchange Specification Format + </_comment> + <glob pattern="*.igs" /> + <glob pattern="*.iges" /> + </mime-type> + + <mime-type type="model/mesh"> + <glob pattern="*.msh" /> + <glob pattern="*.mesh" /> + <glob pattern="*.silo" /> + </mime-type> + + <mime-type type="model/vrml"> + <glob pattern="*.vrml" /> + </mime-type> + + <mime-type type="text/x-tcl"> + <alias type="application/x-tcl" /> + </mime-type> + + <mime-type type="text/x-tex"> + <alias type="application/x-tex" /> + </mime-type> + + <mime-type type="text/x-texinfo"> + <alias type="application/x-texinfo" /> + </mime-type> + + <mime-type type="text/x-troff-me"> + <alias type="application/x-troff-me" /> + </mime-type> + + <mime-type type="video/vnd.mpegurl"> + <glob pattern="*.mxu" /> + </mime-type> + + <mime-type type="x-conference/x-cooltalk"> + <_comment>Cooltalk Audio</_comment> + <glob pattern="*.ice" /> + </mime-type> + +</mime-info> Added: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-05-14 00:20:24 
UTC (rev 2265) @@ -0,0 +1,273 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.util.Iterator; +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; + +import org.archive.io.arc.ARCConstants; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; + +import org.apache.commons.httpclient.Header; + + +/** + * <p> + * Reader of both ARC and WARC format archive files. This is not a + * general-purpose archive file reader, but is written specifically + * for NutchWAX. It's possible that this could become a + * general-purpose archive file reader, but for now, consider it + * custom-tailored to the needs of NutchWAX. + * </p> + * <p> + * <code>ArcReader</code> is a wrapper around the underlying + * <code>ArchiveReader</code> implementation + * (<code>ARCReader</code>/<code>WARCReader</code>) which converts + * <code>WARCRecord</code>s to <code>ARCRecord</code>s on the fly. + * </p> + * <p> + * If an <code>ARCReader</code> is being wrapped, then the + * underlying <code>ARCRecord</code>s are read and passed-through + * unmolested. + * </p> + * <p> + * If a <code>WARCReader</code> is being wrapped, then the + * <code>WARCRecord</code>s are converted to <code>ARCRecord</code>s + * on the fly. + * </p> + * <p> + * <strong>WARNING:</strong> We only convert WARC + * <code>response</code> records. All other WARC record types are + * returned as <code>null</code> by the iterator's + * <code>next()</code> method. So, when using the iterator, don't + * forget to check for a <code>null</code> value returned by + * <code>next()</code>. + * </p> + */ +public class ArcReader implements Iterable<ARCRecord> +{ + private ArchiveReader reader; + + /** + * Construct an <code>ArcReader<code> wrapping an + * <code>ArchiveReader</code> instance. + * + * @param reader the ArchiveReader instance to wrap + */ + public ArcReader( ArchiveReader reader ) + { + this.reader = reader; + } + + /** + * Returns an iterator over <code>ARCRecord</code>s in the wrapped + * <code>ArchiveReader</code>, converting <code>WARCRecords</code> + * to <code>ARCRecords</code> on-the-fly. 
+ * + * @return an iterator + */ + public Iterator<ARCRecord> iterator( ) + { + return new ArcIterator( ); + } + + /** + * + */ + private class ArcIterator implements Iterator<ARCRecord> + { + private Iterator<ArchiveRecord> i; + + /** + * Construct an <code>ArcIterator</code>, skipping the header + * record if the wrapped reader is an <code>ARCReader</code>. + */ + public ArcIterator( ) + { + this.i = ArcReader.this.reader.iterator( ); + + if ( ArcReader.this.reader instanceof ARCReader ) + { + // Skip the first record, which is a "filedesc://" + // record describing the ARC file. + if ( this.i.hasNext( ) ) this.i.next( ); + } + } + + /** + * Returns <code>true</code> if the iteration has more elements. + * Will return <code>true</code> even if the next call to + * <code>next()</code> returns <code>null</code>. + * + * @return <code>true</code> if the iterator has more elements. + */ + public boolean hasNext( ) + { + return this.i.hasNext( ); + } + + /** + * Returns the next element in the iteration. Calling this method + * repeatedly until the <code>hasNext()</code> method returns + * <code>false</code> will return each element in the underlying + * collection exactly once. + * + * @return the next element in the iteration, which can be <code>null</code> + */ + public ARCRecord next( ) + { + try + { + ArchiveRecord record = this.i.next( ); + + if ( record instanceof ARCRecord ) + { + // Just return the ARCRecord as-is. + ARCRecord arc = (ARCRecord) record; + + return arc; + } + + if ( record instanceof WARCRecord ) + { + WARCRecord warc = (WARCRecord) record; + + ARCRecord arc = convert( warc ); + + return arc; + } + + // If we get here then the record we read in was neither an ARC + // nor a WARC record. What is a good exception to throw? + throw new RuntimeException( "Record neither ARC nor WARC: " + record.getClass( ) ); + } + catch ( IOException ioe ) + { + throw new RuntimeException( ioe ); + } + } + + /** + * Unsupported optional operation. + * + * @throws UnsupportedOperationException + */ + public void remove( ) + { + throw new UnsupportedOperationException( ); + } + + /** + * Convert a WARCRecord to an ARCRecord. Only "response" + * WARCRecords are converted to meaningful ARCRecords. All other + * WARCRecord types are converted to <code>null</code>. + * + * @param warc the WARCRecord to convert + * @return the corresponding ARCRecord, or <code>null</code> if the WARCRecord is not a "response" record + */ + private ARCRecord convert( WARCRecord warc ) + throws IOException + { + ArchiveRecordHeader header = warc.getHeader( ); + + // We only care about "response" WARC records. + if ( ! WARCConstants.RESPONSE.equals( header.getHeaderValue( WARCConstants.HEADER_KEY_TYPE ) ) ) + { + return null; + } + + // Construct an ARCRecordMetaData object based on the info in + // the ArchiveRecordHeader.
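+ // Synthesizing ARC-style metadata here is what lets the rest of + // NutchWAX handle ARC and WARC records through the single + // ARCRecord API.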
+ Map arcMetadataFields = new HashMap( ); + arcMetadataFields.put( ARCConstants.URL_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_URI ) ); + arcMetadataFields.put( ARCConstants.IP_HEADER_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_IP ) ); + arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ) ); + arcMetadataFields.put( ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue( null ) ); // We don't know the MIME type of the *payload* in a WARC (yet) + arcMetadataFields.put( ARCConstants.LENGTH_FIELD_KEY, header.getHeaderValue( WARCConstants.CONTENT_LENGTH ) ); + arcMetadataFields.put( ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + arcMetadataFields.put( ARCConstants.ABSOLUTE_OFFSET_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + + ARCRecordMetaData metadata = new ARCRecordMetaData( header.getReaderIdentifier( ), arcMetadataFields ); + + // Then, create an ARCRecord using the WARCRecord and the + // ARCRecordMetaData object we just created. + ARCRecord arc = new ARCRecord( warc, + metadata, + 0, // offset + ArcReader.this.reader.isDigest( ), + ArcReader.this.reader.isStrict( ), + true // parse HTTP headers + ); + + // Now that we've created the ARCRecord, we get the HTTP headers + // from it. From these HTTP headers, we obtain the Content-Type + // of the ARCRecord's payload, then set value as the MIME-type + // of the ARCRecord itself. + + // If the response is something other than HTTP + // (like DNS) there are no HTTP headers. + if ( arc.getHttpHeaders( ) != null ) + { + for ( Header h : arc.getHttpHeaders( ) ) + { + if ( h.getName( ).equals( "Content-Type" ) ) + { + arc.getMetaData( ).getHeaderFields( ).put( ARCConstants.MIMETYPE_FIELD_KEY, h.getValue( ) ); + } + } + } + + return arc; + } + + } + + /** + * Simple test/debug driver to read an archive file and print out + * the header for each record. + */ + public static void main( String args[] ) throws Exception + { + if ( args.length != 1 ) + { + System.out.println( "ReaderTest <(w)arc file>" ); + System.exit( 1 ); + } + + String arcName = args[0]; + + ArchiveReader r = ArchiveReaderFactory.get( arcName ); + + ArcReader reader = new ArcReader( r ); + + for ( ARCRecord rec : reader ) + { + if ( rec != null ) System.out.println( rec.getHeader( ) ); + } + } +} Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,553 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Map.Entry; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.fetcher.FetcherOutputFormat; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.apache.nutch.util.StringUtil; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; + + +/** + * Convert Archive (.arc/.warc) files to a Nutch segment. This + * is sometimes called "importing", other times "converting"; the terms + * are equivalent. + * + * <code>ArcsToSegment</code> is coded as a Hadoop job and is intended + * to be run within the Hadoop framework, or at least started by the + * Hadoop launcher incorporated into Nutch. Although there is a + * <code>main</code> driver, the Nutch launcher script is strongly + * recommended. + * + * This class was initially adapted from the Nutch + * <code>Fetcher</code> class. The premise is that since the Nutch + * fetching process acquires external content and places it in a Nutch + * segment, we can perform a similar activity by taking content from + * the ARC files and placing that content in a Nutch segment in a + * similar fashion. Ideally, once <code>ArcsToSegment</code> is + * used to import a set of ARCs into a Nutch segment, the resulting + * segment should be more-or-less the same as one created by Nutch's + * own Fetcher. + * + * Since we are mimicking the Nutch Fetcher, we have to be careful + * about some implementation details that might not seem relevant + * to the importing of ARC files. I've noted those details with + * comments prefaced with "?:".
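+ * + * For orientation: the 'nutchwax' helper script added in this commit + * runs this class via the Nutch launcher, so "bin/nutchwax import manifest [segment]" + * expands to roughly + * + * bin/nutch org.archive.nutchwax.ArcsToSegment manifest segment + * + * with the segment name defaulting to segments/[date-timestamp].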
+ */ +public class ArcsToSegment extends Configured implements Tool, Mapper +{ + + public static final Log LOG = LogFactory.getLog( ArcsToSegment.class ); + + private JobConf jobConf; + private URLFilters urlFilters; + private ScoringFilters scfilters; + private ParseUtil parseUtil; + private URLNormalizers normalizers; + private int interval; + + private long numSkipped; + private long numImported; + private long bytesSkipped; + private long bytesImported; + + /** + * ?: Is this necessary? + */ + public ArcsToSegment() + { + + } + + /** + * <p>Constructor that sets the job configuration.</p> + * + * @param conf + */ + public ArcsToSegment( Configuration conf ) + { + setConf( conf ); + } + + /** + * <p>Configures the job. Sets the url filters, scoring filters, url normalizers + * and other relevant data.</p> + * + * @param job The job configuration. + */ + public void configure( JobConf job ) + { + // set the url filters, scoring filters, the parse util and the url + // normalizers + this.jobConf = job; + this.urlFilters = new URLFilters ( jobConf ); + this.scfilters = new ScoringFilters( jobConf ); + this.parseUtil = new ParseUtil ( jobConf ); + this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); + this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); + } + + /** + * In Mapper interface. + * {@inheritDoc} + */ + public void close() + { + + } + + /** + * <p>Runs the Map job to translate an arc file into output for Nutch + * segments.</p> + * + * @param key Line number in manifest corresponding to the <code>value</code> + * @param value A line from the manifest + * @param output The output collector. + * @param reporter The progress reporter. + */ + public void map( WritableComparable key, + Writable value, + OutputCollector output, + Reporter reporter ) + throws IOException + { + String arcUrl = ""; + String collection = ""; + String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY ); + + // Each line of the manifest is "<url> <collection>" where <collection> is optional + String[] line = value.toString().split( " " ); + arcUrl = line[0]; + + if ( line.length > 1 ) + { + collection = line[1]; + } + + if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); + + ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); + + ArcReader reader = new ArcReader( r ); + + try + { + for ( ARCRecord record : reader ) + { + // When reading WARC files, records of type other than + // "response" are returned as 'null' by the Iterator, so + // we skip them. + if ( record == null ) continue ; + + importRecord( record, segmentName, collection, output ); + + // FIXME: What does this do exactly? + reporter.progress(); + } + } + finally + { + r.close(); + + if ( LOG.isInfoEnabled() ) + { + LOG.info( "Completed ARC: " + arcUrl ); + LOG.info( "URLs skipped : " + this.numSkipped ); + LOG.info( "URLs imported: " + this.numImported ); + LOG.info( "URLs total : " + ( this.numSkipped + this.numImported ) ); + } + } + + } + + /** + * Import an ARCRecord. + * + * @param record + * @param segmentName + * @param collectionName + * @param output + * @return whether record was imported or not (i.e. filtered out due to URL filtering rules, etc.) + */ + private boolean importRecord( ARCRecord record, String segmentName, String collectionName, OutputCollector output ) + { + ARCRecordMetaData meta = record.getMetaData(); + + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); + + /* ?: On second thought, DON'T do this.
Even if we don't have a + parser registered for a content-type, we still want to index + its URL and possibly other meta-data. + */ + /* + // First, check to see if we have a parser registered for the + // URL's Content-Type, so we don't read in some huge video file + // only to discover we don't have a parser for it. + if ( ! this.hasRegisteredParser( meta.getMimetype() ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "No parser registered for: " + meta.getMimetype() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false ; + } + */ + + // ?: Arguably, we shouldn't be normalizing or filtering based + // on the URL. If the document made it into the (W)ARC file, then + // it should be indexed. But then again, the normalizers and + // filters can be disabled in the Nutch configuration files. + String url = this.normalizeAndFilterUrl( meta.getUrl() ); + + if ( url == null ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false; + } + + // URL is good, let's import the content. + if ( LOG.isInfoEnabled() ) LOG.info( "Import URL: " + meta.getUrl() ); + this.numImported++; + this.bytesImported += meta.getLength(); + + try + { + ... [truncated message content]
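For anyone who wants to exercise the ArcReader wrapper shown above
outside of Hadoop, here is a minimal sketch of a standalone driver.
It only uses classes that appear in this revision (ArcReader, plus the
ArchiveReaderFactory it already relies on); the archive path and class
name are illustrative.

    import org.archive.io.ArchiveReader;
    import org.archive.io.ArchiveReaderFactory;
    import org.archive.io.arc.ARCRecord;
    import org.archive.nutchwax.ArcReader;

    public class ListRecords
    {
      public static void main( String[] args ) throws Exception
      {
        // The factory picks the right underlying reader for .arc or .warc input.
        ArchiveReader r = ArchiveReaderFactory.get( "/tmp/foo-bar-baz.arc" );

        // Wrap it so WARC "response" records come back as ARCRecords.
        ArcReader reader = new ArcReader( r );

        for ( ARCRecord rec : reader )
        {
          // Non-"response" WARC records are returned as null -- skip them.
          if ( rec == null ) continue;

          System.out.println( rec.getMetaData().getUrl() + " " + rec.getMetaData().getMimetype() );
        }
      }
    }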