From: <bi...@us...> - 2008-07-14 21:24:27
|
Revision: 2441 http://archive-access.svn.sourceforge.net/archive-access/?rev=2441&view=rev Author: binzino Date: 2008-07-14 14:24:37 -0700 (Mon, 14 Jul 2008) Log Message: ----------- Fix JIRA: WAX-10, WAX-11, WAX-12. Added 'fileoffset' metadata field and changed 'arcname' to 'filename'. Also add 'exacturl' to be copy of 'url' that is untokenized so it can be matched exactly. All of these changes are driven by Wayback-NutchWAX integration. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-14 21:24:37 UTC (rev 2441) @@ -44,9 +44,11 @@ <name>nutchwax.filter.index</name> <value> url:false:true:true + url:flase:true:false:true:exacturl orig:false digest:false - arcname:false + filename:false + fileoffset:false collection date type @@ -68,7 +70,9 @@ <name>nutchwax.filter.query</name> <value> raw:digest:false - raw:arcname:false + raw:filename:false + raw:fileoffset:false + raw:exacturl:false group:collection group:type field:anchor Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-14 21:24:37 
UTC (rev 2441) @@ -158,8 +158,8 @@ } /** - * <p>Runs the Map job to translate an arc file into output for Nutch - * segments.</p> + * <p>Runs the Map job to import records from an archive file into a + * Nutch segment.</p> * * @param key Line number in manifest corresponding to the <code>value</code> * @param value A line from the manifest @@ -306,7 +306,8 @@ contentMetadata.set( NutchWax.ORIG_KEY, key ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); @@ -360,7 +361,11 @@ } /** - * + * Writes the key and related content to the output collector. The + * division between <code>importRecord</code> and + * <code>output</code> is merely based on the way the code was + * structured in the <code>ArcSegmentCreator.java</code> which was + * used as a starting-point for this class. */ private void output( OutputCollector output, Text key, @@ -563,7 +568,10 @@ } /** - * + * Runs the import job with the given arguments. This method + * assumes that it is being run via the command-line; as such, it + * emits error messages regarding invalid/missing arguments to the + * system error stream. */ public int run( String[] args ) throws Exception { @@ -630,6 +638,8 @@ catch ( Exception e ) { LOG.fatal( "Importer: ", e ); + System.err.println( "Fatal error: " + e ); + e.printStackTrace( System.err ); return -1; } @@ -637,7 +647,7 @@ } /** - * + * Emit usage information for command-line driver. */ public void usage( ) { @@ -655,7 +665,7 @@ } /** - * + * Command-line driver. Runs the Importer as a Hadoop job. 
*/ public static void main( String args[] ) throws Exception { Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-14 21:24:37 UTC (rev 2441) @@ -24,7 +24,8 @@ { public static final String URL_KEY = "url"; public static final String ORIG_KEY = "orig"; - public static final String ARCNAME_KEY = "arcname"; + public static final String FILENAME_KEY = "filename"; + public static final String FILEOFFSET_KEY = "fileoffset"; public static final String COLLECTION_KEY = "collection"; public static final String CONTENT_TYPE_KEY = "type"; public static final String DATE_KEY = "date"; Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-07-14 21:24:37 UTC (rev 2441) @@ -40,7 +40,7 @@ point="org.apache.nutch.searcher.QueryFilter"> <implementation id="ConfigurableQueryFilter" class="org.archive.nutchwax.query.ConfigurableQueryFilter"> - <parameter name="raw-fields" value="arcname,collection,date,type" /> + <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" /> <parameter name="fields" value="anchor,content,host,title" /> </implementation> </extension> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |