From: <bi...@us...> - 2008-07-24 23:34:37
|
Revision: 2491 http://archive-access.svn.sourceforge.net/archive-access/?rev=2491&view=rev Author: binzino Date: 2008-07-24 23:34:46 +0000 (Thu, 24 Jul 2008) Log Message: ----------- Add content-length to metadata stored for imported document. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-24 23:31:54 UTC (rev 2490) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-24 23:34:46 UTC (rev 2491) @@ -231,7 +231,7 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); try { @@ -302,16 +302,18 @@ // We store both the normal URL and the URL+digest key for // later retrieval by the indexing plugin(s). - contentMetadata.set( NutchWax.URL_KEY, url ); - contentMetadata.set( NutchWax.ORIG_KEY, key ); + contentMetadata.set( NutchWax.URL_KEY, url ); + contentMetadata.set( NutchWax.ORIG_KEY, key ); - contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); - contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); - contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); - contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); - contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); + contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); + contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); + contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); + contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); + Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); output( output, new Text( key ), content ); Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-24 23:31:54 UTC (rev 2490) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-24 23:34:46 UTC (rev 2491) @@ -22,12 +22,13 @@ public class NutchWax { - public static final String URL_KEY = "url"; - public static final String ORIG_KEY = "orig"; - public static final String FILENAME_KEY = "filename"; - public static final String FILEOFFSET_KEY = "fileoffset"; - public static final String COLLECTION_KEY = "collection"; - public static final String CONTENT_TYPE_KEY = "type"; - public static final String DATE_KEY = "date"; - public static final String DIGEST_KEY = "digest"; + public static final String URL_KEY = "url"; + public static final String ORIG_KEY = "orig"; + public static final String FILENAME_KEY = "filename"; + public static final String FILEOFFSET_KEY = "fileoffset"; + public static final String COLLECTION_KEY = "collection"; + public static final String DATE_KEY = "date"; + public static final String DIGEST_KEY = "digest"; + public static final String CONTENT_TYPE_KEY = "type"; + public static final String CONTENT_LENGTH_KEY = "length"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |