From: <bi...@us...> - 2008-07-01 22:52:05
Revision: 2347
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2347&view=rev
Author:   binzino
Date:     2008-07-01 15:52:08 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
Moved read logic to readBytes() method.  Also fixed bug WAX-9, so now up
to nutchwax.import.content.limit bytes are read, or all bytes if that
property is not defined or has a value of -1.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-01 22:41:57 UTC (rev 2346)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-01 22:52:08 UTC (rev 2347)
@@ -71,25 +71,24 @@
 
 /**
- * Convert Archive files (.arc/.warc) files to a Nutch segment.  This
- * is sometimes called "importing" other times "converting", the terms
- * are equivalent.
+ * Import Archive files (.arc/.warc) files into a newly-created Nutch
+ * segment.
  *
- * <code>Importer</code> is coded as a Hadoop job and is intended
- * to be run within the Hadoop framework, or at least started by the
+ * <code>Importer</code> is coded as a Hadoop job and is intended to
+ * be run within the Hadoop framework, or at least started by the
  * Hadoop launcher incorporated into Nutch.  Although there is a
  * <code>main</code> driver, the Nutch launcher script is strongly
  * recommended.
  *
  * This class was initially adapted from the Nutch
- * <code>Fetcher</code> class.  The premise is since the Nutch
- * fetching process acquires external content and places it in a Nutch
- * segment, we can perform a similar activity by taking content from
- * the ARC files and place that content in a Nutch segment in a
- * similar fashion.  Ideally, once the <code>Importer</code> is
- * used to import a set of ARCs into a Nutch segment, the resulting
- * segment should be more-or-less the same as one created by Nutch's
- * own Fetcher.
+ * <code>Fetcher</code> and <code>ArcSegmentCreator</code> classes.
+ * The premise is since the Nutch fetching process acquires external
+ * content and places it in a Nutch segment, we can perform a similar
+ * activity by taking content from the ARC files and place that
+ * content in a Nutch segment in a similar fashion.  Ideally, once the
+ * <code>Importer</code> is used to import a set of ARCs into a Nutch
+ * segment, the resulting segment should be more-or-less the same as
+ * one created by Nutch's own Fetcher.
  *
  * Since we are mimicing the Nutch Fetcher, we have to be careful
  * about some implementation details that might not seem relevant
@@ -241,18 +240,16 @@
       // headers.
       record.skipHttpHeader();
 
-      // TODO: Put in a size limiter, akin to Nutch's file.content.limit.
-
-      // Read the bytes of the HTTP response
-      byte[] bytes = new byte[(int) meta.getLength()];
-
-      // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride
-      //       the implementation inherited from InputStream.  And since it does
-      //       not over-ride it, it won't do the digesting on it.  Must use either
-      //       read(byte[],offset,length) or read().
-      record.read( bytes, 0, bytes.length );
+      // We use record.available() rather than meta.getLength()
+      // because the latter includes the size of the HTTP header,
+      // which we just skipped.
+      byte[] bytes = readBytes( record, record.available( ) );
 
-      // Must call close() for digest calculation to be finished.
+      // If there is no digest, then we assume we're reading an
+      // ARCRecord not a WARCRecord.  In that case, we close the
+      // record, which updates the digest string.  Then we tweak the
+      // digest string so we have the same for for both ARC and WARC
+      // records.
       if ( meta.getDigest() == null )
         {
           record.close();
@@ -505,6 +502,67 @@
     }
 
   /**
+   * Utility method to read the content bytes from an archive record.
+   * The number of bytes read can be limited via the configuration
+   * property <code>nutchwax.import.content.limit</code>.
+   */
+  private byte[] readBytes( ARCRecord record, long contentLength )
+    throws IOException
+  {
+    // Ensure the record does strict reading.
+    record.setStrict( true );
+
+    long size = jobConf.getLong( "nutchwax.import.content.limit", -1 );
+
+    if ( size < 0 )
+      {
+        size = contentLength;
+      }
+    else
+      {
+        size = Math.min( size, contentLength );
+      }
+
+    // Read the bytes of the HTTP response
+    byte[] bytes = new byte[(int) size];
+
+    if ( size == 0 )
+      {
+        return bytes;
+      }
+
+    // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride
+    //       the implementation inherited from InputStream.  And since it does
+    //       not over-ride it, it won't do the digesting on it.  Must use either
+    //       read(byte[],offset,length) or read().
+    int pos = 0;
+    while ( (pos += record.read( bytes, pos, (bytes.length - pos) )) < bytes.length )
+      ;
+
+    // Now that the bytes[] buffer has been filled, read the remainder
+    // of the record so that the digest is computed over the entire
+    // content.
+    byte[] buf = new byte[1024 * 1024];
+    int count = 0;
+    while ( record.available( ) > 0 )
+      {
+        count += record.read( buf, 0, Math.min( buf.length, record.available( ) ) );
+      }
+
+    if ( LOG.isInfoEnabled() ) LOG.info( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count );
+
+    // Sanity check.  The number of bytes read into our bytes[]
+    // buffer, plus the count of extra stuff read after it should
+    // equal the contentLength passed into this function.
+    if ( pos + count != contentLength )
+      {
+        throw new IOException( "Incorrect number of bytes read from ArchiveRecord: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count );
+      }
+
+    return bytes;
+  }
+
+  /**
    *
    */
   public int run( String[] args ) throws Exception
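For reference, a small standalone sketch of how the new nutchwax.import.content.limit
behavior resolves the number of bytes to read.  This is illustrative only and is not
part of the committed Importer.java: the class name ContentLimitSketch and the helper
resolveReadSize are hypothetical, and a plain Hadoop Configuration stands in for the
job's JobConf (JobConf extends Configuration, so getLong/setLong behave the same way).

import org.apache.hadoop.conf.Configuration;

public class ContentLimitSketch
{
  // Same arithmetic as the committed readBytes(): -1 (or an unset property)
  // means "read the whole record", otherwise cap at the record's length.
  static long resolveReadSize( long configuredLimit, long contentLength )
  {
    if ( configuredLimit < 0 )
      {
        return contentLength;                              // unset or -1: read everything
      }
    return Math.min( configuredLimit, contentLength );     // otherwise stop at the limit
  }

  public static void main( String[] args )
  {
    Configuration conf = new Configuration( );

    // Property not defined: getLong() falls back to -1, so all 5000 bytes are read.
    long limit = conf.getLong( "nutchwax.import.content.limit", -1 );
    System.out.println( resolveReadSize( limit, 5000L ) );   // 5000

    // Property set to 1024: only the first 1024 bytes are buffered.
    conf.setLong( "nutchwax.import.content.limit", 1024L );
    limit = conf.getLong( "nutchwax.import.content.limit", -1 );
    System.out.println( resolveReadSize( limit, 5000L ) );   // 1024
  }
}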
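The other half of the change is the read pattern inside readBytes(): fill the limited
buffer with read(byte[],offset,length) calls, then drain the rest of the record so the
digest is still computed over the entire content.  The sketch below shows that pattern
with plain java.io/java.security types standing in for ArchiveRecord; the class and
method names (ReadAndDrainSketch, readLimited) are hypothetical, and a DigestInputStream
is used only to make the "digest sees every byte" point observable.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.DigestInputStream;
import java.security.MessageDigest;

public class ReadAndDrainSketch
{
  static byte[] readLimited( InputStream in, int limit ) throws IOException
  {
    byte[] bytes = new byte[limit];
    int pos = 0;
    int n;
    // Fill the buffer up to the limit; stop early on end-of-stream.
    while ( pos < bytes.length
            && (n = in.read( bytes, pos, bytes.length - pos )) != -1 )
      {
        pos += n;
      }
    // Drain whatever remains so the wrapping digest sees the full content.
    byte[] buf = new byte[8192];
    while ( in.read( buf, 0, buf.length ) != -1 )
      ;
    return bytes;
  }

  public static void main( String[] args ) throws Exception
  {
    byte[] content = new byte[5000];
    MessageDigest sha1 = MessageDigest.getInstance( "SHA-1" );
    DigestInputStream in =
      new DigestInputStream( new ByteArrayInputStream( content ), sha1 );

    byte[] head = readLimited( in, 1024 );

    // Only 1024 bytes were kept, but the digest covers all 5000 bytes,
    // analogous to ArchiveRecord's internal digesting.
    System.out.println( head.length );            // 1024
    System.out.println( sha1.digest( ).length );  // 20 (SHA-1)
  }
}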