From: <bi...@us...> - 2008-06-26 22:32:35
|
Revision: 2328
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2328&view=rev
Author:   binzino
Date:     2008-06-26 15:32:40 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Ensure digest calculation is enabled in ARC reading.
Convert dates read from WARC files from WARC format to 14-digit format.
Explicitly set digest if reading from WARC file.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-06-26 22:30:24 UTC (rev 2327)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-06-26 22:32:40 UTC (rev 2328)
@@ -209,13 +209,27 @@
       Map arcMetadataFields = new HashMap( );
       arcMetadataFields.put( ARCConstants.URL_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_URI ) );
       arcMetadataFields.put( ARCConstants.IP_HEADER_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_IP ) );
-      arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ) );
       arcMetadataFields.put( ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue( null ) ); // We don't know the MIME type of the *payload* in a WARC (yet)
       arcMetadataFields.put( ARCConstants.LENGTH_FIELD_KEY, header.getHeaderValue( WARCConstants.CONTENT_LENGTH ) );
       arcMetadataFields.put( ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these?
       arcMetadataFields.put( ARCConstants.ABSOLUTE_OFFSET_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these?
+
+      // Dates must be converted from WARC format to 14-digit format,
+      // that is, from YYYY-MM-DDTHH:MM:SSZ to YYYYMMDDHHMMSS
+      String warcDate = (String) header.getHeaderValue( WARCConstants.HEADER_KEY_DATE );
+      StringBuilder date = new StringBuilder( )
+        .append( warcDate, 0, 4 )
+        .append( warcDate, 5, 7 )
+        .append( warcDate, 8, 10 )
+        .append( warcDate, 11, 13 )
+        .append( warcDate, 14, 16 )
+        .append( warcDate, 17, 19 );
+
+      arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, date.toString( ) );
 
       ARCRecordMetaData metadata = new ARCRecordMetaData( header.getReaderIdentifier( ), arcMetadataFields );
+
+      metadata.setDigest( (String) header.getHeaderValue( WARCConstants.HEADER_KEY_PAYLOAD_DIGEST ) );
 
       // Then, create an ARCRecord using the WARCRecord and the
       // ARCRecordMetaData object we just created.
@@ -250,6 +264,7 @@
   }
 
 
+
   /**
    * Simple test/debug driver to read an archive file and print out
    * the header for each record.
@@ -258,18 +273,22 @@
   {
     if ( args.length != 1 )
       {
-        System.out.println( "ReaderTest <(w)arc file>" );
+        System.out.println( "ArcReader <(w)arc file>" );
         System.exit( 1 );
       }
 
     String arcName = args[0];
 
     ArchiveReader r = ArchiveReaderFactory.get( arcName );
+    r.setDigest( true );
 
     ArcReader reader = new ArcReader( r );
 
     for ( ARCRecord rec : reader )
       {
+        // Must call close() for digest calculation to be finished.
+        rec.close( );
+        if ( rec != null ) System.out.println( rec.getHeader( ) );
       }
   }
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|