From: <bi...@us...> - 2010-10-28 00:53:17
|
Revision: 3317 http://archive-access.svn.sourceforge.net/archive-access/?rev=3317&view=rev Author: binzino Date: 2010-10-28 00:53:11 +0000 (Thu, 28 Oct 2010) Log Message: ----------- Added digest to metadata. Added use of auto-content-type-detection. Disabled BoilerPipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-28 00:52:10 UTC (rev 3316) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-28 00:53:11 UTC (rev 3317) @@ -334,13 +334,29 @@ contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); - contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) ); + String type = (meta.getMimetype( ) == null ? "" : meta.getMimetype( )).split( "[;]" )[0].toLowerCase().trim(); + + // If the Content-Type from the HTTP response is "text/plain", + // set it to null to trigger full auto-detection via Tika. + if ( "text/plain".equals( type ) ) + { + type = null; + } + + Content content = new Content( url, url, bytes, type, contentMetadata, getConf() ); + + if ( LOG.isDebugEnabled() ) LOG.debug( "Auto-detect content-type: " + type + " " + content.getContentType( ) + " " + url ); + + // Store both the original and auto-detected content types. + contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, content.getContentType( ) ); + // BoilerPipe! /* - if ( "text/html".equals( meta.getMimetype() ) ) + if ( "text/html".equals( content.getContentType( ) ) ) { String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ); @@ -348,8 +364,6 @@ } */ - Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - output( output, new Text( key ), content ); return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |