[Archive-access-cvs] SF.net SVN: archive-access:[3317] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3317
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3317&view=rev
Author:   binzino
Date:     2010-10-28 00:53:11 +0000 (Thu, 28 Oct 2010)

Log Message:
-----------
Added digest to metadata.  Added use of auto-content-type-detection.  Disabled BoilerPipe.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2010-10-28 00:52:10 UTC (rev 3316)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2010-10-28 00:53:11 UTC (rev 3317)
@@ -334,13 +334,29 @@
         contentMetadata.set( NutchWax.COLLECTION_KEY,     collectionName     );
         contentMetadata.set( NutchWax.DATE_KEY,           meta.getDate()     );
         contentMetadata.set( NutchWax.DIGEST_KEY,         meta.getDigest()   );
-        contentMetadata.set( NutchWax.CONTENT_TYPE_KEY,   meta.getMimetype() );
         contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
         contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY,  String.valueOf( record.getStatusCode() ) );
 
+        String type = (meta.getMimetype( ) == null ? "" : meta.getMimetype( )).split( "[;]" )[0].toLowerCase().trim();
+
+        // If the Content-Type from the HTTP response is "text/plain",
+        // set it to null to trigger full auto-detection via Tika.
+        if ( "text/plain".equals( type ) )
+          {
+            type = null;
+          }
+        
+        Content content = new Content( url, url, bytes, type, contentMetadata, getConf() );
+
+        if ( LOG.isDebugEnabled() ) LOG.debug( "Auto-detect content-type: " + type + " " + content.getContentType( ) + " " + url );
+        
+        // Store both the original and auto-detected content types.
+        contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) );
+        contentMetadata.set( NutchWax.CONTENT_TYPE_KEY,  content.getContentType( ) );
+
         // BoilerPipe!
         /*
-        if ( "text/html".equals( meta.getMimetype() ) )
+        if ( "text/html".equals( content.getContentType( ) ) )
           {
             String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) );
 
@@ -348,8 +364,6 @@
           }
         */
 
-        Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
-
         output( output, new Text( key  ), content );
 
         return true;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[3317] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

[Archive-access-cvs] SF.net SVN: archive-access:[3317] tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java