From: <bi...@us...> - 2010-11-12 23:54:43
|
Revision: 3333
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3333&view=rev
Author:   binzino
Date:     2010-11-12 23:54:34 +0000 (Fri, 12 Nov 2010)

Log Message:
-----------
Added config property to enable/disable BoilerPipe.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-11 05:49:07 UTC (rev 3332)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333)
@@ -354,15 +354,16 @@
       contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) );
       contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, content.getContentType( ) );
 
-      // BoilerPipe!
-      /*
-      if ( "text/html".equals( content.getContentType( ) ) )
+      if ( "text/html"            .equals( content.getContentType( ) ) ||
+           "application/xhtml+xml".equals( content.getContentType( ) ) ||
+           "application/xhtml"    .equals( content.getContentType( ) ) )
         {
-          String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) );
-
-          contentMetadata.set( "boiledHTML", boiledHTML );
+          if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) )
+            {
+              // BoilerPipe!
+              contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) );
+            }
         }
-      */
 
       output( output, new Text( key ), content );
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |