From: <bi...@us...> - 2010-10-27 07:07:57
|
Revision: 3310
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3310&view=rev
Author:   binzino
Date:     2010-10-27 07:07:51 +0000 (Wed, 27 Oct 2010)

Log Message:
-----------
Disabled the BoilerPipe stuff for now.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-27 07:07:20 UTC (rev 3309)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-27 07:07:51 UTC (rev 3310)
@@ -338,6 +338,16 @@
       contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
       contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) );
+      // BoilerPipe!
+      /*
+      if ( "text/html".equals( meta.getMimetype() ) )
+      {
+        String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) );
+        contentMetadata.set( "boiledHTML", boiledHTML );
+      }
+      */
+
       Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
       output( output, new Text( key ), content );

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|