From: <bi...@us...> - 2010-11-16 23:16:42
|
Revision: 3335
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3335&view=rev
Author:   binzino
Date:     2010-11-16 23:16:35 +0000 (Tue, 16 Nov 2010)

Log Message:
-----------
Added config controls to trim input docs for text/plain and text/html to avoid performance problems with large (50+MB) input docs. Also added try/catch around boilerpipe.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2010-11-15 20:32:34 UTC (rev 3334)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2010-11-16 23:16:35 UTC (rev 3335)
@@ -16,11 +16,10 @@
  */
 package org.archive.nutchwax;
 
-import java.io.IOException;
-import java.net.MalformedURLException;
+import java.io.*;
+import java.net.*;
 import java.util.Map.Entry;
-import java.util.List;
-import java.util.ArrayList;
+import java.util.*;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -359,20 +358,43 @@
            "application/xhtml+xml".equals( content.getContentType( ) ) ||
            "application/xhtml"    .equals( content.getContentType( ) ) )
         {
-          long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 );
+          int size = jobConf.getInt( "nutchwax.import.content.limit.html", -1 );
           if ( size > 0 && size < length )
             {
-              LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" );
-              return false;
+              LOG.warn( "HTML file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" );
+
+              bytes = Arrays.copyOf( bytes, size );
+
+              content.setContent( bytes );
             }
 
-          if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) )
+          try
             {
-              // BoilerPipe!
-              contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) );
+              if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) )
+                {
+                  // BoilerPipe!
+                  contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) );
+                }
             }
+          catch ( Exception e )
+            {
+              LOG.warn( "Error boilerpiping: " + meta.getUrl( ) );
+            }
         }
 
+      if ( "text/plain".equals( content.getContentType( ) ) )
+        {
+          int size = jobConf.getInt( "nutchwax.import.content.limit.text", -1 );
+          if ( size > 0 && size < length )
+            {
+              LOG.warn( "Text file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" );
+
+              bytes = Arrays.copyOf( bytes, size );
+
+              content.setContent( bytes );
+            }
+        }
+
       output( output, new Text( key ), content );
 
       return true;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|