From: <bi...@us...> - 2010-11-15 20:32:40
|
Revision: 3334
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3334&view=rev
Author:   binzino
Date:     2010-11-15 20:32:34 +0000 (Mon, 15 Nov 2010)

Log Message:
-----------
Added nutchwax.import.content.limit.html property. If html file is larger than this value, it is skipped.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-15 20:32:34 UTC (rev 3334)
@@ -247,7 +247,7 @@
       {
         ARCRecordMetaData meta = record.getMetaData();
 
-        if ( LOG.isDebugEnabled() ) LOG.debug( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
+        if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
 
         if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) )
           {
@@ -266,7 +266,8 @@
         // We use record.available() rather than meta.getLength()
         // because the latter includes the size of the HTTP header,
         // which we just skipped.
-        byte[] bytes = readBytes( record, record.available( ) );
+        long length = record.available();
+        byte[] bytes = readBytes( record, length );
 
         // If there is no digest, then we assume we're reading an
         // ARCRecord not a WARCRecord. In that case, we close the
@@ -358,6 +359,13 @@
              "application/xhtml+xml".equals( content.getContentType( ) ) ||
              "application/xhtml" .equals( content.getContentType( ) ) )
           {
+            long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 );
+            if ( size > 0 && size < length )
+              {
+                LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" );
+                return false;
+              }
+
             if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) )
               {
                 // BoilerPipe!
@@ -365,7 +373,7 @@
               }
           }
 
-        output( output, new Text( key ), content );
+        output( output, new Text( key ), content );
 
         return true;
       }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|