Revision: 3337
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3337&view=rev
Author:   binzino
Date:     2010-11-16 23:17:48 +0000 (Tue, 16 Nov 2010)

Log Message:
-----------
Added config property to control size of body to be indexed. Default 100k.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-11-16 23:17:04 UTC (rev 3336)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-11-16 23:17:48 UTC (rev 3337)
@@ -47,6 +47,7 @@
   private List<FieldSpecification> fieldSpecs;
   private int MAX_TITLE_LENGTH;
+  private int MAX_BODY_LENGTH;
   private TypeNormalizer typenormalizer;
   private TypeFilter typefilter;
   private URLFilter urlfilter;
@@ -56,8 +57,8 @@
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    this.MAX_BODY_LENGTH = conf.getInt("indexer.max.body.length", (100 * 1024));
-    // this.allowedTypes = new HashSet<String>( conf.get( "indexer.mimetypes.allowed", "" ).split( "\\s+" ) );
     this.typenormalizer = new TypeNormalizer( );
     this.typenormalizer.setAliases( typenormalizer.getDefaultAliases( ) );
@@ -185,6 +186,11 @@
         else if ( "content".equals( spec.srcKey ) )
           {
             value = parse.getText( );
+
+            if ( value != null && value.length() > MAX_BODY_LENGTH )
+              {
+                value = value.substring( 0, MAX_BODY_LENGTH );
+              }
           }
         else if ( "title".equals( spec.srcKey ) )
           {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.