From: <bi...@us...> - 2010-03-18 22:40:45
|
Revision: 2979 http://archive-access.svn.sourceforge.net/archive-access/?rev=2979&view=rev Author: binzino Date: 2010-03-18 22:40:39 +0000 (Thu, 18 Mar 2010) Log Message: ----------- WAX-74. Add support for storing field value in compressed form. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2010-03-18 22:11:53 UTC (rev 2978) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2010-03-18 22:40:39 UTC (rev 2979) @@ -44,11 +44,10 @@ <name>nutchwax.filter.index</name> <value> title:false:true:tokenized - content:false:false:tokenized + content:false:compress:tokenized site:false:false:untokenized url:false:true:tokenized - digest:false:true:no collection:true:true:no_norms date:true:true:no_norms Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-03-18 22:11:53 UTC (rev 2978) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-03-18 22:40:39 UTC (rev 2979) @@ -36,6 +36,7 @@ import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.indexer.lucene.LuceneWriter; import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX; +import org.apache.nutch.indexer.lucene.LuceneWriter.STORE; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -74,7 +75,7 @@ String srcKey = spec[0]; boolean lowerCase = true; - boolean store = true; + STORE store = STORE.YES; INDEX index = INDEX.TOKENIZED; boolean exclusive = true; String destKey = srcKey; @@ -91,7 +92,10 @@ "no_norms". equals(spec[3]) ? INDEX.NO_NORMS : INDEX.NO; case 3: - store = Boolean.parseBoolean( spec[2] ); + //store = Boolean.parseBoolean( spec[2] ); + store = "true". equals(spec[2]) ? STORE.YES : + "compress".equals(spec[2]) ? STORE.COMPRESS : + STORE.NO; case 2: lowerCase = Boolean.parseBoolean( spec[1] ); case 1: @@ -109,12 +113,12 @@ { String srcKey; boolean lowerCase; - boolean store; + STORE store; INDEX index; boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, STORE store, INDEX index, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; @@ -147,6 +151,12 @@ try { value = (new URL( meta.get( "url" ) ) ).getHost( ); + + // Strip off any "www." header. + if ( value.startsWith( "www." ) ) + { + value = value.substring( 4 ); + } } catch ( MalformedURLException mue ) { /* Eat it */ } } @@ -171,6 +181,11 @@ int p = value.indexOf( ';' ); if ( p >= 0 ) value = value.substring( 0, p ); } + else if ( "collection".equals( spec.srcKey ) ) + { + // Use value given in config first, otherwise what's in the metadata object. + value = conf.get( "nutchwax.index.collection", meta.get( spec.srcKey ) ); + } else { value = meta.get( spec.srcKey ); @@ -188,7 +203,7 @@ doc.removeField( spec.destKey ); } - if ( spec.store || spec.index != INDEX.NO ) + if ( spec.store != STORE.NO || spec.index != INDEX.NO ) { doc.add( spec.destKey, value ); } @@ -202,13 +217,13 @@ { for ( FieldSpecification spec : this.fieldSpecs ) { - if ( ! spec.store && spec.index == INDEX.NO ) + if ( spec.store == STORE.NO && spec.index == INDEX.NO ) { continue ; } LuceneWriter.addFieldOptions( spec.destKey, - spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + spec.store, spec.index, conf ); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |