Revision: 2836 http://archive-access.svn.sourceforge.net/archive-access/?rev=2836&view=rev Author: binzino Date: 2009-10-27 21:18:32 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:14:24 UTC (rev 2835) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:18:32 UTC (rev 2836) @@ -20,19 +20,22 @@ */ package org.archive.nutchwax.index; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; import java.util.List; -import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -46,9 +49,13 @@ private Configuration conf; private List<FieldSpecification> fieldSpecs; + private int MAX_TITLE_LENGTH; + public void setConf( Configuration conf ) { this.conf = conf; + + this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); String filterSpecs = conf.get( "nutchwax.filter.index" ); @@ -65,12 +72,12 @@ { String spec[] = filterSpec.split("[:]"); - String srcKey = spec[0]; - boolean lowerCase = true; - boolean store = true; - boolean tokenize = false; - boolean exclusive = true; - String destKey = srcKey; + String srcKey = spec[0]; + boolean lowerCase = true; + boolean store = true; + INDEX index = INDEX.TOKENIZED; + boolean exclusive = true; + String destKey = srcKey; switch ( spec.length ) { default: @@ -79,7 +86,10 @@ case 5: exclusive = Boolean.parseBoolean( spec[4] ); case 4: - tokenize = Boolean.parseBoolean( spec[3] ); + index = "tokenized". equals(spec[3]) ? INDEX.TOKENIZED : + "untokenized".equals(spec[3]) ? INDEX.UNTOKENIZED : + "no_norms". equals(spec[3]) ? INDEX.NO_NORMS : + INDEX.NO; case 3: store = Boolean.parseBoolean( spec[2] ); case 2: @@ -89,9 +99,9 @@ ; } - LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey ); + LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + index + ":" + exclusive + ":" + destKey ); - this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) ); + this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, index, exclusive, destKey ) ); } } @@ -100,16 +110,16 @@ String srcKey; boolean lowerCase; boolean store; - boolean tokenize; + INDEX index; boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; this.store = store; - this.tokenize = tokenize; + this.index = index; this.exclusive = exclusive; this.destKey = destKey; } @@ -124,14 +134,47 @@ * Transfer NutchWAX field values stored in the parsed content to * the Lucene document. */ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public NutchDocument filter( NutchDocument doc, Parse parse, Text key, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); for ( FieldSpecification spec : this.fieldSpecs ) { - String value = meta.get( spec.srcKey ); + String value = null; + if ( "site".equals( spec.srcKey ) || "host".equals( spec.srcKey ) ) + { + try + { + value = (new URL( meta.get( "url" ) ) ).getHost( ); + } + catch ( MalformedURLException mue ) { /* Eat it */ } + } + else if ( "content".equals( spec.srcKey ) ) + { + value = parse.getText( ); + } + else if ( "title".equals( spec.srcKey ) ) + { + value = parse.getData().getTitle(); + if ( value.length() > MAX_TITLE_LENGTH ) // truncate title if needed + { + value = value.substring( 0, MAX_TITLE_LENGTH ); + } + } + else if ( "type".equals( spec.srcKey ) ) + { + value = meta.get( spec.srcKey ); + + if ( value == null ) continue ; + + int p = value.indexOf( ';' ); + if ( p >= 0 ) value = value.substring( 0, p ); + } + else + { + value = meta.get( spec.srcKey ); + } if ( value == null ) continue; @@ -142,16 +185,33 @@ if ( spec.exclusive ) { - doc.removeFields( spec.destKey ); + doc.removeField( spec.destKey ); } - - doc.add( new Field( spec.destKey, - value, - spec.store ? Field.Store.YES : Field.Store.NO, - spec.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + + if ( spec.store || spec.index != INDEX.NO ) + { + doc.add( spec.destKey, value ); + } + } return doc; } - + + public void addIndexBackendOptions( Configuration conf ) + { + for ( FieldSpecification spec : this.fieldSpecs ) + { + if ( ! spec.store && spec.index == INDEX.NO ) + { + continue ; + } + + LuceneWriter.addFieldOptions( spec.destKey, + spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + spec.index, + conf ); + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |