From: <bi...@us...> - 2009-06-23 21:35:02
|
Revision: 2743 http://archive-access.svn.sourceforge.net/archive-access/?rev=2743&view=rev Author: binzino Date: 2009-06-23 21:35:00 +0000 (Tue, 23 Jun 2009) Log Message: ----------- Fix WAX-45 and WAX-48. ConfigurableIndexingFilter can handle all the fields relevant to Nutch(WAX). Update the nutch-site.xml accordingly. Also, remove the site and url query filters from nutch-site.xml and configure NutchWAX query filter to take over for them. Modified Paths: -------------- tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml Modified: tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml 2009-06-23 21:17:31 UTC (rev 2742) +++ tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml 2009-06-23 21:35:00 UTC (rev 2743) @@ -10,19 +10,18 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> -<!-- The indexing filter order *must* be specified in order for - NutchWAX's ConfigurableIndexingFilter to be called *after* the - BasicIndexingFilter. This is necessary so that the - ConfigurableIndexingFilter can over-write some of the values put - into the Lucene document by the BasicIndexingFilter. 
- - The over-written values are the 'url' and 'digest' fields, which - NutchWAX needs to handle specially in order for de-duplication to - work properly. - --> +<!-- + When using *only* the 'index-nutchwax' in 'plugin.includes' above, + we don't need to specify an order since there is only one plugin. + + However, if you choose to use the Nutch 'index-basic', then you have + to specify the order such that the NutchWAX ConfigurableIndexingFilter + is after it. Whichever plugin comes last over-writes the values + of those that come before it. + <property> <name>indexingfilter.order</name> <value> @@ -30,29 +29,31 @@ org.archive.nutchwax.index.ConfigurableIndexingFilter </value> </property> + --> <property> <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing. - The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" + The specifications here are of the form "src-key:lowercase:store:index:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true store = true - tokenize = false + index = tokenized exclusive = true dest-key = src-key --> <name>nutchwax.filter.index</name> <value> - url:false:true:true - url:false:true:false:true:exacturl - orig:false - digest:false - filename:false - fileoffset:false - collection - date - type - length + title:false:true:tokenized + content:false:false:tokenized + site:false:false:untokenized + + url:false:true:no + digest:false:true:no + + collection:true:true:no_norms + date:true:true:no_norms + type:true:true:no_norms + length:false:true:no </value> </property> @@ -70,15 +71,10 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> - raw:digest:false - raw:filename:false - raw:fileoffset:false - raw:exacturl:false group:collection + 
group:site:false group:type - field:anchor field:content - field:host field:title </value> </property> Modified: tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-06-23 21:17:31 UTC (rev 2742) +++ tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-06-23 21:35:00 UTC (rev 2743) @@ -20,6 +20,8 @@ */ package org.archive.nutchwax.index; +import java.net.MalformedURLException; +import java.net.URL; import java.util.List; import java.util.ArrayList; @@ -27,6 +29,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@ -46,10 +49,14 @@ private Configuration conf; private List<FieldSpecification> fieldSpecs; + private int MAX_TITLE_LENGTH; + public void setConf( Configuration conf ) { this.conf = conf; - + + this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + String filterSpecs = conf.get( "nutchwax.filter.index" ); if ( null == filterSpecs ) @@ -65,12 +72,12 @@ { String spec[] = filterSpec.split("[:]"); - String srcKey = spec[0]; - boolean lowerCase = true; - boolean store = true; - boolean tokenize = false; - boolean exclusive = true; - String destKey = srcKey; + String srcKey = spec[0]; + boolean lowerCase = true; + boolean store = true; + Index index = Index.TOKENIZED; + boolean exclusive = true; + String destKey = srcKey; switch ( spec.length ) { default: @@ -79,7 +86,10 @@ case 5: exclusive = Boolean.parseBoolean( spec[4] ); case 4: - tokenize = 
Boolean.parseBoolean( spec[3] ); + index = "tokenized". equals(spec[3]) ? Index.TOKENIZED : + "untokenized".equals(spec[3]) ? Index.UN_TOKENIZED : + "no_norms". equals(spec[3]) ? Index.NO_NORMS : + Index.NO; case 3: store = Boolean.parseBoolean( spec[2] ); case 2: @@ -89,9 +99,9 @@ ; } - LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey ); + LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + index + ":" + exclusive + ":" + destKey ); - this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) ); + this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, index, exclusive, destKey ) ); } } @@ -100,16 +110,16 @@ String srcKey; boolean lowerCase; boolean store; - boolean tokenize; + Index index; boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, boolean store, Index index, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; this.store = store; - this.tokenize = tokenize; + this.index = index; this.exclusive = exclusive; this.destKey = destKey; } @@ -124,14 +134,47 @@ * Transfer NutchWAX field values stored in the parsed content to * the Lucene document. 
*/ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public Document filter( Document doc, Parse parse, Text key, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); for ( FieldSpecification spec : this.fieldSpecs ) { - String value = meta.get( spec.srcKey ); + String value = null; + if ( "site".equals( spec.srcKey ) || "host".equals( spec.srcKey ) ) + { + try + { + value = (new URL( meta.get( "url" ) ) ).getHost( ); + } + catch ( MalformedURLException mue ) { /* Eat it */ } + } + else if ( "content".equals( spec.srcKey ) ) + { + value = parse.getText( ); + } + else if ( "title".equals( spec.srcKey ) ) + { + value = parse.getData().getTitle(); + if ( value.length() > MAX_TITLE_LENGTH ) // truncate title if needed + { + value = value.substring( 0, MAX_TITLE_LENGTH ); + } + } + else if ( "type".equals( spec.srcKey ) ) + { + value = meta.get( spec.srcKey ); + + if ( value == null ) continue ; + + int p = value.indexOf( ';' ); + if ( p >= 0 ) value = value.substring( 0, p ); + } + else + { + value = meta.get( spec.srcKey ); + } if ( value == null ) continue; @@ -144,11 +187,14 @@ { doc.removeFields( spec.destKey ); } - - doc.add( new Field( spec.destKey, - value, - spec.store ? Field.Store.YES : Field.Store.NO, - spec.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + + if ( spec.store || spec.index != Index.NO ) + { + doc.add( new Field( spec.destKey, + value, + spec.store ? 
Field.Store.YES : Field.Store.NO, + spec.index ) ); + } } return doc; Modified: tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml 2009-06-23 21:17:31 UTC (rev 2742) +++ tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml 2009-06-23 21:35:00 UTC (rev 2743) @@ -40,8 +40,8 @@ point="org.apache.nutch.searcher.QueryFilter"> <implementation id="ConfigurableQueryFilter" class="org.archive.nutchwax.query.ConfigurableQueryFilter"> - <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" /> - <parameter name="fields" value="anchor,content,host,title" /> + <parameter name="raw-fields" value="collection,site,type" /> + <parameter name="fields" value="content,title" /> </implementation> </extension> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |