From: <bi...@us...> - 2008-06-10 21:36:36
|
Revision: 2289 http://archive-access.svn.sourceforge.net/archive-access/?rev=2289&view=rev Author: binzino Date: 2008-06-10 14:36:32 -0700 (Tue, 10 Jun 2008) Log Message: ----------- Added GroupedQueryFilter, essentially a RawFieldQueryFilter that allows for multiple values. Added use thereof to conf/nutch-site.xml Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-05 22:49:14 UTC (rev 2288) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-10 21:36:32 UTC (rev 2289) @@ -33,17 +33,21 @@ <property> <!-- Configure the 'query-nutchwax' plugin. Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not. - The specification format is "raw:name:lowercase:boost" or "field:name:boost". Default values are + The specification format is one of: + field:<name>:<boost> + raw:<name>:<lowercase>:<boost> + group:<name>:<lowercase>:<delimiter>:<boost> + Default values are lowercase = true + delimiter = "," boost = 1.0f - There is no "lowercase" property for "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter. - AFAICT, the order isn't important. --> + --> <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> raw:arcname:false - raw:collection - raw:type + group:collection + group:type field:anchor field:content field:host Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java 2008-06-05 22:49:14 UTC (rev 2288) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java 2008-06-10 21:36:32 UTC (rev 2289) @@ -24,10 +24,14 @@ import java.util.ArrayList; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.index.Term; import org.apache.nutch.searcher.QueryFilter; import org.apache.nutch.searcher.QueryException; import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.Query.Clause; import org.apache.nutch.searcher.FieldQueryFilter; import org.apache.nutch.searcher.RawFieldQueryFilter; import org.apache.hadoop.conf.Configuration; @@ -130,10 +134,40 @@ // TODO: Warning, but ignore it. } } + QueryFilter filter = new RawFieldQueryFilterImpl( name, lowerCase, boost ); - + this.filters.add( filter ); } + else if ( "group".equals( spec[0] ) ) + { + String name = spec[1]; + boolean lowerCase = true; + String delimiter = ","; + float boost = 1.0f; + if ( spec.length > 2 ) + { + lowerCase = Boolean.parseBoolean( spec[2] ); + } + if ( spec.length > 3 ) + { + delimiter = spec[3]; + } + if ( spec.length > 4 ) + { + try + { + boost = Float.parseFloat( spec[4] ); + } + catch ( NumberFormatException nfe ) + { + // TODO: Warning, but ignore it. + } + } + QueryFilter filter = new GroupedQueryFilter( name, delimiter, lowerCase, boost ); + + this.filters.add( filter ); + } else { // TODO: Warning uknown filter type @@ -175,4 +209,78 @@ } } + public class GroupedQueryFilter implements QueryFilter + { + private String field; + private String delimiter; + private boolean lowerCase; + private float boost; + private Configuration conf; + + /** Construct for the named field, potentially lowercasing query values.*/ + public GroupedQueryFilter( String field, String delimiter, boolean lowerCase, float boost ) + { + this.field = field; + this.delimiter = delimiter; + this.lowerCase = lowerCase; + this.boost = boost; + + // Use the same conf as the owning instance. + this.setConf( ConfigurableQueryFilter.this.conf ); + } + + public BooleanQuery filter( Query input, BooleanQuery output ) + throws QueryException + { + // examine each clause in the Nutch query + for ( Clause c : input.getClauses() ) + { + // skip non-matching clauses + if ( !c.getField( ).equals( field ) ) continue; + + // get the field value from the clause + // raw fields are guaranteed to be Terms, not Phrases + String values = c.getTerm().toString(); + + BooleanQuery group = new BooleanQuery( output.isCoordDisabled( ) ); + for ( String value : values.split( this.delimiter ) ) + { + if (lowerCase) value = value.toLowerCase(); + + // Create a Lucene TermQuery for this value + TermQuery term = new TermQuery( new Term( field, value ) ); + + term.setBoost(boost); + + // Add it to the group + group.add( term, BooleanClause.Occur.SHOULD ); + } + + // Finally add the group to the overall query. The group's + // must/not/should is taken from the original Nutch clause + // with the multiple values. + output.add( group, (c.isProhibited() + ? BooleanClause.Occur.MUST_NOT + : (c.isRequired() + ? BooleanClause.Occur.MUST + : BooleanClause.Occur.SHOULD + ) + )); + } + + // return the modified Lucene query + return output; + } + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) + { + return this.conf; + } + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |