Thread: [Archive-access-cvs] SF.net SVN: archive-access: [2289] trunk/archive-access/projects/nutchwax/ arc

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2289
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2289&view=rev
Author:   binzino
Date:     2008-06-10 14:36:32 -0700 (Tue, 10 Jun 2008)

Log Message:
-----------
Added GroupedQueryFilter, essentially a RawFieldQueryFilter that
allows for multiple values.  Added use thereof to conf/nutch-site.xml

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
    trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-06-05 22:49:14 UTC (rev 2288)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-06-10 21:36:32 UTC (rev 2289)
@@ -33,17 +33,21 @@
 
 <property>
   <!-- Configure the 'query-nutchwax' plugin.  Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not.  
-       The specification format is "raw:name:lowercase:boost" or "field:name:boost".  Default values are
+       The specification format is one of:
+          field:<name>:<boost>
+          raw:<name>:<lowercase>:<boost>
+          group:<name>:<lowercase>:<delimiter>:<boost>
+       Default values are
           lowercase = true
+          delimiter = ","
           boost     = 1.0f
-       There is no "lowercase" property for "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter.
-       AFAICT, the order isn't important. -->
+       -->
   <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter -->
   <name>nutchwax.filter.query</name>
   <value>
     raw:arcname:false
-    raw:collection
-    raw:type
+    group:collection
+    group:type
     field:anchor
     field:content
     field:host

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java	2008-06-05 22:49:14 UTC (rev 2288)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java	2008-06-10 21:36:32 UTC (rev 2289)
@@ -24,10 +24,14 @@
 import java.util.ArrayList;
 
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.index.Term;
 
 import org.apache.nutch.searcher.QueryFilter;
 import org.apache.nutch.searcher.QueryException;
 import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.Query.Clause;
 import org.apache.nutch.searcher.FieldQueryFilter;
 import org.apache.nutch.searcher.RawFieldQueryFilter;
 import org.apache.hadoop.conf.Configuration;
@@ -130,10 +134,40 @@
                     // TODO: Warning, but ignore it.
                   }
               }
+
             QueryFilter filter = new RawFieldQueryFilterImpl( name, lowerCase, boost );
-
+            
             this.filters.add( filter );
           }
+        else if ( "group".equals( spec[0] ) )
+          {
+            String  name      = spec[1];
+            boolean lowerCase = true;
+            String  delimiter = ",";
+            float   boost     = 1.0f;
+            if ( spec.length > 2 )
+              {
+                lowerCase = Boolean.parseBoolean( spec[2] );
+              }
+            if ( spec.length > 3 )
+              {
+                delimiter = spec[3];
+              }
+            if ( spec.length > 4 )
+              {
+                try
+                  {
+                    boost = Float.parseFloat( spec[4] );
+                  }
+                catch ( NumberFormatException nfe )
+                  {
+                    // TODO: Warning, but ignore it.
+                  }
+              }
+            QueryFilter filter = new GroupedQueryFilter( name, delimiter, lowerCase, boost );
+            
+            this.filters.add( filter );
+          }
         else
           {
             // TODO: Warning uknown filter type
@@ -175,4 +209,78 @@
     }
   }
 
+  public class GroupedQueryFilter implements QueryFilter
+  {
+    private String  field;
+    private String  delimiter;
+    private boolean lowerCase;
+    private float   boost;
+    private Configuration conf;
+    
+    /** Construct for the named field, potentially lowercasing query values.*/
+    public GroupedQueryFilter( String field, String delimiter, boolean lowerCase, float boost )
+    {
+      this.field     = field;
+      this.delimiter = delimiter;
+      this.lowerCase = lowerCase;
+      this.boost     = boost;
+
+      // Use the same conf as the owning instance.
+      this.setConf( ConfigurableQueryFilter.this.conf );
+    }
+    
+    public BooleanQuery filter( Query input, BooleanQuery output )
+      throws QueryException 
+    {
+      // examine each clause in the Nutch query
+      for ( Clause c : input.getClauses() )
+        {
+          // skip non-matching clauses
+          if ( !c.getField( ).equals( field ) ) continue;
+          
+          // get the field value from the clause
+          // raw fields are guaranteed to be Terms, not Phrases
+          String values = c.getTerm().toString();
+
+          BooleanQuery group = new BooleanQuery( output.isCoordDisabled( ) );
+          for ( String value : values.split( this.delimiter ) )
+            {
+              if (lowerCase) value = value.toLowerCase();
+
+              // Create a Lucene TermQuery for this value
+              TermQuery term = new TermQuery( new Term( field, value ) );
+
+              term.setBoost(boost);
+
+              // Add it to the group
+              group.add( term, BooleanClause.Occur.SHOULD );
+            }
+
+          // Finally add the group to the overall query.  The group's
+          // must/not/should is taken from the original Nutch clause
+          // with the multiple values.
+          output.add( group, (c.isProhibited()
+                              ? BooleanClause.Occur.MUST_NOT
+                              : (c.isRequired()
+                                 ? BooleanClause.Occur.MUST
+                                 : BooleanClause.Occur.SHOULD
+                                 )
+                              ));
+        }
+      
+      // return the modified Lucene query
+      return output;
+    }
+
+    public void setConf( Configuration conf )
+    {
+      this.conf = conf;
+    }
+    
+    public Configuration getConf( )
+    {
+      return this.conf;
+    }
+  }
+
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Thread: [Archive-access-cvs] SF.net SVN: archive-access: [2289] trunk/archive-access/projects/nutchwax/ arc

archive-access-cvs