From: <bi...@us...> - 2008-06-10 21:36:36
|
Revision: 2289 http://archive-access.svn.sourceforge.net/archive-access/?rev=2289&view=rev Author: binzino Date: 2008-06-10 14:36:32 -0700 (Tue, 10 Jun 2008) Log Message: ----------- Added GroupedQueryFilter, essentially a RawFieldQueryFilter that allows for multiple values. Added use thereof to conf/nutch-site.xml Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-05 22:49:14 UTC (rev 2288) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-10 21:36:32 UTC (rev 2289) @@ -33,17 +33,21 @@ <property> <!-- Configure the 'query-nutchwax' plugin. Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not. - The specification format is "raw:name:lowercase:boost" or "field:name:boost". Default values are + The specification format is one of: + field:<name>:<boost> + raw:<name>:<lowercase>:<boost> + group:<name>:<lowercase>:<delimiter>:<boost> + Default values are lowercase = true + delimiter = "," boost = 1.0f - There is no "lowercase" property for "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter. - AFAICT, the order isn't important. 
--> + --> <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> raw:arcname:false - raw:collection - raw:type + group:collection + group:type field:anchor field:content field:host Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java 2008-06-05 22:49:14 UTC (rev 2288) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java 2008-06-10 21:36:32 UTC (rev 2289) @@ -24,10 +24,14 @@ import java.util.ArrayList; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.index.Term; import org.apache.nutch.searcher.QueryFilter; import org.apache.nutch.searcher.QueryException; import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.Query.Clause; import org.apache.nutch.searcher.FieldQueryFilter; import org.apache.nutch.searcher.RawFieldQueryFilter; import org.apache.hadoop.conf.Configuration; @@ -130,10 +134,40 @@ // TODO: Warning, but ignore it. 
} } + QueryFilter filter = new RawFieldQueryFilterImpl( name, lowerCase, boost ); - + this.filters.add( filter ); } + else if ( "group".equals( spec[0] ) ) + { + String name = spec[1]; + boolean lowerCase = true; + String delimiter = ","; + float boost = 1.0f; + if ( spec.length > 2 ) + { + lowerCase = Boolean.parseBoolean( spec[2] ); + } + if ( spec.length > 3 ) + { + delimiter = spec[3]; + } + if ( spec.length > 4 ) + { + try + { + boost = Float.parseFloat( spec[4] ); + } + catch ( NumberFormatException nfe ) + { + // TODO: Warning, but ignore it. + } + } + QueryFilter filter = new GroupedQueryFilter( name, delimiter, lowerCase, boost ); + + this.filters.add( filter ); + } else { // TODO: Warning uknown filter type @@ -175,4 +209,78 @@ } } + public class GroupedQueryFilter implements QueryFilter + { + private String field; + private String delimiter; + private boolean lowerCase; + private float boost; + private Configuration conf; + + /** Construct for the named field, potentially lowercasing query values.*/ + public GroupedQueryFilter( String field, String delimiter, boolean lowerCase, float boost ) + { + this.field = field; + this.delimiter = delimiter; + this.lowerCase = lowerCase; + this.boost = boost; + + // Use the same conf as the owning instance. 
+ this.setConf( ConfigurableQueryFilter.this.conf ); + } + + public BooleanQuery filter( Query input, BooleanQuery output ) + throws QueryException + { + // examine each clause in the Nutch query + for ( Clause c : input.getClauses() ) + { + // skip non-matching clauses + if ( !c.getField( ).equals( field ) ) continue; + + // get the field value from the clause + // raw fields are guaranteed to be Terms, not Phrases + String values = c.getTerm().toString(); + + BooleanQuery group = new BooleanQuery( output.isCoordDisabled( ) ); + for ( String value : values.split( this.delimiter ) ) + { + if (lowerCase) value = value.toLowerCase(); + + // Create a Lucene TermQuery for this value + TermQuery term = new TermQuery( new Term( field, value ) ); + + term.setBoost(boost); + + // Add it to the group + group.add( term, BooleanClause.Occur.SHOULD ); + } + + // Finally add the group to the overall query. The group's + // must/not/should is taken from the original Nutch clause + // with the multiple values. + output.add( group, (c.isProhibited() + ? BooleanClause.Occur.MUST_NOT + : (c.isRequired() + ? BooleanClause.Occur.MUST + : BooleanClause.Occur.SHOULD + ) + )); + } + + // return the modified Lucene query + return output; + } + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) + { + return this.conf; + } + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-14 21:24:27
|
Revision: 2441 http://archive-access.svn.sourceforge.net/archive-access/?rev=2441&view=rev Author: binzino Date: 2008-07-14 14:24:37 -0700 (Mon, 14 Jul 2008) Log Message: ----------- Fix JIRA: WAX-10, WAX-11, WAX-12. Added 'fileoffset' metadata field and changed 'arcname' to 'filename'. Also add 'exacturl' to be copy of 'url' that is untokenized so it can be matched exactly. All of these changes are driven by Wayback-NutchWAX integration. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-14 21:24:37 UTC (rev 2441) @@ -44,9 +44,11 @@ <name>nutchwax.filter.index</name> <value> url:false:true:true + url:flase:true:false:true:exacturl orig:false digest:false - arcname:false + filename:false + fileoffset:false collection date type @@ -68,7 +70,9 @@ <name>nutchwax.filter.query</name> <value> raw:digest:false - raw:arcname:false + raw:filename:false + raw:fileoffset:false + raw:exacturl:false group:collection group:type field:anchor Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-14 21:24:37 
UTC (rev 2441) @@ -158,8 +158,8 @@ } /** - * <p>Runs the Map job to translate an arc file into output for Nutch - * segments.</p> + * <p>Runs the Map job to import records from an archive file into a + * Nutch segment.</p> * * @param key Line number in manifest corresponding to the <code>value</code> * @param value A line from the manifest @@ -306,7 +306,8 @@ contentMetadata.set( NutchWax.ORIG_KEY, key ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); @@ -360,7 +361,11 @@ } /** - * + * Writes the key and related content to the output collector. The + * division between <code>importRecord</code> and + * <code>output</code> is merely based on the way the code was + * structured in the <code>ArcSegmentCreator.java</code> which was + * used as a starting-point for this class. */ private void output( OutputCollector output, Text key, @@ -563,7 +568,10 @@ } /** - * + * Runs the import job with the given arguments. This method + * assumes that it is being run via the command-line; as such, it + * emits error messages regarding invalid/missing arguments to the + * system error stream. */ public int run( String[] args ) throws Exception { @@ -630,6 +638,8 @@ catch ( Exception e ) { LOG.fatal( "Importer: ", e ); + System.err.println( "Fatal error: " + e ); + e.printStackTrace( System.err ); return -1; } @@ -637,7 +647,7 @@ } /** - * + * Emit usage information for command-line driver. */ public void usage( ) { @@ -655,7 +665,7 @@ } /** - * + * Command-line driver. Runs the Importer as a Hadoop job. 
*/ public static void main( String args[] ) throws Exception { Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-14 21:24:37 UTC (rev 2441) @@ -24,7 +24,8 @@ { public static final String URL_KEY = "url"; public static final String ORIG_KEY = "orig"; - public static final String ARCNAME_KEY = "arcname"; + public static final String FILENAME_KEY = "filename"; + public static final String FILEOFFSET_KEY = "fileoffset"; public static final String COLLECTION_KEY = "collection"; public static final String CONTENT_TYPE_KEY = "type"; public static final String DATE_KEY = "date"; Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-07-14 21:21:52 UTC (rev 2440) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-07-14 21:24:37 UTC (rev 2441) @@ -40,7 +40,7 @@ point="org.apache.nutch.searcher.QueryFilter"> <implementation id="ConfigurableQueryFilter" class="org.archive.nutchwax.query.ConfigurableQueryFilter"> - <parameter name="raw-fields" value="arcname,collection,date,type" /> + <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" /> <parameter name="fields" value="anchor,content,host,title" /> </implementation> </extension> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |