From: <bi...@us...> - 2008-06-26 22:35:23
|
Revision: 2331 http://archive-access.svn.sourceforge.net/archive-access/?rev=2331&view=rev Author: binzino Date: 2008-06-26 15:35:33 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Add WaybackURLFilter configuration. Add archive-digest field to indexing and query plugins configurations. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-26 22:34:24 UTC (rev 2330) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-26 22:35:33 UTC (rev 2331) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> </property> <property> @@ -24,6 +24,7 @@ --> <name>nutchwax.filter.index</name> <value> + archive-digest:false arcname:false collection date @@ -45,6 +46,7 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> + raw:archive-digest:false raw:arcname:false group:collection group:type @@ -62,8 +64,19 @@ <property> <name>mime.type.magic</name> <value>false</value> - <description>Defines if the mime content type detector uses magic resolution. 
- </description> + <description>Defines if the mime content type detector uses magic resolution.</description> </property> +<property> + <name>nutchwax.urlfilter.wayback.exclusions</name> + <value></value> + <description>Path to file containing list of exclusions.</description> +</property> + +<property> + <name>nutchwax.urlfilter.wayback.canonicalizer</name> + <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value> + <description></description> +</property> + </configuration> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-29 00:20:07
|
Revision: 2343 http://archive-access.svn.sourceforge.net/archive-access/?rev=2343&view=rev Author: binzino Date: 2008-06-28 17:20:16 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed "archive-digest" to "digest" to match changes in NutchWax code. Added "exclusive" property to ConfigurableIndexingFilter config. Added explicit ordering of index filters so that ours is called last so it can over-write metadata values: url, orig, digest. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-29 00:17:48 UTC (rev 2342) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-29 00:20:16 UTC (rev 2343) @@ -10,21 +10,32 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> </property> <property> + <name>indexingfilter.order</name> + <value> + org.apache.nutch.indexer.basic.BasicIndexingFilter + org.archive.nutchwax.index.ConfigurableIndexingFilter + </value> +</property> + +<property> <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing. 
The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true store = true tokenize = false + exclusive = true dest-key = src-key --> <name>nutchwax.filter.index</name> <value> - archive-digest:false + url:false:true:true + orig:false + digest:false arcname:false collection date @@ -46,7 +57,7 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> - raw:archive-digest:false + raw:digest:false raw:arcname:false group:collection group:type This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-01 22:41:48
|
Revision: 2346 http://archive-access.svn.sourceforge.net/archive-access/?rev=2346&view=rev Author: binzino Date: 2008-07-01 15:41:57 -0700 (Tue, 01 Jul 2008) Log Message: ----------- Added nutchwax.import.content.limit property. And more comments. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-30 20:38:36 UTC (rev 2345) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-01 22:41:57 UTC (rev 2346) @@ -13,6 +13,16 @@ <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> </property> +<!-- The indexing filter order *must* be specified in order for + NutchWAX's ConfigurableIndexingFilter to be called *after* the + BasicIndexingFilter. This is necessary so that the + ConfigurableIndexingFilter can over-write some of the values put + into the Lucene document by the BasicIndexingFilter. + + The over-written values are the 'url' and 'digest' fields, which + NutchWAX needs to handle specially in order for de-duplication to + work properly. + --> <property> <name>indexingfilter.order</name> <value> @@ -78,16 +88,38 @@ <description>Defines if the mime content type detector uses magic resolution.</description> </property> +<!-- Normally, this is specified on the command line when the NutchWAX + Importer is invoked. It can be specified here if the user + prefers. + --> <property> <name>nutchwax.urlfilter.wayback.exclusions</name> <value></value> <description>Path to file containing list of exclusions.</description> </property> +<!-- For CDX-based de-duplication to work properly, you must use the + same Wayback URLCanonicalizer that is used by the "(w)arc-indexer" + utility. 
By default, this is AggressiveUrlCanonicalizer, but + could be IdentityCanonicalizer if you use the "-i" (identity) option + with "(w)arc-indexer". + --> <property> <name>nutchwax.urlfilter.wayback.canonicalizer</name> <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value> <description></description> </property> +<!-- Similar to Nutch's + file.content.limit + http.content.limit + ftp.content.limit + properties, this specifies a limit on the size of a document + imported via NutchWAX. + --> +<property> + <name>nutchwax.import.content.limit</name> + <value>1048576</value> +</property> + </configuration> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |