Thread: [Archive-access-cvs] SF.net SVN: archive-access: [2331] trunk/archive-access/projects/nutchwax/ arc

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2331
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2331&view=rev
Author:   binzino
Date:     2008-06-26 15:35:33 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Add WaybackURLFilter configuration.  Add archive-digest field to
indexing and query plugin configurations.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-06-26 22:34:24 UTC (rev 2330)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-06-26 22:35:33 UTC (rev 2331)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value>
+  <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value>
 </property>
 
 <property>
@@ -24,6 +24,7 @@
     -->
   <name>nutchwax.filter.index</name>
   <value>
+    archive-digest:false
     arcname:false
     collection
     date
@@ -45,6 +46,7 @@
   <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter -->
   <name>nutchwax.filter.query</name>
   <value>
+    raw:archive-digest:false
     raw:arcname:false
     group:collection
     group:type
@@ -62,8 +64,19 @@
 <property>
   <name>mime.type.magic</name>
   <value>false</value>
-  <description>Defines if the mime content type detector uses magic resolution.
-  </description>
+  <description>Defines if the mime content type detector uses magic resolution.</description>
 </property>
 
+<property>
+  <name>nutchwax.urlfilter.wayback.exclusions</name>
+  <value></value>
+  <description>Path to file containing list of exclusions.</description>
+</property>
+
+<property>
+  <name>nutchwax.urlfilter.wayback.canonicalizer</name>
+  <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
+  <description></description>
+</property>
+
 </configuration>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Thread: [Archive-access-cvs] SF.net SVN: archive-access: [2331] trunk/archive-access/projects/nutchwax/ arc

archive-access-cvs