|
From: <bi...@us...> - 2008-12-16 19:53:29
|
Revision: 2677
http://archive-access.svn.sourceforge.net/archive-access/?rev=2677&view=rev
Author: binzino
Date: 2008-12-16 19:53:25 +0000 (Tue, 16 Dec 2008)
Log Message:
-----------
Changed nutchwax.FetchedSegments.perCollection default value to false.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2008-12-16 19:52:42 UTC (rev 2676)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2008-12-16 19:53:25 UTC (rev 2677)
@@ -144,7 +144,7 @@
-->
<property>
<name>nutchwax.FetchedSegments.perCollection</name>
- <value>true</value>
+ <value>false</value>
</property>
<!-- The following are over-rides of property values in
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2009-05-05 17:53:28
|
Revision: 2696
http://archive-access.svn.sourceforge.net/archive-access/?rev=2696&view=rev
Author: binzino
Date: 2009-05-05 17:52:47 +0000 (Tue, 05 May 2009)
Log Message:
-----------
Fix typo.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-05-05 17:52:20 UTC (rev 2695)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-05-05 17:52:47 UTC (rev 2696)
@@ -44,7 +44,7 @@
<name>nutchwax.filter.index</name>
<value>
url:false:true:true
- url:flase:true:false:true:exacturl
+ url:false:true:false:true:exacturl
orig:false
digest:false
filename:false
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2009-05-05 20:20:48
|
Revision: 2698
http://archive-access.svn.sourceforge.net/archive-access/?rev=2698&view=rev
Author: binzino
Date: 2009-05-05 20:20:45 +0000 (Tue, 05 May 2009)
Log Message:
-----------
Fixed typo.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-05-05 19:24:16 UTC (rev 2697)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-05-05 20:20:45 UTC (rev 2698)
@@ -186,7 +186,7 @@
<property>
<name>searcher.fieldcache</name>
- <property>true</property>
+ <value>true</value>
</property>
</configuration>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2009-10-27 23:00:58
|
Revision: 2843
http://archive-access.svn.sourceforge.net/archive-access/?rev=2843&view=rev
Author: binzino
Date: 2009-10-27 23:00:46 +0000 (Tue, 27 Oct 2009)
Log Message:
-----------
Ported changes/fixes from NW 0.12.9.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 22:52:46 UTC (rev 2842)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 23:00:46 UTC (rev 2843)
@@ -10,19 +10,18 @@
<!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
<!-- Also, add 'parse-pdf' -->
<!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
- <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+ <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
</property>
-<!-- The indexing filter order *must* be specified in order for
- NutchWAX's ConfigurableIndexingFilter to be called *after* the
- BasicIndexingFilter. This is necessary so that the
- ConfigurableIndexingFilter can over-write some of the values put
- into the Lucene document by the BasicIndexingFilter.
-
- The over-written values are the 'url' and 'digest' fields, which
- NutchWAX needs to handle specially in order for de-duplication to
- work properly.
- -->
+<!--
+ When using *only* the 'index-nutchwax' in 'plugin.includes' above,
+ we don't need to specify an order since there is only one plugin.
+
+ However, if you choose to use the Nutch 'index-basic', then you have
+ to specify the order such that the NutchWAX ConfigurableIndexingFilter
+ is after it. Whichever plugin comes last over-writes the values
+ of those that come before it.
+
<property>
<name>indexingfilter.order</name>
<value>
@@ -30,29 +29,31 @@
org.archive.nutchwax.index.ConfigurableIndexingFilter
</value>
</property>
+ -->
<property>
<!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing.
- The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key"
+ The specifications here are of the form "src-key:lowercase:store:index:dest-key"
Where the only required part is the "src-key", the rest will assume the following defaults:
lowercase = true
store = true
- tokenize = false
+ index = tokenized
exclusive = true
dest-key = src-key
-->
<name>nutchwax.filter.index</name>
<value>
- url:false:true:true
- url:false:true:false:true:exacturl
- orig:false
- digest:false
- filename:false
- fileoffset:false
- collection
- date
- type
- length
+ title:false:true:tokenized
+ content:false:false:tokenized
+ site:false:false:untokenized
+
+ url:false:true:tokenized
+ digest:false:true:no
+
+ collection:true:true:no_norms
+ date:true:true:no_norms
+ type:true:true:no_norms
+ length:false:true:no
</value>
</property>
@@ -70,15 +71,10 @@
<!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter -->
<name>nutchwax.filter.query</name>
<value>
- raw:digest:false
- raw:filename:false
- raw:fileoffset:false
- raw:exacturl:false
group:collection
+ group:site:false
group:type
- field:anchor
field:content
- field:host
field:title
</value>
</property>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2009-10-28 21:55:33
|
Revision: 2862
http://archive-access.svn.sourceforge.net/archive-access/?rev=2862&view=rev
Author: binzino
Date: 2009-10-28 21:55:11 +0000 (Wed, 28 Oct 2009)
Log Message:
-----------
Removed the NutchWAX scoring filter, since we now recommend doing scoring/boosting after the index is built.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-28 21:49:56 UTC (rev 2861)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-28 21:55:11 UTC (rev 2862)
@@ -10,7 +10,7 @@
<!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
<!-- Also, add 'parse-pdf' -->
<!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
- <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+ <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|urlfilter-nutchwax</value>
</property>
<!--
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-02-20 03:19:04
|
Revision: 2957
http://archive-access.svn.sourceforge.net/archive-access/?rev=2957&view=rev
Author: binzino
Date: 2010-02-20 03:18:57 +0000 (Sat, 20 Feb 2010)
Log Message:
-----------
WAX-73. Change fieldcache to false. Also added scoring-nutchwax to the plugin list even though we don't normally use it.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2010-02-12 20:54:15 UTC (rev 2956)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2010-02-20 03:18:57 UTC (rev 2957)
@@ -10,7 +10,7 @@
<!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
<!-- Also, add 'parse-pdf' -->
<!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
- <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|urlfilter-nutchwax</value>
+ <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
</property>
<!--
@@ -182,7 +182,7 @@
<property>
<name>searcher.fieldcache</name>
- <value>true</value>
+ <value>false</value>
</property>
</configuration>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|