|
From: <bi...@us...> - 2010-11-16 23:17:11
|
Revision: 3336
http://archive-access.svn.sourceforge.net/archive-access/?rev=3336&view=rev
Author: binzino
Date: 2010-11-16 23:17:04 +0000 (Tue, 16 Nov 2010)
Log Message:
-----------
Use the native Nutch parsers for plain text and HTML, and Tika for all other content types.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:16:35 UTC (rev 3335)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:17:04 UTC (rev 3336)
@@ -10,7 +10,7 @@
<!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
<!-- Also, add 'parse-pdf' -->
<!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
- <value>protocol-http|parse-tika|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+ <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
</property>
<!--
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:16:35 UTC (rev 3335)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:17:04 UTC (rev 3336)
@@ -107,15 +107,15 @@
</mimeType>
<mimeType name="text/html">
- <plugin id="parse-tika" />
+ <plugin id="parse-html" />
</mimeType>
<mimeType name="application/xhtml+xml">
- <plugin id="parse-tika" />
+ <plugin id="parse-html" />
</mimeType>
<mimeType name="text/plain">
- <plugin id="parse-tika" />
+ <plugin id="parse-text" />
</mimeType>
<mimeType name="text/richtext">
@@ -150,8 +150,9 @@
<aliases>
<alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.Parser" />
<alias name="parse-ext" extension-id="ExtParser" />
+ <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" />
+ <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" />
<!--
- <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" />
<alias name="parse-js" extension-id="JSParser" />
<alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
<alias name="parse-mspowerpoint" extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
@@ -161,7 +162,6 @@
<alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" />
<alias name="feed" extension-id="org.apache.nutch.parse.feed.FeedParser" />
<alias name="parse-swf" extension-id="org.apache.nutch.parse.swf.SWFParser" />
- <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" />
<alias name="parse-zip" extension-id="org.apache.nutch.parse.zip.ZipParser" />
-->
</aliases>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|