From: <bi...@us...> - 2010-11-16 23:17:11
|
Revision: 3336
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3336&view=rev
Author:   binzino
Date:     2010-11-16 23:17:04 +0000 (Tue, 16 Nov 2010)

Log Message:
-----------
Use Nutch parsers for text and html, Tika for the rest.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:16:35 UTC (rev 3335)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:17:04 UTC (rev 3336)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-tika|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+  <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
 </property>
 
 <!--

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:16:35 UTC (rev 3335)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:17:04 UTC (rev 3336)
@@ -107,15 +107,15 @@
   </mimeType>
 
   <mimeType name="text/html">
-    <plugin id="parse-tika" />
+    <plugin id="parse-html" />
   </mimeType>
 
   <mimeType name="application/xhtml+xml">
-    <plugin id="parse-tika" />
+    <plugin id="parse-html" />
   </mimeType>
 
   <mimeType name="text/plain">
-    <plugin id="parse-tika" />
+    <plugin id="parse-text" />
   </mimeType>
 
   <mimeType name="text/richtext">
@@ -150,8 +150,9 @@
   <aliases>
     <alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.Parser" />
     <alias name="parse-ext" extension-id="ExtParser" />
+    <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" />
+    <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" />
     <!--
-    <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" />
     <alias name="parse-js" extension-id="JSParser" />
     <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
     <alias name="parse-mspowerpoint" extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
@@ -161,7 +162,6 @@
     <alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" />
     <alias name="feed" extension-id="org.apache.nutch.parse.feed.FeedParser" />
     <alias name="parse-swf" extension-id="org.apache.nutch.parse.swf.SWFParser" />
-    <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" />
     <alias name="parse-zip" extension-id="org.apache.nutch.parse.zip.ZipParser" />
     -->
   </aliases>

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.