From: <bi...@us...> - 2010-11-16 23:17:11
|
Revision: 3336 http://archive-access.svn.sourceforge.net/archive-access/?rev=3336&view=rev Author: binzino Date: 2010-11-16 23:17:04 +0000 (Tue, 16 Nov 2010) Log Message: ----------- Use Nutch parsers for text and html, Tika for the rest. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:16:35 UTC (rev 3335) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:17:04 UTC (rev 3336) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-tika|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:16:35 UTC (rev 3335) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:17:04 UTC (rev 3336) @@ -107,15 +107,15 @@ </mimeType> <mimeType name="text/html"> - <plugin id="parse-tika" /> + <plugin id="parse-html" /> </mimeType> <mimeType name="application/xhtml+xml"> - <plugin id="parse-tika" /> + <plugin id="parse-html" /> </mimeType> <mimeType name="text/plain"> - <plugin id="parse-tika" /> + <plugin id="parse-text" /> </mimeType> <mimeType name="text/richtext"> @@ 
-150,8 +150,9 @@ <aliases> <alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.Parser" /> <alias name="parse-ext" extension-id="ExtParser" /> + <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> + <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> <!-- - <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> <alias name="parse-mspowerpoint" extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" /> @@ -161,7 +162,6 @@ <alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" /> <alias name="feed" extension-id="org.apache.nutch.parse.feed.FeedParser" /> <alias name="parse-swf" extension-id="org.apache.nutch.parse.swf.SWFParser" /> - <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-zip" extension-id="org.apache.nutch.parse.zip.ZipParser" /> --> </aliases> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-08-18 22:52:57
|
Revision: 3507 http://archive-access.svn.sourceforge.net/archive-access/?rev=3507&view=rev Author: binzino Date: 2011-08-18 22:52:50 +0000 (Thu, 18 Aug 2011) Log Message: ----------- Fix ARI-2791. Add magic patterns for UTF-16 HTML files. Enable encoding detection. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-08-16 11:16:32 UTC (rev 3506) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-08-18 22:52:50 UTC (rev 3507) @@ -181,4 +181,12 @@ <value>false</value> </property> +<property> + <name>encodingdetector.charset.min.confidence</name> + <value>1</value> + <description>A integer between 0-100 indicating minimum confidence value + for charset auto-detection. Any negative value disables auto-detection. 
+ </description> +</property> + </configuration> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml 2011-08-16 11:16:32 UTC (rev 3506) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml 2011-08-18 22:52:50 UTC (rev 3507) @@ -3584,6 +3584,15 @@ <match value="&lt;H1" type="string" offset="0"/> <match value="&lt;!doctype HTML" type="string" offset="0"/> <match value="&lt;!DOCTYPE html" type="string" offset="0"/> + <!-- UTF-16BE BOM : Either <HTML or <html --> + <match value="0xfeff003c00480054004d004c" type="string" offset="0"/> + <match value="0xfeff003c00680074006d006c" type="string" offset="0"/> + <!-- UTF-16LE BOM : Either <HTML or <html --> + <match value="0xfffe3c00480054004d004c00" type="string" offset="0"/> + <match value="0xfffe3c00680074006d006c00" type="string" offset="0"/> + <!-- UTF-8 BOM : Either <HTML or <html --> + <match value="0xefbbbf3c48544d4c" type="string" offset="0"/> + <match value="0xefbbbf3c68746d6c" type="string" offset="0"/> </magic> <glob pattern="*.html"/> <glob pattern="*.htm"/> @@ -3600,11 +3609,11 @@ <match value="//" type="string" offset="0"/> <match value=";;" type="string" offset="0"/> <!-- UTF-16BE BOM --> - <match value="0xfeff" type="string" offset="0"/> +<!-- <match value="0xfeff" type="string" offset="0"/> --> <!-- UTF-16LE BOM --> - <match value="0xfffe" type="string" offset="0"/> +<!-- <match value="0xfffe" type="string" offset="0"/> --> <!-- UTF-8 BOM --> - <match value="0xefbbbf" type="string" offset="0"/> +<!-- <match value="0xefbbbf" type="string" offset="0"/> --> </magic> <glob pattern="*.txt"/> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |