From: <bi...@us...> - 2011-08-18 22:52:57
|
Revision: 3507 http://archive-access.svn.sourceforge.net/archive-access/?rev=3507&view=rev Author: binzino Date: 2011-08-18 22:52:50 +0000 (Thu, 18 Aug 2011) Log Message: ----------- Fix ARI-2791. Add magic patterns for UTF-16 HTML files. Enable encoding detection. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-08-16 11:16:32 UTC (rev 3506) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-08-18 22:52:50 UTC (rev 3507) @@ -181,4 +181,12 @@ <value>false</value> </property> +<property> + <name>encodingdetector.charset.min.confidence</name> + <value>1</value> + <description>An integer between 0 and 100 indicating minimum confidence value + for charset auto-detection. Any negative value disables auto-detection. 
+ </description> +</property> + </configuration> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml 2011-08-16 11:16:32 UTC (rev 3506) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/tika-mimetypes.xml 2011-08-18 22:52:50 UTC (rev 3507) @@ -3584,6 +3584,15 @@ <match value="<H1" type="string" offset="0"/> <match value="<!doctype HTML" type="string" offset="0"/> <match value="<!DOCTYPE html" type="string" offset="0"/> + <!-- UTF-16BE BOM : Either <HTML or <html --> + <match value="0xfeff003c00480054004d004c" type="string" offset="0"/> + <match value="0xfeff003c00680074006d006c" type="string" offset="0"/> + <!-- UTF-16LE BOM : Either <HTML or <html --> + <match value="0xfffe3c00480054004d004c00" type="string" offset="0"/> + <match value="0xfffe3c00680074006d006c00" type="string" offset="0"/> + <!-- UTF-8 BOM : Either <HTML or <html --> + <match value="0xefbbbf3c48544d4c" type="string" offset="0"/> + <match value="0xefbbbf3c68746d6c" type="string" offset="0"/> </magic> <glob pattern="*.html"/> <glob pattern="*.htm"/> @@ -3600,11 +3609,11 @@ <match value="//" type="string" offset="0"/> <match value=";;" type="string" offset="0"/> <!-- UTF-16BE BOM --> - <match value="0xfeff" type="string" offset="0"/> +<!-- <match value="0xfeff" type="string" offset="0"/> --> <!-- UTF-16LE BOM --> - <match value="0xfffe" type="string" offset="0"/> +<!-- <match value="0xfffe" type="string" offset="0"/> --> <!-- UTF-8 BOM --> - <match value="0xefbbbf" type="string" offset="0"/> +<!-- <match value="0xefbbbf" type="string" offset="0"/> --> </magic> <glob pattern="*.txt"/> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |