From: <mgu...@us...> - 2015-04-16 15:53:18
|
Revision: 347 http://sourceforge.net/p/nekohtml/code/347 Author: mguillem Date: 2015-04-16 15:53:16 +0000 (Thu, 16 Apr 2015) Log Message: ----------- improved detection of compatible encodings from meta charset when only decode is supported (patch from Steve McKay) Issue #20 Modified Paths: -------------- trunk/doc/changes.html trunk/src/org/cyberneko/html/HTMLScanner.java Added Paths: ----------- trunk/data/meta/test-meta-encoding3.html trunk/data/meta/test-meta-encoding3.html.canonical trunk/data/meta/test-meta-encoding3.html.settings Added: trunk/data/meta/test-meta-encoding3.html =================================================================== --- trunk/data/meta/test-meta-encoding3.html (rev 0) +++ trunk/data/meta/test-meta-encoding3.html 2015-04-16 15:53:16 UTC (rev 347) @@ -0,0 +1,14 @@ +<head> +<meta http-equiv="Content-Type" content="text/html;charset=iso-2022-cn"> +</head> +$)AKNLe + +PB + +$)G\XM||U + +IzN"~ + +d;\XM||U + +$)A#? Added: trunk/data/meta/test-meta-encoding3.html.canonical =================================================================== --- trunk/data/meta/test-meta-encoding3.html.canonical (rev 0) +++ trunk/data/meta/test-meta-encoding3.html.canonical 2015-04-16 15:53:16 UTC (rev 347) @@ -0,0 +1,13 @@ +(HTML +(HEAD +"\n +(META +Acontent text/html;charset=iso-2022-cn +Ahttp-equiv Content-Type +)META +"\n +)HEAD +(BODY +"\n宋体\n\n新\n\n細明體\n\n宋体\n\n浠茇忘�\n\n?\n\n +)BODY +)HTML Added: trunk/data/meta/test-meta-encoding3.html.settings =================================================================== --- trunk/data/meta/test-meta-encoding3.html.settings (rev 0) +++ trunk/data/meta/test-meta-encoding3.html.settings 2015-04-16 15:53:16 UTC (rev 347) @@ -0,0 +1 @@ +property http://cyberneko.org/html/properties/default-encoding UTF-8 Modified: trunk/doc/changes.html =================================================================== --- trunk/doc/changes.html 2015-04-16 15:42:57 UTC (rev 346) +++ trunk/doc/changes.html 2015-04-16 15:53:16 UTC (rev 347) @@ -31,7 +31,8 @@ <dt>Version 1.9.22 (to be released)</dt> <dd>Element <code>NOBR</code> closes <code>NOBR</code>, <code>BUTTON</code> closes <code>BUTTON</code> (patch from Ronald Brill), element <code>EMBED</code> has no body (patch from Ronald Brill), - element <code>A</code> shouldn't be inline (patch from Ahmed Ashour). + element <code>A</code> shouldn't be inline (patch from Ahmed Ashour), + improved detection of compatible encodings from meta charset when only decode is supported (patch from Steve McKay). </dd> <dt>Version 1.9.21 (2 Jun 2014)</dt> Modified: trunk/src/org/cyberneko/html/HTMLScanner.java =================================================================== --- trunk/src/org/cyberneko/html/HTMLScanner.java 2015-04-16 15:42:57 UTC (rev 346) +++ trunk/src/org/cyberneko/html/HTMLScanner.java 2015-04-16 15:53:16 UTC (rev 347) @@ -3709,17 +3709,33 @@ * be the same in both encodings */ boolean isEncodingCompatible(final String encoding1, final String encoding2) { - final String reference = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset="; try { - final byte[] bytesEncoding1 = reference.getBytes(encoding1); - final String referenceWithEncoding2 = new String(bytesEncoding1, encoding2); - return reference.equals(referenceWithEncoding2); + try { + return canRoundtrip(encoding1, encoding2); + } + catch (final UnsupportedOperationException e) { + // if encoding1 only supports decode, we can test it the other way to only decode with it + try { + return canRoundtrip(encoding2, encoding1); + } + catch (final UnsupportedOperationException e1) { + // encoding2 only supports decode too. Time to give up. + return false; + } + } } catch (final UnsupportedEncodingException e) { return false; } } + private boolean canRoundtrip(final String encodeCharset, final String decodeCharset) throws UnsupportedEncodingException { + final String reference = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset="; + final byte[] bytesEncoding1 = reference.getBytes(encodeCharset); + final String referenceWithEncoding2 = new String(bytesEncoding1, decodeCharset); + return reference.equals(referenceWithEncoding2); + } + private boolean endsWith(final XMLStringBuffer buffer, final String string) { final int l = string.length(); if (buffer.length < l) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |