From: <pj...@us...> - 2009-05-30 03:22:30
|
Revision: 6424 http://jython.svn.sourceforge.net/jython/?rev=6424&view=rev Author: pjenvey Date: 2009-05-30 02:08:23 +0000 (Sat, 30 May 2009) Log Message: ----------- o parse raw unicode escapes o add \U support and fix handling of EOF during a truncated escape in the raw unicode escape decoder fixes #1355 Modified Paths: -------------- trunk/jython/Lib/test/test_unicode_jy.py trunk/jython/NEWS trunk/jython/src/org/python/antlr/GrammarActions.java trunk/jython/src/org/python/core/codecs.java Modified: trunk/jython/Lib/test/test_unicode_jy.py =================================================================== --- trunk/jython/Lib/test/test_unicode_jy.py 2009-05-30 00:46:58 UTC (rev 6423) +++ trunk/jython/Lib/test/test_unicode_jy.py 2009-05-30 02:08:23 UTC (rev 6424) @@ -51,6 +51,29 @@ self.assertEqual(ord(bar[2]), 92) self.assertEqual(ord(bar[3]), 110) + for baz in ur'Hello\u0020World !', ur'Hello\U00000020World !': + self.assertEqual(len(baz), 13, repr(baz)) + self.assertEqual(repr(baz), "u'Hello World !'") + self.assertEqual(ord(baz[5]), 32) + + quux = ur'\U00100000' + self.assertEqual(repr(quux), "u'\\U00100000'") + if sys.maxunicode == 0xffff: + self.assertEqual(len(quux), 2) + self.assertEqual(ord(quux[0]), 56256) + self.assertEqual(ord(quux[1]), 56320) + else: + self.assertEqual(len(quux), 1) + self.assertEqual(ord(quux), 1048576) + + def test_raw_unicode_escape(self): + foo = u'\U00100000' + self.assertEqual(foo.encode('raw_unicode_escape'), '\\U00100000') + self.assertEqual(foo.encode('raw_unicode_escape').decode('raw_unicode_escape'), + foo) + for bar in '\\u', '\\u000', '\\U00000': + self.assertRaises(UnicodeDecodeError, bar.decode, 'raw_unicode_escape') + def test_encode_decimal(self): self.assertEqual(int(u'\u0039\u0032'), 92) self.assertEqual(int(u'\u0660'), 0) Modified: trunk/jython/NEWS =================================================================== --- trunk/jython/NEWS 2009-05-30 00:46:58 UTC (rev 6423) +++ trunk/jython/NEWS 2009-05-30 02:08:23 UTC (rev 6424) @@ -9,6 +9,7 @@ Fix file's repr with Windows paths Fix urllib and urllib2 path handling on Windows Fix r'\Jython25' not considered an abspath on Windows + Fix handling of raw unicode escapes Jython 2.5.0 rc3 Bugs fixed Modified: trunk/jython/src/org/python/antlr/GrammarActions.java =================================================================== --- trunk/jython/src/org/python/antlr/GrammarActions.java 2009-05-30 00:46:58 UTC (rev 6423) +++ trunk/jython/src/org/python/antlr/GrammarActions.java 2009-05-30 02:08:23 UTC (rev 6424) @@ -9,6 +9,7 @@ import org.python.core.PyLong; import org.python.core.PyString; import org.python.core.PyUnicode; +import org.python.core.codecs; import org.python.antlr.ast.alias; import org.python.antlr.ast.arguments; import org.python.antlr.ast.boolopType; @@ -441,8 +442,12 @@ ustring); } } else if (raw) { - // Raw str without an encoding or raw unicode: simply passthru + // Raw str without an encoding or raw unicode string = string.substring(start, end); + if (ustring) { + // Raw unicode: handle unicode escapes + string = codecs.PyUnicode_DecodeRawUnicodeEscape(string, "strict"); + } } else { // Plain unicode: already decoded, just handle escapes string = PyString.decode_UnicodeEscape(string, start, end, "strict", ustring); Modified: trunk/jython/src/org/python/core/codecs.java =================================================================== --- trunk/jython/src/org/python/core/codecs.java 2009-05-30 00:46:58 UTC (rev 6423) +++ trunk/jython/src/org/python/core/codecs.java 2009-05-30 02:08:23 UTC (rev 6424) @@ -920,45 +920,54 @@ private static char[] hexdigit = "0123456789ABCDEF".toCharArray(); // The modified flag is used by cPickle. - public static String PyUnicode_EncodeRawUnicodeEscape(String str, - String errors, - boolean modifed) { - - int size = str.length(); + public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors, + boolean modifed) { StringBuilder v = new StringBuilder(str.length()); - for (int i = 0; i < size; i++) { - char ch = str.charAt(i); - if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) { + for (Iterator<Integer> iter = new PyUnicode(str).newSubsequenceIterator(); + iter.hasNext();) { + int codePoint = iter.next(); + if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) { + // Map 32-bit characters to '\\Uxxxxxxxx' + v.append("\\U"); + v.append(hexdigit[(codePoint >> 28) & 0xF]); + v.append(hexdigit[(codePoint >> 24) & 0xF]); + v.append(hexdigit[(codePoint >> 20) & 0xF]); + v.append(hexdigit[(codePoint >> 16) & 0xF]); + v.append(hexdigit[(codePoint >> 12) & 0xF]); + v.append(hexdigit[(codePoint >> 8) & 0xF]); + v.append(hexdigit[(codePoint >> 4) & 0xF]); + v.append(hexdigit[codePoint & 0xF]); + } else if (codePoint >= 256 || (modifed && (codePoint == '\\' || codePoint == '\n'))) { + // Map 16-bit chararacters to '\\uxxxx' v.append("\\u"); - v.append(hexdigit[(ch >>> 12) & 0xF]); - v.append(hexdigit[(ch >>> 8) & 0xF]); - v.append(hexdigit[(ch >>> 4) & 0xF]); - v.append(hexdigit[ch & 0xF]); + v.append(hexdigit[(codePoint >> 12) & 0xF]); + v.append(hexdigit[(codePoint >> 8) & 0xF]); + v.append(hexdigit[(codePoint >> 4) & 0xF]); + v.append(hexdigit[codePoint & 0xF]); } else { - v.append(ch); + v.append((char)codePoint); } } return v.toString(); } - public static String PyUnicode_DecodeRawUnicodeEscape(String str, - String errors) { + public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) { int size = str.length(); StringBuilder v = new StringBuilder(size); + for (int i = 0; i < size;) { char ch = str.charAt(i); - /* Non-escape characters are interpreted as Unicode ordinals */ + // Non-escape characters are interpreted as Unicode ordinals if (ch != '\\') { v.append(ch); i++; continue; } - /* - * \\u-escapes are only interpreted iff the number of leading - * backslashes is odd - */ + + // \\u-escapes are only interpreted if the number of leading backslashes is + // odd int bs = i; while (i < size) { ch = str.charAt(i); @@ -968,34 +977,37 @@ v.append(ch); i++; } - if (((i - bs) & 1) == 0 || i >= size || ch != 'u') { + if (((i - bs) & 1) == 0 || i >= size || (ch != 'u' && ch != 'U')) { continue; } v.setLength(v.length() - 1); + int count = ch == 'u' ? 4 : 8; i++; - /* \\uXXXX with 4 hex digits */ - int x = 0, d = 0, j = 0; - for (; j < 4; j++) { - ch = str.charAt(i + j); - d = Character.digit(ch, 16); - if (d == -1) { + + // \\uXXXX with 4 hex digits, \Uxxxxxxxx with 8 + int codePoint = 0, asDigit = -1; + for (int j = 0; j < count; i++, j++) { + if (i == size) { + // EOF in a truncated escape + asDigit = -1; break; } - x = ((x << 4) & ~0xF) + d; + + ch = str.charAt(i); + asDigit = Character.digit(ch, 16); + if (asDigit == -1) { + break; + } + codePoint = ((codePoint << 4) & ~0xF) + asDigit; } - if (d == -1) { - i = codecs.insertReplacementAndGetResume(v, - errors, - "unicodeescape", - str, - bs, - i + j, - "truncated \\uXXXX"); + if (asDigit == -1) { + i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i, + "truncated \\uXXXX"); } else { - i += 4; - v.append((char) x); + v.appendCodePoint(codePoint); } } + return v.toString(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |