[Jython-checkins] SF.net SVN: jython:[6424] trunk/jython

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 6424
          http://jython.svn.sourceforge.net/jython/?rev=6424&view=rev
Author:   pjenvey
Date:     2009-05-30 02:08:23 +0000 (Sat, 30 May 2009)

Log Message:
-----------
o parse \u and \U escapes in raw unicode (ur'...') string literals
o add \U support to the raw unicode escape codec (encoder and decoder) and fix
the decoder's handling of EOF in the middle of a truncated escape
fixes #1355

Modified Paths:
--------------
    trunk/jython/Lib/test/test_unicode_jy.py
    trunk/jython/NEWS
    trunk/jython/src/org/python/antlr/GrammarActions.java
    trunk/jython/src/org/python/core/codecs.java

Modified: trunk/jython/Lib/test/test_unicode_jy.py
===================================================================

--- trunk/jython/Lib/test/test_unicode_jy.py	2009-05-30 00:46:58 UTC (rev 6423)
+++ trunk/jython/Lib/test/test_unicode_jy.py	2009-05-30 02:08:23 UTC (rev 6424)
@@ -51,6 +51,29 @@
         self.assertEqual(ord(bar[2]), 92)
         self.assertEqual(ord(bar[3]), 110)
 
+        for baz in ur'Hello\u0020World !', ur'Hello\U00000020World !':
+            self.assertEqual(len(baz), 13, repr(baz))
+            self.assertEqual(repr(baz), "u'Hello World !'")
+            self.assertEqual(ord(baz[5]), 32)
+
+        quux = ur'\U00100000'
+        self.assertEqual(repr(quux), "u'\\U00100000'")
+        if sys.maxunicode == 0xffff:
+            self.assertEqual(len(quux), 2)
+            self.assertEqual(ord(quux[0]), 56256)
+            self.assertEqual(ord(quux[1]), 56320)
+        else:
+            self.assertEqual(len(quux), 1)
+            self.assertEqual(ord(quux), 1048576)
+
+    def test_raw_unicode_escape(self):
+        foo = u'\U00100000'
+        self.assertEqual(foo.encode('raw_unicode_escape'), '\\U00100000')
+        self.assertEqual(foo.encode('raw_unicode_escape').decode('raw_unicode_escape'),
+                         foo)
+        for bar in '\\u', '\\u000', '\\U00000':
+            self.assertRaises(UnicodeDecodeError, bar.decode, 'raw_unicode_escape')
+
     def test_encode_decimal(self):
         self.assertEqual(int(u'\u0039\u0032'), 92)
         self.assertEqual(int(u'\u0660'), 0)

Modified: trunk/jython/NEWS
===================================================================
--- trunk/jython/NEWS	2009-05-30 00:46:58 UTC (rev 6423)
+++ trunk/jython/NEWS	2009-05-30 02:08:23 UTC (rev 6424)
@@ -9,6 +9,7 @@
     Fix file's repr with Windows paths
     Fix urllib and urllib2 path handling on Windows
     Fix r'\Jython25' not considered an abspath on Windows
+    Fix handling of raw unicode escapes
 
 Jython 2.5.0 rc3
   Bugs fixed

Modified: trunk/jython/src/org/python/antlr/GrammarActions.java
===================================================================
--- trunk/jython/src/org/python/antlr/GrammarActions.java	2009-05-30 00:46:58 UTC (rev 6423)
+++ trunk/jython/src/org/python/antlr/GrammarActions.java	2009-05-30 02:08:23 UTC (rev 6424)
@@ -9,6 +9,7 @@
 import org.python.core.PyLong;
 import org.python.core.PyString;
 import org.python.core.PyUnicode;
+import org.python.core.codecs;
 import org.python.antlr.ast.alias;
 import org.python.antlr.ast.arguments;
 import org.python.antlr.ast.boolopType;
@@ -441,8 +442,12 @@
                                                        ustring);
             }
         } else if (raw) {
-            // Raw str without an encoding or raw unicode: simply passthru
+            // Raw str without an encoding or raw unicode
             string = string.substring(start, end);
+            if (ustring) {
+                // Raw unicode: handle unicode escapes
+                string = codecs.PyUnicode_DecodeRawUnicodeEscape(string, "strict");
+            }
         } else {
             // Plain unicode: already decoded, just handle escapes
             string = PyString.decode_UnicodeEscape(string, start, end, "strict", ustring);

Modified: trunk/jython/src/org/python/core/codecs.java
===================================================================
--- trunk/jython/src/org/python/core/codecs.java	2009-05-30 00:46:58 UTC (rev 6423)
+++ trunk/jython/src/org/python/core/codecs.java	2009-05-30 02:08:23 UTC (rev 6424)
@@ -920,45 +920,54 @@
     private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
 
     // The modified flag is used by cPickle.
-    public static String PyUnicode_EncodeRawUnicodeEscape(String str,
-            String errors,
-            boolean modifed) {
-
-        int size = str.length();
+    public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors,
+                                                          boolean modifed) {
         StringBuilder v = new StringBuilder(str.length());
 
-        for (int i = 0; i < size; i++) {
-            char ch = str.charAt(i);
-            if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
+        for (Iterator<Integer> iter = new PyUnicode(str).newSubsequenceIterator();
+             iter.hasNext();) {
+            int codePoint = iter.next();
+            if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+                // Map 32-bit characters to '\\Uxxxxxxxx'
+                v.append("\\U");
+                v.append(hexdigit[(codePoint >> 28) & 0xF]);
+                v.append(hexdigit[(codePoint >> 24) & 0xF]);
+                v.append(hexdigit[(codePoint >> 20) & 0xF]);
+                v.append(hexdigit[(codePoint >> 16) & 0xF]);
+                v.append(hexdigit[(codePoint >> 12) & 0xF]);
+                v.append(hexdigit[(codePoint >> 8) & 0xF]);
+                v.append(hexdigit[(codePoint >> 4) & 0xF]);
+                v.append(hexdigit[codePoint & 0xF]);
+            } else if (codePoint >= 256 || (modifed && (codePoint == '\\' || codePoint == '\n'))) {
+                // Map 16-bit characters to '\\uxxxx'
                 v.append("\\u");
-                v.append(hexdigit[(ch >>> 12) & 0xF]);
-                v.append(hexdigit[(ch >>> 8) & 0xF]);
-                v.append(hexdigit[(ch >>> 4) & 0xF]);
-                v.append(hexdigit[ch & 0xF]);
+                v.append(hexdigit[(codePoint >> 12) & 0xF]);
+                v.append(hexdigit[(codePoint >> 8) & 0xF]);
+                v.append(hexdigit[(codePoint >> 4) & 0xF]);
+                v.append(hexdigit[codePoint & 0xF]);
             } else {
-                v.append(ch);
+                v.append((char)codePoint);
             }
         }
 
         return v.toString();
     }
 
-    public static String PyUnicode_DecodeRawUnicodeEscape(String str,
-            String errors) {
+    public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) {
         int size = str.length();
         StringBuilder v = new StringBuilder(size);
+
         for (int i = 0; i < size;) {
             char ch = str.charAt(i);
-            /* Non-escape characters are interpreted as Unicode ordinals */
+            // Non-escape characters are interpreted as Unicode ordinals
             if (ch != '\\') {
                 v.append(ch);
                 i++;
                 continue;
             }
-            /*
-             * \\u-escapes are only interpreted iff the number of leading
-             * backslashes is odd
-             */
+
+            // \\u-escapes are only interpreted if the number of leading backslashes is
+            // odd
             int bs = i;
             while (i < size) {
                 ch = str.charAt(i);
@@ -968,34 +977,37 @@
                 v.append(ch);
                 i++;
             }
-            if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
+            if (((i - bs) & 1) == 0 || i >= size || (ch != 'u' && ch != 'U')) {
                 continue;
             }
             v.setLength(v.length() - 1);
+            int count = ch == 'u' ? 4 : 8;
             i++;
-            /* \\uXXXX with 4 hex digits */
-            int x = 0, d = 0, j = 0;
-            for (; j < 4; j++) {
-                ch = str.charAt(i + j);
-                d = Character.digit(ch, 16);
-                if (d == -1) {
+
+            // \\uXXXX with 4 hex digits, \\UXXXXXXXX with 8
+            int codePoint = 0, asDigit = -1;
+            for (int j = 0; j < count; i++, j++) {
+                if (i == size) {
+                    // EOF in a truncated escape
+                    asDigit = -1;
                     break;
                 }
-                x = ((x << 4) & ~0xF) + d;
+
+                ch = str.charAt(i);
+                asDigit = Character.digit(ch, 16);
+                if (asDigit == -1) {
+                    break;
+                }
+                codePoint = ((codePoint << 4) & ~0xF) + asDigit;
             }
-            if (d == -1) {
-                i = codecs.insertReplacementAndGetResume(v,
-                        errors,
-                        "unicodeescape",
-                        str,
-                        bs,
-                        i + j,
-                        "truncated \\uXXXX");
+            if (asDigit == -1) {
+                i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
+                                                         "truncated \\uXXXX");
             } else {
-                i += 4;
-                v.append((char) x);
+                v.appendCodePoint(codePoint);
             }
         }
+
         return v.toString();
     }
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.