[Jython-checkins] CVS: jython/org/python/core codecs.java,2.4,2.5

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/jython/jython/org/python/core
In directory usw-pr-cvs1:/tmp/cvs-serv839/core

Modified Files:
	codecs.java 
Log Message:
Moved the UTF-8 codec from modules to core. This codec is also used by
cPickle, and life when freezing an application is so much simpler if the
codec is always available.


Index: codecs.java
===================================================================
RCS file: /cvsroot/jython/jython/org/python/core/codecs.java,v
retrieving revision 2.4
retrieving revision 2.5
diff -C2 -r2.4 -r2.5
*** codecs.java	2001/01/21 14:02:35	2.4
--- codecs.java	2001/01/21 16:24:31	2.5
***************
*** 160,163 ****
--- 160,348 ----
  
  
+     /* --- UTF-8 Codec -------------------------------------------------------- */
+     /*
+      * Maps the first byte of a UTF-8 sequence to the total length of that
+      * sequence in bytes; zero marks an illegal prefix.  See RFC 2279 for
+      * details.
+      *
+      *   0x00-0x7F  1        0x80-0xBF  0 (continuation bytes)
+      *   0xC0-0xDF  2        0xE0-0xEF  3
+      *   0xF0-0xF7  4        0xF8-0xFB  5
+      *   0xFC-0xFD  6        0xFE-0xFF  0
+      *
+      * The table is only ever read, so the reference is final.
+      */
+     private static final byte[] utf8_code_length = {
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
+     };
+ 
+ 
+     /**
+      * Decodes UTF-8 data into a string of UTF-16 code units.
+      *
+      * The input carries one encoded byte per <code>char</code>; any
+      * <code>char</code> above 0xFF is reported as a decoding error.
+      * Sequences of up to four bytes are decoded; code points above
+      * U+FFFF are appended as a surrogate pair.  Overlong encodings,
+      * three-byte encodings of U+D800..U+DFFF and values above U+10FFFF
+      * are rejected, as are the five and six byte forms.
+      *
+      * Every problem is reported through <code>codecs.decoding_error</code>,
+      * which receives the output buffer and the <code>errors</code> value.
+      * If that call returns, decoding resumes one input byte after the
+      * position where the problem was detected.
+      *
+      * @param str    the UTF-8 bytes, one per <code>char</code>
+      * @param errors error handling scheme, passed on unchanged to
+      *               <code>codecs.decoding_error</code>
+      * @return the decoded string
+      */
+     public static String PyUnicode_DecodeUTF8(String str, String errors) {
+         int size = str.length();
+         StringBuffer unicode = new StringBuffer(size);
+ 
+         /* Unpack UTF-8 encoded data */
+         for (int i = 0; i < size; ) {
+             int ch = str.charAt(i);
+             if (ch > 0xFF) {
+                 /* not a byte value, so it cannot index the length table */
+                 codecs.decoding_error("utf-8", unicode, errors, 
+                                       "ordinal not in range(255)");
+                 i++;
+                 continue;
+             }
+ 
+             if (ch < 0x80) {
+                 /* plain ASCII: copied through unchanged */
+                 unicode.append((char) ch);
+                 i++;
+                 continue;
+             }
+ 
+             /* total length of the sequence announced by the lead byte */
+             int n = utf8_code_length[ch];
+ 
+             if (i + n > size) {
+                 codecs.decoding_error("utf-8", unicode, errors, 
+                                       "unexpected end of data");
+                 i++;
+                 continue;
+             }
+ 
+             switch (n) {
+             case 0:
+                 /* continuation byte or 0xFE/0xFF in lead position */
+                 codecs.decoding_error("utf-8", unicode, errors, 
+                                       "unexpected code byte");
+                 i++;
+                 continue;
+             case 1:
+                 /* length 1 means ch < 0x80, which was handled above */
+                 codecs.decoding_error("utf-8", unicode, errors, 
+                                       "internal error");
+                 i++;
+                 continue;
+             case 2:
+                 char ch1 = str.charAt(i+1);
+                 if ((ch1 & 0xc0) != 0x80) {
+                     codecs.decoding_error("utf-8", unicode, errors, 
+                                           "invalid data");
+                     i++;
+                     continue;
+                 }
+                 ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
+                 if (ch < 0x80) {
+                     /* overlong: the value fits in a single byte */
+                     codecs.decoding_error("utf-8", unicode, errors,
+                                           "illegal encoding");
+                     i++;
+                     continue;
+                 } else
+                     unicode.append((char) ch);
+                 break;
+ 
+             case 3:
+                 ch1 = str.charAt(i+1);
+                 char ch2 = str.charAt(i+2); 
+                 if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
+                     codecs.decoding_error("utf-8", unicode, errors, 
+                                           "invalid data");
+                     i++;
+                     continue;
+                 }
+                 ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
+                 /* reject overlong forms and encoded surrogates */
+                 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+                     codecs.decoding_error("utf-8", unicode, errors, 
+                                           "illegal encoding");
+                     i++;
+                     continue;
+                 } else
+                    unicode.append((char) ch);
+                 break;
+ 
+             case 4:
+                 ch1 = str.charAt(i+1);
+                 ch2 = str.charAt(i+2); 
+                 char ch3 = str.charAt(i+3); 
+                 if ((ch1 & 0xc0) != 0x80 ||
+                     (ch2 & 0xc0) != 0x80 ||
+                     (ch3 & 0xc0) != 0x80) {
+                     codecs.decoding_error("utf-8", unicode, errors, 
+                                           "invalid data");
+                     i++;
+                     continue;
+                 }
+                 ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
+                      ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
+                 /* validate and convert to UTF-16 */
+                 if ((ch < 0x10000) ||   /* minimum value allowed for 4
+                                            byte encoding */
+                     (ch > 0x10ffff)) {  /* maximum value allowed for
+                                            UTF-16 */
+                     codecs.decoding_error("utf-8", unicode, errors, 
+                                           "illegal encoding");
+                     i++;
+                     continue;
+                 }
+                 /*  compute and append the two surrogates: */
+ 
+                 /*  translate from 10000..10FFFF to 0..FFFFF */
+                 ch -= 0x10000;
+ 
+                 /*  high surrogate = top 10 bits added to D800 */
+                 unicode.append((char) (0xD800 + (ch >> 10)));
+ 
+                 /*  low surrogate = bottom 10 bits added to DC00 */
+                 unicode.append((char) (0xDC00 + (ch & 0x3FF)));
+                 break;
+ 
+             default:
+                 /* Other sizes are only needed for UCS-4 */
+                 codecs.decoding_error("utf-8", unicode, errors, 
+                                       "unsupported Unicode code range");
+                 /* Resume one byte further on, like every other error
+                    path; falling through to "i += n" below would skip
+                    n + 1 bytes instead. */
+                 i++;
+                 continue;
+             }
+             i += n;
+         }
+ 
+         return unicode.toString();
+     }
+ 
+ 
+     /**
+      * Encodes a string of UTF-16 code units as UTF-8.
+      *
+      * The result carries one encoded byte per <code>char</code>, each in
+      * the range 0..0xFF.  A high surrogate (U+D800..U+DBFF) immediately
+      * followed by a low surrogate (U+DC00..U+DFFF) is combined into one
+      * code point and written as a four byte sequence.  Any other
+      * surrogate is unpaired and is written as an ordinary three byte
+      * sequence, so every sequence in the output starts with a lead byte.
+      *
+      * @param str    the text to encode
+      * @param errors error handling scheme; not consulted by this method
+      * @return the UTF-8 bytes, one per <code>char</code>
+      */
+     public static String PyUnicode_EncodeUTF8(String str, String errors) {
+         int size = str.length();
+         StringBuffer v = new StringBuffer(size * 3);
+ 
+         for (int i = 0; i < size; ) {
+             int ch = str.charAt(i++);
+ 
+             if (ch < 0x80) {
+                 v.append((char) ch);
+                 continue;
+             }
+ 
+             if (ch < 0x0800) {
+                 v.append((char) (0xc0 | (ch >> 6)));
+                 v.append((char) (0x80 | (ch & 0x3f)));
+                 continue;
+             }
+ 
+             /* Only a high surrogate may open a pair; a low surrogate in
+                leading position is unpaired by definition. */
+             if (0xD800 <= ch && ch <= 0xDBFF && i < size) {
+                 int ch2 = str.charAt(i);
+                 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                     /* combine the two values */
+                     ch = (((ch - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000;
+                     i++;
+ 
+                     v.append((char) (0xf0 | (ch >> 18)));
+                     v.append((char) (0x80 | ((ch >> 12) & 0x3f)));
+                     v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
+                     v.append((char) (0x80 | (ch & 0x3f)));
+                     continue;
+                 }
+             }
+ 
+             /* Everything else in U+0800..U+FFFF, including unpaired
+                surrogates, takes three bytes. */
+             v.append((char) (0xe0 | (ch >> 12)));
+             v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
+             v.append((char) (0x80 | (ch & 0x3f)));
+         }
+         return v.toString();
+     }
+   
+ 
  
      /* --- 7-bit ASCII Codec -------------------------------------------- */