[KoCo-CVS] [Commit] iconvcodec _iconv_codec.c
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-06-11 12:01:59
|
perky 03/06/11 05:01:58
Modified: . _iconv_codec.c
Log:
Utilize UCS-2 Surrogate-Pair to support ISO-10646 extended planes
Revision Changes Path
1.8 +67 -37 iconvcodec/_iconv_codec.c
Index: _iconv_codec.c
===================================================================
RCS file: /cvsroot/koco/iconvcodec/_iconv_codec.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- _iconv_codec.c 11 Jun 2003 11:06:50 -0000 1.7
+++ _iconv_codec.c 11 Jun 2003 12:01:57 -0000 1.8
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $Id: _iconv_codec.c,v 1.7 2003/06/11 11:06:50 perky Exp $
+ * $Id: _iconv_codec.c,v 1.8 2003/06/11 12:01:57 perky Exp $
  */
 #include "Python.h"
@@ -57,6 +57,14 @@
 typedef const char **iconv_arg2_t;
 #endif
+#ifndef ucs4_t
+# ifdef uint32_t
+typedef uint32_t ucs4_t;
+# else
+typedef unsigned long ucs4_t;
+# endif
+#endif
+
 #define ERROR_STRICT (PyObject *)(1)
 #define ERROR_IGNORE (PyObject *)(2)
 #define ERROR_REPLACE (PyObject *)(3)
@@ -570,22 +578,30 @@
         return -1;
     buf->rinbuf_top = buf->rinbuf = rinbuf;
     for (; buf->inbuf < buf->inbuf_end; buf->inbuf++) {
-        Py_UNICODE code = *buf->inbuf;
+        ucs4_t code = *buf->inbuf;
         int size;
         if (code < 0x80) size = 1;
         else if (code < 0x800) size = 2;
+        else {
 #if Py_UNICODE_SIZE == 2
-        else size = 3; /* XXX put surrogate characters for EMP! */
-#else
-        else if (code < 0x10000) size = 3;
-        else if (code < 0x200000) size = 4;
-        else if (code < 0x4000000) size = 5;
-        else size = 6;
-#endif
+            /* Unfold a Surrogate-Pair */
+            if (code >= 0xd800 && code < 0xdc00 &&
+                    buf->inbuf+1 < buf->inbuf_end &&
+                    buf->inbuf[1] >= 0xdc00 &&
+                    buf->inbuf[1] < 0xe000) {
+                code = 0x10000 + ((code - 0xd800) << 10) +
+                        (buf->inbuf[1] - 0xdc00);
+                buf->inbuf++;
+            }
+#endif
+            if (code < 0x10000) size = 3;
+            else if (code < 0x200000) size = 4;
+            else if (code < 0x4000000) size = 5;
+            else size = 6;
+        }
         switch (size) {
-#if Py_UNICODE_SIZE == 4
         case 6:
             rinbuf[5] = 0x80 | (code & 0x3f);
             code = code >> 6;
@@ -601,7 +617,6 @@
             code = code >> 6;
             code |= 0x10000;
             /* FALLTHROUGH */
-#endif
         case 3:
             rinbuf[2] = 0x80 | (code & 0x3f);
             code = code >> 6;
@@ -1000,10 +1015,11 @@
     if (nch > 0)
         RESERVE_DECODEBUFFER(buf, nch)
     for (ubuf = ubuf_top; ubuf < ubuf_end;) {
-        int uleft = (int)(ubuf_end - ubuf);
+        int uleft = (int)(ubuf_end - ubuf);
+        ucs4_t code;
         if (*ubuf < 0x80) {
-            *buf->outbuf++ = (unsigned char)*ubuf++;
+            code = (unsigned char)*ubuf++;
         } else if (*ubuf < 0xc2) {
 ilseq:      PyErr_SetString(PyExc_RuntimeError,
                     "iconv returned illegal utf-8 sequence");
@@ -1011,32 +1027,28 @@
         } else if (*ubuf < 0xe0) {
             if (uleft < 2 || !((ubuf[1] ^ 0x80) < 0x40))
                 goto ilseq;
-            *buf->outbuf++ = ((Py_UNICODE)(ubuf[0] & 0x1f) << 6)
-                    | (Py_UNICODE)(ubuf[1] ^ 0x80);
+            code = ((Py_UNICODE)(ubuf[0] & 0x1f) << 6)
+                    | (Py_UNICODE)(ubuf[1] ^ 0x80);
             ubuf += 2;
         } else if (*ubuf < 0xf0) {
             if (uleft < 3 || !((ubuf[1] ^ 0x80) < 0x40 &&
                     (ubuf[2] ^ 0x80) < 0x40 &&
                     (ubuf[0] >= 0xe1 || ubuf[1] >= 0xa0)))
                 goto ilseq;
-            *buf->outbuf++ = ((Py_UNICODE)(ubuf[0] & 0x0f) << 12)
-                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 6)
-                    | (Py_UNICODE)(ubuf[2] ^ 0x80);
+            code = ((Py_UNICODE)(ubuf[0] & 0x0f) << 12)
+                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 6)
+                    | (Py_UNICODE)(ubuf[2] ^ 0x80);
             ubuf += 3;
         }
-#if Py_UNICODE_SIZE == 2
-        else /* XXX: put surrogate characters here! */
-            goto ilseq;
-#else
         else if (*ubuf < 0xf8) {
             if (uleft < 4 || !((ubuf[1] ^ 0x80) < 0x40 &&
                     (ubuf[2] ^ 0x80) < 0x40 && (ubuf[3] ^ 0x80) < 0x40 &&
                     (ubuf[0] >= 0xf1 || ubuf[1] >= 0x90)))
                 goto ilseq;
-            *buf->outbuf++ = ((Py_UNICODE)(ubuf[0] & 0x07) << 18)
-                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 12)
-                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 6)
-                    | (Py_UNICODE)(ubuf[3] ^ 0x80);
+            code = ((Py_UNICODE)(ubuf[0] & 0x07) << 18)
+                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 12)
+                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 6)
+                    | (Py_UNICODE)(ubuf[3] ^ 0x80);
             ubuf += 4;
         } else if (*ubuf < 0xfc) {
             if (uleft < 5 || !((ubuf[1] ^ 0x80) < 0x40 &&
@@ -1044,11 +1056,11 @@
                     (ubuf[4] ^ 0x80) < 0x40 &&
                     (ubuf[0] >= 0xf9 || ubuf[1] >= 0x88)))
                 goto ilseq;
-            *buf->outbuf++ = ((Py_UNICODE)(ubuf[0] & 0x03) << 24)
-                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 18)
-                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 12)
-                    | ((Py_UNICODE)(ubuf[3] ^ 0x80) << 6)
-                    | (Py_UNICODE)(ubuf[4] ^ 0x80);
+            code = ((Py_UNICODE)(ubuf[0] & 0x03) << 24)
+                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 18)
+                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 12)
+                    | ((Py_UNICODE)(ubuf[3] ^ 0x80) << 6)
+                    | (Py_UNICODE)(ubuf[4] ^ 0x80);
             ubuf += 5;
         } else if (*ubuf < 0xff) {
             if (uleft < 6 || !((ubuf[1] ^ 0x80) < 0x40 &&
@@ -1056,16 +1068,34 @@
                     (ubuf[4] ^ 0x80) < 0x40 && (ubuf[5] ^ 0x80) < 0x40 &&
                     (ubuf[0] >= 0xfd || ubuf[1] >= 0x84)))
                 goto ilseq;
-            *buf->outbuf++ = ((Py_UNICODE)(ubuf[0] & 0x01) << 30)
-                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 24)
-                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 18)
-                    | ((Py_UNICODE)(ubuf[3] ^ 0x80) << 12)
-                    | ((Py_UNICODE)(ubuf[4] ^ 0x80) << 6)
-                    | (Py_UNICODE)(ubuf[5] ^ 0x80);
+            code = ((Py_UNICODE)(ubuf[0] & 0x01) << 30)
+                    | ((Py_UNICODE)(ubuf[1] ^ 0x80) << 24)
+                    | ((Py_UNICODE)(ubuf[2] ^ 0x80) << 18)
+                    | ((Py_UNICODE)(ubuf[3] ^ 0x80) << 12)
+                    | ((Py_UNICODE)(ubuf[4] ^ 0x80) << 6)
+                    | (Py_UNICODE)(ubuf[5] ^ 0x80);
             ubuf += 6;
         } else
             goto ilseq;
+
+#if Py_UNICODE_SIZE == 2
+        if (code >= 0x10000) {
+            if (code >= 0x110000)
+                goto ilseq;
+
+            if (buf->outbuf_end <= buf->outbuf + 1) {
+                RESERVE_DECODEBUFFER(buf, -1)
+            }
+            *buf->outbuf++ = 0xd800 + ((code - 0x10000) >> 10);
+            *buf->outbuf++ = 0xdc00 + ((code - 0x10000) & 0x3ff);
+        } else
 #endif
+        {
+            if (buf->outbuf_end <= buf->outbuf) {
+                RESERVE_DECODEBUFFER(buf, -1)
+            }
+            *buf->outbuf++ = (Py_UNICODE)code;
+        }
     }
     PyMem_Del(ubuf_top);
|