Thread: [KoCo-CVS] [Commit] cjkcodecs/src _utf_8.c cjkcommon.h codeccommon.h
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:23:01
|
perky 03/06/20 10:22:59 Modified: src _utf_8.c cjkcommon.h codeccommon.h Log: Enable utf-8 codec encode and decode iso-10646-2 characters using surrogate pair. Revision Changes Path 1.6 +86 -40 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- _utf_8.c 6 Jun 2003 06:26:59 -0000 1.5 +++ _utf_8.c 20 Jun 2003 17:22:59 -0000 1.6 @@ -26,32 +26,85 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.5 2003/06/06 06:26:59 perky Exp $ + * $Id: _utf_8.c,v 1.6 2003/06/20 17:22:59 perky Exp $ */ #include "codeccommon.h" + +#define ENCODESURROGATE(outbuf, c) \ + (*outbuf)[2] = 0x80 | ((c) & 0x3f); \ + (c) = (c) >> 6; (c) |= 0x800; \ + (*outbuf)[1] = 0x80 | ((c) & 0x3f); \ + (c) = (c) >> 6; (c) |= 0xc0; \ + (*outbuf)[0] = (c); + +#if Py_UNICODE_SIZE == 2 +#define HAVE_ENCODER_INIT +ENCODER_INIT(utf_8) +{ + state->i = 0; + return 0; +} + +#define HAVE_ENCODER_RESET +ENCODER_RESET(utf_8) +{ + if (state->i > 0) { + ucs4_t c = (ucs4_t)state->i; + + RESERVE_OUTBUF(3) + ENCODESURROGATE(outbuf, c) + state->i = 0; + NEXT_OUT(3) + } + return 0; +} +#endif /* Py_UNICODE_SIZE == 2 */ + ENCODER(utf_8) { while (inleft > 0) { - Py_UNICODE c = **inbuf; + ucs4_t c = **inbuf; int size; if (c < 0x80) size = 1; else if (c < 0x800) size = 2; + else { #if Py_UNICODE_SIZE == 2 - else size = 3; -#else - else if (c < 0x10000) size = 3; - else if (c < 0x200000) size = 4; - else if (c < 0x4000000) size = 5; - else size = 6; + if (c >> 10 == 0xd800 >> 10 && state->i == 0) { + /* high surrogate */ + state->i = (unsigned short)c; + NEXT_IN(1) + continue; + } else if (c >> 10 == 0xdc00 >> 10 && state->i != 0) { + /* low surrogate */ + c = 0x10000 + (((ucs4_t)(state->i) - 0xd800) << 10) + + (c - 0xdc00); + RESERVE_OUTBUF(6) /* preserve enough space not to lose state */ + state->i = 0; + } +#endif + if (c < 0x10000) size = 3; + else if (c < 0x200000) size = 4; + else if (c < 0x4000000) size = 5; + else size = 6; + } + +#if Py_UNICODE_SIZE == 2 + if (state->i > 0) { /* unmatched surrogates */ + ucs4_t sgc = (ucs4_t)state->i; + + RESERVE_OUTBUF(3) /* high surrogates are ..*/ + ENCODESURROGATE(outbuf, sgc) + state->i = 0; + NEXT_OUT(3) + } #endif RESERVE_OUTBUF(size) switch (size) { -#if Py_UNICODE_SIZE == 4 case 6: (*outbuf)[5] = 0x80 | (c & 0x3f); c = c >> 6; @@ -67,7 +120,6 @@ c = c >> 6; c |= 0x10000; /* FALLTHROUGH */ -#endif case 3: (*outbuf)[2] = 0x80 | (c & 0x3f); c = c >> 6; @@ -122,10 +174,8 @@ | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) } else if (c < 0xf8) { -#if Py_UNICODE_SIZE == 2 - return 4; -#else unsigned char c2, c3, c4; + ucs4_t code; RESERVE_INBUF(4) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -134,17 +184,15 @@ (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c >= 0xf1 || c2 >= 0x90))) return 4; - **outbuf = ((Py_UNICODE)(c & 0x07) << 18) - | ((Py_UNICODE)(c2 ^ 0x80) << 12) - | ((Py_UNICODE)(c3 ^ 0x80) << 6) - | (Py_UNICODE)(c4 ^ 0x80); - NEXT(4, 1) -#endif + code = ((ucs4_t)(c & 0x07) << 18) + | ((ucs4_t)(c2 ^ 0x80) << 12) + | ((ucs4_t)(c3 ^ 0x80) << 6) + | (ucs4_t)(c4 ^ 0x80); + PUTUCS4(code) + NEXT_IN(4) } else if (c < 0xfc) { -#if Py_UNICODE_SIZE == 2 - return 5; -#else unsigned char c2, c3, c4, c5; + ucs4_t code; RESERVE_INBUF(5) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -153,18 +201,16 @@ (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88))) return 5; - **outbuf = ((Py_UNICODE)(c & 0x03) << 24) - | ((Py_UNICODE)(c2 ^ 0x80) << 18) - | ((Py_UNICODE)(c3 ^ 0x80) << 12) - | ((Py_UNICODE)(c4 ^ 0x80) << 6) - | (Py_UNICODE)(c5 ^ 0x80); - NEXT(5, 1) -#endif + code = ((ucs4_t)(c & 0x03) << 24) + | ((ucs4_t)(c2 ^ 0x80) << 18) + | ((ucs4_t)(c3 ^ 0x80) << 12) + | ((ucs4_t)(c4 ^ 0x80) << 6) + | (ucs4_t)(c5 ^ 0x80); + PUTUCS4(code) + NEXT_IN(5) } else if (c < 0xff) { -#if Py_UNICODE_SIZE == 2 - return 6; -#else unsigned char c2, c3, c4, c5, c6; + ucs4_t code; RESERVE_INBUF(6) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -175,14 +221,14 @@ (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 && (c >= 0xfd || c2 >= 0x84))) return 6; - **outbuf = ((Py_UNICODE)(c & 0x01) << 30) - | ((Py_UNICODE)(c2 ^ 0x80) << 24) - | ((Py_UNICODE)(c3 ^ 0x80) << 18) - | ((Py_UNICODE)(c4 ^ 0x80) << 12) - | ((Py_UNICODE)(c5 ^ 0x80) << 6) - | (Py_UNICODE)(c6 ^ 0x80); - NEXT(6, 1) -#endif + code = ((ucs4_t)(c & 0x01) << 30) + | ((ucs4_t)(c2 ^ 0x80) << 24) + | ((ucs4_t)(c3 ^ 0x80) << 18) + | ((ucs4_t)(c4 ^ 0x80) << 12) + | ((ucs4_t)(c5 ^ 0x80) << 6) + | (ucs4_t)(c6 ^ 0x80); + PUTUCS4(code) + NEXT_IN(6) } else return 1; } 1.9 +7 -1 cjkcodecs/src/cjkcommon.h Index: cjkcommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/cjkcommon.h,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- cjkcommon.h 19 May 2003 23:07:12 -0000 1.8 +++ cjkcommon.h 20 Jun 2003 17:22:59 -0000 1.9 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: cjkcommon.h,v 1.8 2003/05/19 23:07:12 perky Exp $ + * $Id: cjkcommon.h,v 1.9 2003/06/20 17:22:59 perky Exp $ */ #ifndef _CJKCOMMON_H_ @@ -54,6 +54,12 @@ const struct unim_index *encmap; const struct dbcs_index *decmap; }; + +#ifdef uint32_t +typedef uint32_t ucs4_t; +#else +typedef unsigned int ucs4_t; +#endif #endif 1.13 +14 -1 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- codeccommon.h 6 Jun 2003 06:27:41 -0000 1.12 +++ codeccommon.h 20 Jun 2003 17:22:59 -0000 1.13 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: codeccommon.h,v 1.12 2003/06/06 06:27:41 perky Exp $ + * $Id: codeccommon.h,v 1.13 2003/06/20 17:22:59 perky Exp $ */ #include "Python.h" @@ -106,6 +106,19 @@ (*outbuf)[1] = (unsigned char)(c2); \ (*outbuf)[2] = (unsigned char)(c3); \ (*outbuf)[3] = (unsigned char)(c4); + +#if Py_UNICODE_SIZE == 2 +# define PUTUCS4(c) \ + RESERVE_OUTBUF(2) \ + (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ + (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ + NEXT_OUT(2) +#else +# define PUTUCS4(c) \ + RESERVE_OUTBUF(1) \ + **outbuf = (Py_UNICODE)(c); \ + NEXT_OUT(1) +#endif #define _TRYMAP_ENC(m, assi, val) \ if ((m)->map != NULL && (val) >= (m)->bottom && \ |