Thread: [KoCo-CVS] [Commit] cjkcodecs/src _utf_8.c
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-05-29 09:12:30
|
perky 03/05/29 02:12:30 Added: src _utf_8.c Log: Add utf-8 codec. (we need this to have a 'sane' UTF-8 StreamReader.) Revision Changes Path 1.1 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== /* * _utf_8.c: the UTF-8 codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: _utf_8.c,v 1.1 2003/05/29 09:12:30 perky Exp $ */ #include "codeccommon.h" ENCODER(utf_8) { while (inleft > 0) { Py_UNICODE c = **inbuf; int size; if (c < 0x80) size = 1; else if (c < 0x800) size = 2; #if Py_UNICODE_SIZE == 2 else size = 3; #else else if (c < 0x10000) size = 3; else if (c < 0x200000) size = 4; else if (c < 0x4000000) size = 5; else size = 6; #endif RESERVE_OUTBUF(size) switch (size) { #if Py_UNICODE_SIZE == 4 case 6: (*outbuf)[5] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x4000000; /* FALLTHROUGH */ case 5: (*outbuf)[4] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x200000; /* FALLTHROUGH */ case 4: (*outbuf)[3] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x10000; /* FALLTHROUGH */ #endif case 3: (*outbuf)[2] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x800; /* FALLTHROUGH */ case 2: (*outbuf)[1] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0xc0; /* FALLTHROUGH */ case 1: (*outbuf)[0] = c; } NEXT(1, size) } return 0; } DECODER(utf_8) { while (inleft > 0) { unsigned char c = **inbuf; RESERVE_OUTBUF(1) if (c < 0x80) { (*outbuf)[0] = (unsigned char)c; NEXT(1, 1) } else if (c < 0xc2 || c == 0xff) { return 1; } else if (c < 0xe0) { unsigned char c2; RESERVE_INBUF(2) c2 = (*inbuf)[1]; if (!((c2 ^ 0x80) < 0x40)) return 2; **outbuf = ((Py_UNICODE)(c & 0x1f) << 6) | (Py_UNICODE)(c2 ^ 0x80); NEXT(2, 1) } else if (c < 0xf0) { unsigned char c2, c3; RESERVE_INBUF(3) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c >= 0xe1 || c2 >= 0xa0))) return 3; **outbuf = ((Py_UNICODE)(c & 0x0f) << 12) | ((Py_UNICODE)(c2 ^ 0x80) << 6) | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) } #if Py_UNICODE_SIZE == 2 else return 3; #else } else if (c < 0xf8) { unsigned char c2, c3, c4; RESERVER_INBUF(4) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c >= 0xf1 || c2 >= 0x90))) return 4; **outbuf = ((Py_UNICODE)(c & 0x07) << 18) | ((Py_UNICODE)(c2 ^ 0x80) << 12) | ((Py_UNICODE)(c3 ^ 0x80) << 6) | (Py_UNICODE)(c4 ^ 0x80); NEXT(4, 1) } else if (c < 0xfc) { unsigned char c2, c3, c4, c5; RESERVER_INBUF(5) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88))) return 5; **outbuf = ((Py_UNICODE)(c & 0x03) << 24) | ((Py_UNICODE)(c2 ^ 0x80) << 18) | ((Py_UNICODE)(c3 ^ 0x80) << 12) | ((Py_UNICODE)(c4 ^ 0x80) << 6) | (Py_UNICODE)(c5 ^ 0x80); NEXT(5, 1) } else { /* 0xff is excluded above */ unsigned char c2, c3, c4, c5, c6; RESERVER_INBUF(6) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; c6 = (*inbuf)[5]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 && (c >= 0xfd || c2 >= 0x84))) return 6; **outbuf = ((Py_UNICODE)(c & 0x01) << 30) | ((Py_UNICODE)(c2 ^ 0x80) << 24) | ((Py_UNICODE)(c3 ^ 0x80) << 18) | ((Py_UNICODE)(c4 ^ 0x80) << 12) | ((Py_UNICODE)(c5 ^ 0x80) << 6) | (Py_UNICODE)(c6 ^ 0x80); NEXT(6, 1) } #endif } return 0; } BEGIN_CODEC_REGISTRY(utf_8) /* no maps */ END_CODEC_REGISTRY(utf_8) /* * ex: ts=8 sts=4 et */ |
From: Hye-Shik C. <pe...@us...> - 2003-05-31 03:51:34
|
perky 03/05/30 20:41:14 Modified: src _utf_8.c Log: Detect utf-8 lengths > 3 correctly even on --with-unicode=ucs2 Revision Changes Path 1.2 +15 -8 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _utf_8.c 29 May 2003 09:12:30 -0000 1.1 +++ _utf_8.c 31 May 2003 03:41:14 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.1 2003/05/29 09:12:30 perky Exp $ + * $Id: _utf_8.c,v 1.2 2003/05/31 03:41:14 perky Exp $ */ #include "codeccommon.h" @@ -98,7 +98,7 @@ if (c < 0x80) { (*outbuf)[0] = (unsigned char)c; NEXT(1, 1) - } else if (c < 0xc2 || c == 0xff) { + } else if (c < 0xc2) { return 1; } else if (c < 0xe0) { unsigned char c2; @@ -121,12 +121,10 @@ | ((Py_UNICODE)(c2 ^ 0x80) << 6) | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) - } + } else if (c < 0xf8) { #if Py_UNICODE_SIZE == 2 - else - return 3; + return 4; #else - } else if (c < 0xf8) { unsigned char c2, c3, c4; RESERVER_INBUF(4) @@ -141,7 +139,11 @@ | ((Py_UNICODE)(c3 ^ 0x80) << 6) | (Py_UNICODE)(c4 ^ 0x80); NEXT(4, 1) +#endif } else if (c < 0xfc) { +#if Py_UNICODE_SIZE == 2 + return 5; +#else unsigned char c2, c3, c4, c5; RESERVER_INBUF(5) @@ -157,7 +159,11 @@ | ((Py_UNICODE)(c4 ^ 0x80) << 6) | (Py_UNICODE)(c5 ^ 0x80); NEXT(5, 1) - } else { /* 0xff is excluded above */ +#endif + } else if (c < 0xff) { +#if Py_UNICODE_SIZE == 2 + return 6; +#else unsigned char c2, c3, c4, c5, c6; RESERVER_INBUF(6) @@ -176,8 +182,9 @@ | ((Py_UNICODE)(c5 ^ 0x80) << 6) | (Py_UNICODE)(c6 ^ 0x80); NEXT(6, 1) - } #endif + } else + return 1; } return 0; |
From: Hye-Shik C. <pe...@us...> - 2003-06-06 06:27:00
|
perky 03/06/05 23:26:59 Modified: src _utf_8.c Log: Fix typos Revision Changes Path 1.5 +4 -4 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- _utf_8.c 6 Jun 2003 05:52:03 -0000 1.4 +++ _utf_8.c 6 Jun 2003 06:26:59 -0000 1.5 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.4 2003/06/06 05:52:03 perky Exp $ + * $Id: _utf_8.c,v 1.5 2003/06/06 06:26:59 perky Exp $ */ #include "codeccommon.h" @@ -127,7 +127,7 @@ #else unsigned char c2, c3, c4; - RESERVER_INBUF(4) + RESERVE_INBUF(4) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; if (!((c2 ^ 0x80) < 0x40 && @@ -146,7 +146,7 @@ #else unsigned char c2, c3, c4, c5; - RESERVER_INBUF(5) + RESERVE_INBUF(5) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; if (!((c2 ^ 0x80) < 0x40 && @@ -166,7 +166,7 @@ #else unsigned char c2, c3, c4, c5, c6; - RESERVER_INBUF(6) + RESERVE_INBUF(6) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; c6 = (*inbuf)[5]; |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:56:10
|
perky 03/06/20 10:56:08 Modified: src _utf_8.c Log: utf-8 is described on rfc2279 Revision Changes Path 1.7 +2 -2 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- _utf_8.c 20 Jun 2003 17:22:59 -0000 1.6 +++ _utf_8.c 20 Jun 2003 17:56:08 -0000 1.7 @@ -1,5 +1,5 @@ /* - * _utf_8.c: the UTF-8 codec + * _utf_8.c: the UTF-8 codec (RFC2279) * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.6 2003/06/20 17:22:59 perky Exp $ + * $Id: _utf_8.c,v 1.7 2003/06/20 17:56:08 perky Exp $ */ #include "codeccommon.h" |