koco-cvs Mailing List for Python Korean Codecs (Page 4)
Brought to you by:
perky
You can subscribe to this list here.
2002 | Jan | Feb | Mar | Apr (88) | May (5) | Jun | Jul (27) | Aug | Sep | Oct (5) | Nov | Dec |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2003 | Jan (77) | Feb (3) | Mar | Apr (22) | May (123) | Jun (80) | Jul (83) | Aug | Sep | Oct | Nov | Dec |
From: Hye-Shik C. <pe...@us...> - 2003-07-06 10:10:33
|
perky 03/07/06 03:10:31 Modified: src codeccommon.h _iso_2022_jp.c _iso_2022_jp_1.c _iso_2022_kr.c Log: Rename PAVE* -> WRITE* Revision Changes Path 1.16 +138 -138 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.15 retrieving revision 1.16 diff -u -r1.15 -r1.16 --- codeccommon.h 5 Jul 2003 19:49:02 -0000 1.15 +++ codeccommon.h 6 Jul 2003 10:10:31 -0000 1.16 @@ -1,138 +1,138 @@ -/* - * codeccommon.h: Common Codec Routines - * - * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $Id: codeccommon.h,v 1.15 2003/07/05 19:49:02 perky Exp $ - */ - -#include "Python.h" -#include "multibytecodec.h" -#include "multibytecodec_compat.h" -#include "cjkcommon.h" - -#define ENCMAP(encoding) \ - const static encode_map *encoding##encmap; -#define DECMAP(encoding) \ - const static decode_map *encoding##decmap; - -#define ENCODER_INIT(encoding) \ - static int encoding##_encode_init( \ - MultibyteCodec_State *state) -#define ENCODER(encoding) \ - static int encoding##_encode( \ - MultibyteCodec_State *state, \ - const Py_UNICODE **inbuf, size_t inleft, \ - unsigned char **outbuf, size_t outleft, int flags) -#define ENCODER_RESET(encoding) \ - static int encoding##_encode_reset( \ - MultibyteCodec_State *state, \ - unsigned char **outbuf, size_t outleft) - -#define DECODER_INIT(encoding) \ - static int encoding##_decode_init( \ - MultibyteCodec_State *state) -#define DECODER(encoding) \ - static int encoding##_decode( \ - MultibyteCodec_State *state, \ - const unsigned char **inbuf, size_t inleft, \ - Py_UNICODE **outbuf, size_t outleft) -#define DECODER_RESET(encoding) \ - static int encoding##_decode_reset( \ - MultibyteCodec_State *state) - -#if Py_UNICODE_SIZE == 4 -#define UCS4INVALID(code) \ - if ((code) > 0xFFFF) \ - return 1; -#else -#define UCS4INVALID(code) \ - if (0) ; -#endif - -#define NEXT_IN(i) \ - (*inbuf) += (i); \ - (inleft) -= (i); -#define NEXT_OUT(o) \ - (*outbuf) += (o); \ - (outleft) -= (o); -#define NEXT(i, o) NEXT_IN(i) NEXT_OUT(o) - -#define RESERVE_INBUF(n) \ - if (inleft < (n)) \ - return MBERR_TOOFEW; -#define RESERVE_OUTBUF(n) \ - if (outleft < (n)) \ - return MBERR_TOOSMALL; - -#define PAVE1(c1) \ - RESERVE_OUTBUF(1) \ - (*outbuf)[0] = (unsigned char)(c1); -#define PAVE2(c1, c2) \ - RESERVE_OUTBUF(2) \ - (*outbuf)[0] = (unsigned char)(c1); \ - (*outbuf)[1] = (unsigned char)(c2); -#define PAVE3(c1, c2, c3) \ - RESERVE_OUTBUF(3) \ - (*outbuf)[0] = (unsigned char)(c1); \ - (*outbuf)[1] = (unsigned char)(c2); \ - 
(*outbuf)[2] = (unsigned char)(c3); -#define PAVE4(c1, c2, c3, c4) \ - RESERVE_OUTBUF(4) \ - (*outbuf)[0] = (unsigned char)(c1); \ - (*outbuf)[1] = (unsigned char)(c2); \ - (*outbuf)[2] = (unsigned char)(c3); \ - (*outbuf)[3] = (unsigned char)(c4); - -#if Py_UNICODE_SIZE == 2 -# define PUTUCS4(c) \ - RESERVE_OUTBUF(2) \ - (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ - (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ - NEXT_OUT(2) -#else -# define PUTUCS4(c) \ - RESERVE_OUTBUF(1) \ - **outbuf = (Py_UNICODE)(c); \ - NEXT_OUT(1) -#endif - -#define _TRYMAP_ENC(m, assi, val) \ - if ((m)->map != NULL && (val) >= (m)->bottom && \ - (val)<= (m)->top && ((assi) = (m)->map[(val) - \ - (m)->bottom]) != NOCHAR) -#define TRYMAP_ENC(charset, assi, uni) \ - _TRYMAP_ENC(&charset##encmap[uni >> 8], assi, uni & 0xff) -#define _TRYMAP_DEC(m, assi, val) \ - if ((m)->map != NULL && (val) >= (m)->bottom && \ - (val)<= (m)->top && ((assi) = (m)->map[(val) - \ - (m)->bottom]) != UNIINV) -#define TRYMAP_DEC(charset, assi, c1, c2) \ - _TRYMAP_DEC(&charset##decmap[c1], assi, c2) - -/* - * ex: ts=8 sts=4 et - */ +/* + * codeccommon.h: Common Codec Routines + * + * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $Id: codeccommon.h,v 1.16 2003/07/06 10:10:31 perky Exp $ + */ + +#include "Python.h" +#include "multibytecodec.h" +#include "multibytecodec_compat.h" +#include "cjkcommon.h" + +#define ENCMAP(encoding) \ + const static encode_map *encoding##encmap; +#define DECMAP(encoding) \ + const static decode_map *encoding##decmap; + +#define ENCODER_INIT(encoding) \ + static int encoding##_encode_init( \ + MultibyteCodec_State *state) +#define ENCODER(encoding) \ + static int encoding##_encode( \ + MultibyteCodec_State *state, \ + const Py_UNICODE **inbuf, size_t inleft, \ + unsigned char **outbuf, size_t outleft, int flags) +#define ENCODER_RESET(encoding) \ + static int encoding##_encode_reset( \ + MultibyteCodec_State *state, \ + unsigned char **outbuf, size_t outleft) + +#define DECODER_INIT(encoding) \ + static int encoding##_decode_init( \ + MultibyteCodec_State *state) +#define DECODER(encoding) \ + static int encoding##_decode( \ + MultibyteCodec_State *state, \ + const unsigned char **inbuf, size_t inleft, \ + Py_UNICODE **outbuf, size_t outleft) +#define DECODER_RESET(encoding) \ + static int encoding##_decode_reset( \ + MultibyteCodec_State *state) + +#if Py_UNICODE_SIZE == 4 +#define UCS4INVALID(code) \ + if ((code) > 0xFFFF) \ + return 1; +#else +#define UCS4INVALID(code) \ + if (0) ; +#endif + +#define NEXT_IN(i) \ + (*inbuf) += (i); \ + (inleft) -= (i); +#define NEXT_OUT(o) \ + (*outbuf) += (o); \ + (outleft) -= (o); +#define NEXT(i, o) 
NEXT_IN(i) NEXT_OUT(o) + +#define RESERVE_INBUF(n) \ + if (inleft < (n)) \ + return MBERR_TOOFEW; +#define RESERVE_OUTBUF(n) \ + if (outleft < (n)) \ + return MBERR_TOOSMALL; + +#define WRITE1(c1) \ + RESERVE_OUTBUF(1) \ + (*outbuf)[0] = (unsigned char)(c1); +#define WRITE2(c1, c2) \ + RESERVE_OUTBUF(2) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); +#define WRITE3(c1, c2, c3) \ + RESERVE_OUTBUF(3) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); \ + (*outbuf)[2] = (unsigned char)(c3); +#define WRITE4(c1, c2, c3, c4) \ + RESERVE_OUTBUF(4) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); \ + (*outbuf)[2] = (unsigned char)(c3); \ + (*outbuf)[3] = (unsigned char)(c4); + +#if Py_UNICODE_SIZE == 2 +# define PUTUCS4(c) \ + RESERVE_OUTBUF(2) \ + (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ + (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ + NEXT_OUT(2) +#else +# define PUTUCS4(c) \ + RESERVE_OUTBUF(1) \ + **outbuf = (Py_UNICODE)(c); \ + NEXT_OUT(1) +#endif + +#define _TRYMAP_ENC(m, assi, val) \ + if ((m)->map != NULL && (val) >= (m)->bottom && \ + (val)<= (m)->top && ((assi) = (m)->map[(val) - \ + (m)->bottom]) != NOCHAR) +#define TRYMAP_ENC(charset, assi, uni) \ + _TRYMAP_ENC(&charset##encmap[uni >> 8], assi, uni & 0xff) +#define _TRYMAP_DEC(m, assi, val) \ + if ((m)->map != NULL && (val) >= (m)->bottom && \ + (val)<= (m)->top && ((assi) = (m)->map[(val) - \ + (m)->bottom]) != UNIINV) +#define TRYMAP_DEC(charset, assi, c1, c2) \ + _TRYMAP_DEC(&charset##decmap[c1], assi, c2) + +/* + * ex: ts=8 sts=4 et + */ 1.5 +271 -271 cjkcodecs/src/_iso_2022_jp.c Index: _iso_2022_jp.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_jp.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- _iso_2022_jp.c 5 Jun 2003 09:56:21 -0000 1.4 +++ _iso_2022_jp.c 6 Jul 2003 10:10:31 -0000 1.5 @@ -1,271 
+1,271 @@ -/* - * _iso_2022_jp.c: the ISO-2022-JP codec (RFC1468) - * - * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $Id: _iso_2022_jp.c,v 1.4 2003/06/05 09:56:21 perky Exp $ - */ - -#include "codeccommon.h" -#include "iso2022common.h" -#include "maps/alg_jisx0201.h" - -ENCMAP(jisxcommon) -DECMAP(jisx0208) - -#define HAVE_ENCODER_INIT -ENCODER_INIT(iso_2022_jp) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, CHARSET_ASCII) - return 0; -} - -#define HAVE_ENCODER_RESET -ENCODER_RESET(iso_2022_jp) -{ - if (STATE_GETG0(state) != CHARSET_ASCII) { - RESERVE_OUTBUF(3) - PAVE3(ESC, '(', 'B') - STATE_SETG0(state, CHARSET_ASCII) - NEXT_OUT(3) - } - return 0; -} - -/* ISO-2022-JP changes designations instead of shifting-out */ - -ENCODER(iso_2022_jp) -{ - while (inleft > 0) { - Py_UNICODE c = **inbuf; - DBCHAR code; - - if (c < 0x80) { - switch (STATE_GETG0(state)) { - case CHARSET_ASCII: - PAVE1(c) - NEXT(1, 1) - break; - case CHARSET_JISX0201_R: - JISX0201_R_ENCODE(c, code) - else { /* FALLTHROUGH (yay!) */ - default: - PAVE3(ESC, '(', 'B') - NEXT_OUT(3) - STATE_SETG0(state, CHARSET_ASCII) - code = c; - } - PAVE1(code) - NEXT(1, 1) - break; - } - if (c == '\n') - STATE_CLEARFLAG(state, F_SHIFTED) - } else UCS4INVALID(c) - else { - unsigned char charset; - - charset = STATE_GETG0(state); - if (charset == CHARSET_JISX0201_R) { - code = DBCINV; - JISX0201_R_ENCODE(c, code) - if (code != DBCINV) { - PAVE1(code) - NEXT(1, 1) - continue; - } - } - - TRYMAP_ENC(jisxcommon, code, c) { - if (code & 0x8000) /* MSB set: JIS X 0212 */ - return 1; - if (charset != CHARSET_JISX0208) { - PAVE3(ESC, '$', 'B') - STATE_SETG0(state, CHARSET_JISX0208) - NEXT_OUT(3) - } - PAVE2(code >> 8, code & 0xff) - NEXT(1, 2) - } else { - JISX0201_R_ENCODE(c, code) - else - return 1; - /* if (charset == CHARSET_JISX0201_R) : already checked */ - PAVE4(ESC, '(', 'J', code) - STATE_SETG0(state, CHARSET_JISX0201_R) - NEXT(1, 4) - } - } - } - - return 0; -} - -#define HAVE_DECODER_INIT -DECODER_INIT(iso_2022_jp) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, 
CHARSET_ASCII) - return 0; -} - -#define HAVE_DECODER_RESET -DECODER_RESET(iso_2022_jp) -{ - STATE_CLEARFLAG(state, F_SHIFTED) - return 0; -} - -DECODER(iso_2022_jp) -{ - while (inleft > 0) { - unsigned char c = **inbuf; - - if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { - /* ESC throughout mode: for non-iso2022 escape sequences */ - RESERVE_OUTBUF(1) - **outbuf = c; /* assume as ISO-8859-1 */ - NEXT(1, 1) - if (IS_ESCEND(c)) { - STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) - } - continue; - } - - switch (c) { - case ESC: - RESERVE_INBUF(2) - if (IS_ISO2022ESC((*inbuf)[1])) { - int eslen; - - eslen = iso2022esclen(*inbuf, inleft); - if (eslen < 0) - return eslen == MBERR_INTERNAL ? 1 : eslen; - - if (eslen == 3) { - unsigned char charset; - - if ((*inbuf)[1] == '$') { - if ((*inbuf)[2] == '@' || (*inbuf)[2] == 'B') { - charset = (*inbuf)[2] | CHARSET_DOUBLEBYTE; - STATE_SETG0(state, charset); - } else - return 3; - } else { - if ((*inbuf)[2] == 'B' || (*inbuf)[2] == 'J') - charset = (*inbuf)[2]; - else - return 3; - - if ((*inbuf)[1] == '(') { - STATE_SETG0(state, charset) - } else if ((*inbuf)[1] == ')') { - STATE_SETG1(state, charset) - } else - return 3; - } - } else - return eslen; - NEXT_IN(eslen) - } else { - STATE_SETFLAG(state, F_ESCTHROUGHOUT) - **outbuf = ESC; - NEXT(1, 1) - } - break; - case SI: - STATE_CLEARFLAG(state, F_SHIFTED) - NEXT_IN(1) - break; - case SO: - STATE_SETFLAG(state, F_SHIFTED) - NEXT_IN(1) - break; - case '\n': - STATE_CLEARFLAG(state, F_SHIFTED) - /* FALLTHROUGH */ - case SP: /* FALLTHROUGH */ - case DEL: - RESERVE_OUTBUF(1) - **outbuf = c; - NEXT(1, 1) - break; - default: - if ((c & 0x7f) < 0x20) { /* C0 and C1 */ - RESERVE_OUTBUF(1) - **outbuf = c & 0x7f; - NEXT(1, 1) - } else { - unsigned char charset; - - if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ - charset = STATE_GETG0(state); - else /* G1 */ - charset = STATE_GETG1(state); - - if (charset & CHARSET_DOUBLEBYTE) { - /* all double byte character sets are in JIS X 0208 
here. - * this means that we don't distinguish :1978 from :1983. */ - RESERVE_INBUF(2) - RESERVE_OUTBUF(1) - TRYMAP_DEC(jisx0208, **outbuf, c & 0x7f, - (*inbuf)[1] & 0x7f) { - NEXT(2, 1) - } else - return 2; - } else if (charset == CHARSET_ASCII) { - RESERVE_OUTBUF(1) - **outbuf = c & 0x7f; - NEXT(1, 1) - } else if (charset == CHARSET_JISX0201_R) { - RESERVE_OUTBUF(1) - JISX0201_R_DECODE(c & 0x7f, **outbuf) - else - return 1; - NEXT(1, 1) - } else - return MBERR_INTERNAL; - } - } - } - - return 0; -} - -#include "codecentry.h" -BEGIN_CODEC_REGISTRY(iso_2022_jp) - MAPOPEN(ja_JP) - IMPORTMAP_DEC(jisx0208) - IMPORTMAP_ENC(jisxcommon) - MAPCLOSE() -END_CODEC_REGISTRY(iso_2022_jp) - -/* - * ex: ts=8 sts=4 et - */ +/* + * _iso_2022_jp.c: the ISO-2022-JP codec (RFC1468) + * + * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $Id: _iso_2022_jp.c,v 1.5 2003/07/06 10:10:31 perky Exp $ + */ + +#include "codeccommon.h" +#include "iso2022common.h" +#include "maps/alg_jisx0201.h" + +ENCMAP(jisxcommon) +DECMAP(jisx0208) + +#define HAVE_ENCODER_INIT +ENCODER_INIT(iso_2022_jp) +{ + state->i = 0; + STATE_SETG0(state, CHARSET_ASCII) + STATE_SETG1(state, CHARSET_ASCII) + return 0; +} + +#define HAVE_ENCODER_RESET +ENCODER_RESET(iso_2022_jp) +{ + if (STATE_GETG0(state) != CHARSET_ASCII) { + RESERVE_OUTBUF(3) + WRITE3(ESC, '(', 'B') + STATE_SETG0(state, CHARSET_ASCII) + NEXT_OUT(3) + } + return 0; +} + +/* ISO-2022-JP changes designations instead of shifting-out */ + +ENCODER(iso_2022_jp) +{ + while (inleft > 0) { + Py_UNICODE c = **inbuf; + DBCHAR code; + + if (c < 0x80) { + switch (STATE_GETG0(state)) { + case CHARSET_ASCII: + WRITE1(c) + NEXT(1, 1) + break; + case CHARSET_JISX0201_R: + JISX0201_R_ENCODE(c, code) + else { /* FALLTHROUGH (yay!) 
*/ + default: + WRITE3(ESC, '(', 'B') + NEXT_OUT(3) + STATE_SETG0(state, CHARSET_ASCII) + code = c; + } + WRITE1(code) + NEXT(1, 1) + break; + } + if (c == '\n') + STATE_CLEARFLAG(state, F_SHIFTED) + } else UCS4INVALID(c) + else { + unsigned char charset; + + charset = STATE_GETG0(state); + if (charset == CHARSET_JISX0201_R) { + code = DBCINV; + JISX0201_R_ENCODE(c, code) + if (code != DBCINV) { + WRITE1(code) + NEXT(1, 1) + continue; + } + } + + TRYMAP_ENC(jisxcommon, code, c) { + if (code & 0x8000) /* MSB set: JIS X 0212 */ + return 1; + if (charset != CHARSET_JISX0208) { + WRITE3(ESC, '$', 'B') + STATE_SETG0(state, CHARSET_JISX0208) + NEXT_OUT(3) + } + WRITE2(code >> 8, code & 0xff) + NEXT(1, 2) + } else { + JISX0201_R_ENCODE(c, code) + else + return 1; + /* if (charset == CHARSET_JISX0201_R) : already checked */ + WRITE4(ESC, '(', 'J', code) + STATE_SETG0(state, CHARSET_JISX0201_R) + NEXT(1, 4) + } + } + } + + return 0; +} + +#define HAVE_DECODER_INIT +DECODER_INIT(iso_2022_jp) +{ + state->i = 0; + STATE_SETG0(state, CHARSET_ASCII) + STATE_SETG1(state, CHARSET_ASCII) + return 0; +} + +#define HAVE_DECODER_RESET +DECODER_RESET(iso_2022_jp) +{ + STATE_CLEARFLAG(state, F_SHIFTED) + return 0; +} + +DECODER(iso_2022_jp) +{ + while (inleft > 0) { + unsigned char c = **inbuf; + + if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { + /* ESC throughout mode: for non-iso2022 escape sequences */ + RESERVE_OUTBUF(1) + **outbuf = c; /* assume as ISO-8859-1 */ + NEXT(1, 1) + if (IS_ESCEND(c)) { + STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) + } + continue; + } + + switch (c) { + case ESC: + RESERVE_INBUF(2) + if (IS_ISO2022ESC((*inbuf)[1])) { + int eslen; + + eslen = iso2022esclen(*inbuf, inleft); + if (eslen < 0) + return eslen == MBERR_INTERNAL ? 
1 : eslen; + + if (eslen == 3) { + unsigned char charset; + + if ((*inbuf)[1] == '$') { + if ((*inbuf)[2] == '@' || (*inbuf)[2] == 'B') { + charset = (*inbuf)[2] | CHARSET_DOUBLEBYTE; + STATE_SETG0(state, charset); + } else + return 3; + } else { + if ((*inbuf)[2] == 'B' || (*inbuf)[2] == 'J') + charset = (*inbuf)[2]; + else + return 3; + + if ((*inbuf)[1] == '(') { + STATE_SETG0(state, charset) + } else if ((*inbuf)[1] == ')') { + STATE_SETG1(state, charset) + } else + return 3; + } + } else + return eslen; + NEXT_IN(eslen) + } else { + STATE_SETFLAG(state, F_ESCTHROUGHOUT) + **outbuf = ESC; + NEXT(1, 1) + } + break; + case SI: + STATE_CLEARFLAG(state, F_SHIFTED) + NEXT_IN(1) + break; + case SO: + STATE_SETFLAG(state, F_SHIFTED) + NEXT_IN(1) + break; + case '\n': + STATE_CLEARFLAG(state, F_SHIFTED) + /* FALLTHROUGH */ + case SP: /* FALLTHROUGH */ + case DEL: + RESERVE_OUTBUF(1) + **outbuf = c; + NEXT(1, 1) + break; + default: + if ((c & 0x7f) < 0x20) { /* C0 and C1 */ + RESERVE_OUTBUF(1) + **outbuf = c & 0x7f; + NEXT(1, 1) + } else { + unsigned char charset; + + if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ + charset = STATE_GETG0(state); + else /* G1 */ + charset = STATE_GETG1(state); + + if (charset & CHARSET_DOUBLEBYTE) { + /* all double byte character sets are in JIS X 0208 here. + * this means that we don't distinguish :1978 from :1983. 
*/ + RESERVE_INBUF(2) + RESERVE_OUTBUF(1) + TRYMAP_DEC(jisx0208, **outbuf, c & 0x7f, + (*inbuf)[1] & 0x7f) { + NEXT(2, 1) + } else + return 2; + } else if (charset == CHARSET_ASCII) { + RESERVE_OUTBUF(1) + **outbuf = c & 0x7f; + NEXT(1, 1) + } else if (charset == CHARSET_JISX0201_R) { + RESERVE_OUTBUF(1) + JISX0201_R_DECODE(c & 0x7f, **outbuf) + else + return 1; + NEXT(1, 1) + } else + return MBERR_INTERNAL; + } + } + } + + return 0; +} + +#include "codecentry.h" +BEGIN_CODEC_REGISTRY(iso_2022_jp) + MAPOPEN(ja_JP) + IMPORTMAP_DEC(jisx0208) + IMPORTMAP_ENC(jisxcommon) + MAPCLOSE() +END_CODEC_REGISTRY(iso_2022_jp) + +/* + * ex: ts=8 sts=4 et + */ 1.4 +295 -295 cjkcodecs/src/_iso_2022_jp_1.c Index: _iso_2022_jp_1.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_jp_1.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- _iso_2022_jp_1.c 5 Jun 2003 09:56:22 -0000 1.3 +++ _iso_2022_jp_1.c 6 Jul 2003 10:10:31 -0000 1.4 @@ -1,295 +1,295 @@ -/* - * _iso_2022_jp_1.c: the ISO-2022-JP-1 codec (RFC2237) - * - * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $Id: _iso_2022_jp_1.c,v 1.3 2003/06/05 09:56:22 perky Exp $ - */ - -#include "codeccommon.h" -#include "iso2022common.h" -#include "maps/alg_jisx0201.h" - -ENCMAP(jisxcommon) -DECMAP(jisx0208) -DECMAP(jisx0212) - -#define HAVE_ENCODER_INIT -ENCODER_INIT(iso_2022_jp_1) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, CHARSET_ASCII) - return 0; -} - -#define HAVE_ENCODER_RESET -ENCODER_RESET(iso_2022_jp_1) -{ - if (STATE_GETG0(state) != CHARSET_ASCII) { - RESERVE_OUTBUF(3) - PAVE3(ESC, '(', 'B') - STATE_SETG0(state, CHARSET_ASCII) - NEXT_OUT(3) - } - return 0; -} - -/* ISO-2022-JP-1 changes designations instead of shifting-out */ - -ENCODER(iso_2022_jp_1) -{ - while (inleft > 0) { - Py_UNICODE c = **inbuf; - DBCHAR code; - - if (c < 0x80) { - switch (STATE_GETG0(state)) { - case CHARSET_ASCII: - PAVE1(c) - NEXT(1, 1) - break; - case CHARSET_JISX0201_R: - JISX0201_R_ENCODE(c, code) - else { /* FALLTHROUGH (yay!) 
*/ - default: - PAVE3(ESC, '(', 'B') - NEXT_OUT(3) - STATE_SETG0(state, CHARSET_ASCII) - code = c; - } - PAVE1(code) - NEXT(1, 1) - break; - } - if (c == '\n') - STATE_CLEARFLAG(state, F_SHIFTED) - } else UCS4INVALID(c) - else { - unsigned char charset; - - charset = STATE_GETG0(state); - if (charset == CHARSET_JISX0201_R) { - code = DBCINV; - JISX0201_R_ENCODE(c, code) - if (code != DBCINV) { - PAVE1(code) - NEXT(1, 1) - continue; - } - } - - TRYMAP_ENC(jisxcommon, code, c) { - if (code & 0x8000) { /* MSB set: JIS X 0212 */ - if (charset != CHARSET_JISX0212) { - PAVE4(ESC, '$', '(', 'D') - STATE_SETG0(state, CHARSET_JISX0212) - NEXT_OUT(4) - } - PAVE2((code >> 8) & 0x7f, code & 0x7f) - } else { /* MSB unset: JIS X 0208 */ - if (charset != CHARSET_JISX0208) { - PAVE3(ESC, '$', 'B') - STATE_SETG0(state, CHARSET_JISX0208) - NEXT_OUT(3) - } - PAVE2(code >> 8, code & 0xff) - } - NEXT(1, 2) - } else { - JISX0201_R_ENCODE(c, code) - else - return 1; - /* if (charset == CHARSET_JISX0201_R) : already checked */ - PAVE4(ESC, '(', 'J', code) - STATE_SETG0(state, CHARSET_JISX0201_R) - NEXT(1, 4) - } - } - } - - return 0; -} - -#define HAVE_DECODER_INIT -DECODER_INIT(iso_2022_jp_1) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, CHARSET_ASCII) - return 0; -} - -#define HAVE_DECODER_RESET -DECODER_RESET(iso_2022_jp_1) -{ - STATE_CLEARFLAG(state, F_SHIFTED) - return 0; -} - -DECODER(iso_2022_jp_1) -{ - while (inleft > 0) { - unsigned char c = **inbuf; - - if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { - /* ESC throughout mode: for non-iso2022 escape sequences */ - RESERVE_OUTBUF(1) - **outbuf = c; /* assume as ISO-8859-1 */ - NEXT(1, 1) - if (IS_ESCEND(c)) { - STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) - } - continue; - } - - switch (c) { - case ESC: - RESERVE_INBUF(2) - if (IS_ISO2022ESC((*inbuf)[1])) { - int eslen; - - eslen = iso2022esclen(*inbuf, inleft); - if (eslen < 0) - return eslen == MBERR_INTERNAL ? 
1 : eslen; - - if (eslen == 3) { - unsigned char charset; - - if ((*inbuf)[1] == '$') { - if ((*inbuf)[2] == '@' || (*inbuf)[2] == 'B') { - charset = (*inbuf)[2] | CHARSET_DOUBLEBYTE; - STATE_SETG0(state, charset); - } else - return 3; - } else { - if ((*inbuf)[2] == 'B' || (*inbuf)[2] == 'J') - charset = (*inbuf)[2]; - else - return 3; - - if ((*inbuf)[1] == '(') { - STATE_SETG0(state, charset) - } else if ((*inbuf)[1] == ')') { - STATE_SETG1(state, charset) - } else - return 3; - } - } else if (eslen == 4) { - if ((*inbuf)[1] == '$' && (*inbuf)[3] == 'D') { - if ((*inbuf)[2] == '(') { - STATE_SETG0(state, CHARSET_JISX0212) - } else if ((*inbuf)[2] == ')') { - STATE_SETG1(state, CHARSET_JISX0212) - } else - return 4; - } else - return 4; - } else - return eslen; - NEXT_IN(eslen) - } else { - STATE_SETFLAG(state, F_ESCTHROUGHOUT) - **outbuf = ESC; - NEXT(1, 1) - } - break; - case SI: - STATE_CLEARFLAG(state, F_SHIFTED) - NEXT_IN(1) - break; - case SO: - STATE_SETFLAG(state, F_SHIFTED) - NEXT_IN(1) - break; - case '\n': - STATE_CLEARFLAG(state, F_SHIFTED) - /* FALLTHROUGH */ - case SP: /* FALLTHROUGH */ - case DEL: - RESERVE_OUTBUF(1) - **outbuf = c; - NEXT(1, 1) - break; - default: - if ((c & 0x7f) < 0x20) { /* C0 and C1 */ - RESERVE_OUTBUF(1) - **outbuf = c & 0x7f; - NEXT(1, 1) - } else { - unsigned char charset; - - if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ - charset = STATE_GETG0(state); - else /* G1 */ - charset = STATE_GETG1(state); - - if (charset & CHARSET_DOUBLEBYTE) { - RESERVE_INBUF(2) - RESERVE_OUTBUF(1) - if (charset == CHARSET_JISX0208 || - charset == CHARSET_JISX0208_O) { - TRYMAP_DEC(jisx0208, **outbuf, c & 0x7f, - (*inbuf)[1] & 0x7f); - else return 2; - } else if (charset == CHARSET_JISX0212) { - TRYMAP_DEC(jisx0212, **outbuf, c & 0x7f, - (*inbuf)[1] & 0x7f); - else return 2; - } else - return MBERR_INTERNAL; - NEXT(2, 1) - } else if (charset == CHARSET_ASCII) { - RESERVE_OUTBUF(1) - **outbuf = c & 0x7f; - NEXT(1, 1) - } else if 
(charset == CHARSET_JISX0201_R) { - RESERVE_OUTBUF(1) - JISX0201_R_DECODE(c & 0x7f, **outbuf) - else - return 1; - NEXT(1, 1) - } else - return MBERR_INTERNAL; - } - } - } - - return 0; -} - -#include "codecentry.h" -BEGIN_CODEC_REGISTRY(iso_2022_jp_1) - MAPOPEN(ja_JP) - IMPORTMAP_DEC(jisx0208) - IMPORTMAP_DEC(jisx0212) - IMPORTMAP_ENC(jisxcommon) - MAPCLOSE() -END_CODEC_REGISTRY(iso_2022_jp_1) - -/* - * ex: ts=8 sts=4 et - */ +/* + * _iso_2022_jp_1.c: the ISO-2022-JP-1 codec (RFC2237) + * + * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $Id: _iso_2022_jp_1.c,v 1.4 2003/07/06 10:10:31 perky Exp $ + */ + +#include "codeccommon.h" +#include "iso2022common.h" +#include "maps/alg_jisx0201.h" + +ENCMAP(jisxcommon) +DECMAP(jisx0208) +DECMAP(jisx0212) + +#define HAVE_ENCODER_INIT +ENCODER_INIT(iso_2022_jp_1) +{ + state->i = 0; + STATE_SETG0(state, CHARSET_ASCII) + STATE_SETG1(state, CHARSET_ASCII) + return 0; +} + +#define HAVE_ENCODER_RESET +ENCODER_RESET(iso_2022_jp_1) +{ + if (STATE_GETG0(state) != CHARSET_ASCII) { + RESERVE_OUTBUF(3) + WRITE3(ESC, '(', 'B') + STATE_SETG0(state, CHARSET_ASCII) + NEXT_OUT(3) + } + return 0; +} + +/* ISO-2022-JP-1 changes designations instead of shifting-out */ + +ENCODER(iso_2022_jp_1) +{ + while (inleft > 0) { + Py_UNICODE c = **inbuf; + DBCHAR code; + + if (c < 0x80) { + switch (STATE_GETG0(state)) { + case CHARSET_ASCII: + WRITE1(c) + NEXT(1, 1) + break; + case CHARSET_JISX0201_R: + JISX0201_R_ENCODE(c, code) + else { /* FALLTHROUGH (yay!) */ + default: + WRITE3(ESC, '(', 'B') + NEXT_OUT(3) + STATE_SETG0(state, CHARSET_ASCII) + code = c; + } + WRITE1(code) + NEXT(1, 1) + break; + } + if (c == '\n') + STATE_CLEARFLAG(state, F_SHIFTED) + } else UCS4INVALID(c) + else { + unsigned char charset; + + charset = STATE_GETG0(state); + if (charset == CHARSET_JISX0201_R) { + code = DBCINV; + JISX0201_R_ENCODE(c, code) + if (code != DBCINV) { + WRITE1(code) + NEXT(1, 1) + continue; + } + } + + TRYMAP_ENC(jisxcommon, code, c) { + if (code & 0x8000) { /* MSB set: JIS X 0212 */ + if (charset != CHARSET_JISX0212) { + WRITE4(ESC, '$', '(', 'D') + STATE_SETG0(state, CHARSET_JISX0212) + NEXT_OUT(4) + } + WRITE2((code >> 8) & 0x7f, code & 0x7f) + } else { /* MSB unset: JIS X 0208 */ + if (charset != CHARSET_JISX0208) { + WRITE3(ESC, '$', 'B') + STATE_SETG0(state, CHARSET_JISX0208) + NEXT_OUT(3) + } + WRITE2(code >> 8, code & 0xff) + } + NEXT(1, 2) + } else { + JISX0201_R_ENCODE(c, code) + else + return 1; + /* if (charset == CHARSET_JISX0201_R) : already checked */ + WRITE4(ESC, 
'(', 'J', code) + STATE_SETG0(state, CHARSET_JISX0201_R) + NEXT(1, 4) + } + } + } + + return 0; +} + +#define HAVE_DECODER_INIT +DECODER_INIT(iso_2022_jp_1) +{ + state->i = 0; + STATE_SETG0(state, CHARSET_ASCII) + STATE_SETG1(state, CHARSET_ASCII) + return 0; +} + +#define HAVE_DECODER_RESET +DECODER_RESET(iso_2022_jp_1) +{ + STATE_CLEARFLAG(state, F_SHIFTED) + return 0; +} + +DECODER(iso_2022_jp_1) +{ + while (inleft > 0) { + unsigned char c = **inbuf; + + if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { + /* ESC throughout mode: for non-iso2022 escape sequences */ + RESERVE_OUTBUF(1) + **outbuf = c; /* assume as ISO-8859-1 */ + NEXT(1, 1) + if (IS_ESCEND(c)) { + STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) + } + continue; + } + + switch (c) { + case ESC: + RESERVE_INBUF(2) + if (IS_ISO2022ESC((*inbuf)[1])) { + int eslen; + + eslen = iso2022esclen(*inbuf, inleft); + if (eslen < 0) + return eslen == MBERR_INTERNAL ? 1 : eslen; + + if (eslen == 3) { + unsigned char charset; + + if ((*inbuf)[1] == '$') { + if ((*inbuf)[2] == '@' || (*inbuf)[2] == 'B') { + charset = (*inbuf)[2] | CHARSET_DOUBLEBYTE; + STATE_SETG0(state, charset); + } else + return 3; + } else { + if ((*inbuf)[2] == 'B' || (*inbuf)[2] == 'J') + charset = (*inbuf)[2]; + else + return 3; + + if ((*inbuf)[1] == '(') { + STATE_SETG0(state, charset) + } else if ((*inbuf)[1] == ')') { + STATE_SETG1(state, charset) + } else + return 3; + } + } else if (eslen == 4) { + if ((*inbuf)[1] == '$' && (*inbuf)[3] == 'D') { + if ((*inbuf)[2] == '(') { + STATE_SETG0(state, CHARSET_JISX0212) + } else if ((*inbuf)[2] == ')') { + STATE_SETG1(state, CHARSET_JISX0212) + } else + return 4; + } else + return 4; + } else + return eslen; + NEXT_IN(eslen) + } else { + STATE_SETFLAG(state, F_ESCTHROUGHOUT) + **outbuf = ESC; + NEXT(1, 1) + } + break; + case SI: + STATE_CLEARFLAG(state, F_SHIFTED) + NEXT_IN(1) + break; + case SO: + STATE_SETFLAG(state, F_SHIFTED) + NEXT_IN(1) + break; + case '\n': + STATE_CLEARFLAG(state, F_SHIFTED) + /* 
FALLTHROUGH */ + case SP: /* FALLTHROUGH */ + case DEL: + RESERVE_OUTBUF(1) + **outbuf = c; + NEXT(1, 1) + break; + default: + if ((c & 0x7f) < 0x20) { /* C0 and C1 */ + RESERVE_OUTBUF(1) + **outbuf = c & 0x7f; + NEXT(1, 1) + } else { + unsigned char charset; + + if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ + charset = STATE_GETG0(state); + else /* G1 */ + charset = STATE_GETG1(state); + + if (charset & CHARSET_DOUBLEBYTE) { + RESERVE_INBUF(2) + RESERVE_OUTBUF(1) + if (charset == CHARSET_JISX0208 || + charset == CHARSET_JISX0208_O) { + TRYMAP_DEC(jisx0208, **outbuf, c & 0x7f, + (*inbuf)[1] & 0x7f); + else return 2; + } else if (charset == CHARSET_JISX0212) { + TRYMAP_DEC(jisx0212, **outbuf, c & 0x7f, + (*inbuf)[1] & 0x7f); + else return 2; + } else + return MBERR_INTERNAL; + NEXT(2, 1) + } else if (charset == CHARSET_ASCII) { + RESERVE_OUTBUF(1) + **outbuf = c & 0x7f; + NEXT(1, 1) + } else if (charset == CHARSET_JISX0201_R) { + RESERVE_OUTBUF(1) + JISX0201_R_DECODE(c & 0x7f, **outbuf) + else + return 1; + NEXT(1, 1) + } else + return MBERR_INTERNAL; + } + } + } + + return 0; +} + +#include "codecentry.h" +BEGIN_CODEC_REGISTRY(iso_2022_jp_1) + MAPOPEN(ja_JP) + IMPORTMAP_DEC(jisx0208) + IMPORTMAP_DEC(jisx0212) + IMPORTMAP_ENC(jisxcommon) + MAPCLOSE() +END_CODEC_REGISTRY(iso_2022_jp_1) + +/* + * ex: ts=8 sts=4 et + */ 1.6 +235 -235 cjkcodecs/src/_iso_2022_kr.c Index: _iso_2022_kr.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_kr.c,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- _iso_2022_kr.c 2 Jun 2003 10:52:48 -0000 1.5 +++ _iso_2022_kr.c 6 Jul 2003 10:10:31 -0000 1.6 @@ -1,235 +1,235 @@ -/* - * _iso_2022_kr.c: the ISO-2022-KR codec (RFC1557) - * - * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $Id: _iso_2022_kr.c,v 1.5 2003/06/02 10:52:48 perky Exp $ - */ - -#include "codeccommon.h" -#include "iso2022common.h" - -ENCMAP(cp949) -DECMAP(ksx1001) - -#define HAVE_ENCODER_INIT -ENCODER_INIT(iso_2022_kr) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, CHARSET_ASCII) - return 0; -} - -#define HAVE_ENCODER_RESET -ENCODER_RESET(iso_2022_kr) -{ - if (STATE_GETFLAG(state, F_SHIFTED)) { - RESERVE_OUTBUF(1) - **outbuf = SI; - NEXT_OUT(1) - STATE_CLEARFLAG(state, F_SHIFTED) - } - return 0; -} - -ENCODER(iso_2022_kr) -{ - while (inleft > 0) { - Py_UNICODE c = **inbuf; - DBCHAR code; - - if (c < 0x80) { - if (STATE_GETFLAG(state, F_SHIFTED)) { - PAVE2(SI, c) - STATE_CLEARFLAG(state, F_SHIFTED) - NEXT(1, 2) - } else { - PAVE1(c) - NEXT(1, 1) - } - if (c == '\n') - STATE_CLEARFLAG(state, F_SHIFTED) - } else UCS4INVALID(c) - else { - if (STATE_GETG1(state) != CHARSET_KSX1001) { - PAVE4(ESC, '$', ')', 'C') - STATE_SETG1(state, CHARSET_KSX1001) - NEXT_OUT(4) - } - - if (!STATE_GETFLAG(state, F_SHIFTED)) { - PAVE1(SO) - STATE_SETFLAG(state, F_SHIFTED) - NEXT_OUT(1) - } - - TRYMAP_ENC(cp949, code, c) { - if (code & 0x8000) /* MSB set: CP949 */ - return 1; - PAVE2(code >> 8, code & 0xff) - NEXT(1, 2) - } else - return 1; - } - } - - return 0; -} - -#define HAVE_DECODER_INIT -DECODER_INIT(iso_2022_kr) -{ - state->i = 0; - STATE_SETG0(state, CHARSET_ASCII) - STATE_SETG1(state, CHARSET_ASCII) - return 0; -} - -#define HAVE_DECODER_RESET -DECODER_RESET(iso_2022_kr) -{ - STATE_CLEARFLAG(state, F_SHIFTED) - return 0; -} - -DECODER(iso_2022_kr) -{ - while (inleft > 0) { - unsigned char c = **inbuf; - - if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { - /* ESC throughout mode: for non-iso2022 escape sequences */ - RESERVE_OUTBUF(1) - **outbuf = c; /* assume as ISO-8859-1 */ - NEXT(1, 1) - if (IS_ESCEND(c)) { - STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) - } - continue; - } - - switch (c) { - case ESC: - RESERVE_INBUF(2) - if (IS_ISO2022ESC((*inbuf)[1])) { - 
int eslen; - - eslen = iso2022esclen(*inbuf, inleft); - if (eslen < 0) - return eslen == MBERR_INTERNAL ? 1 : eslen; - - if (eslen == 3) { - if ((*inbuf)[2] == 'B') { /* ASCII */ - if ((*inbuf)[1] == '(') { - STATE_SETG0(state, CHARSET_ASCII) - } else if ((*inbuf)[1] == ')') { - STATE_SETG1(state, CHARSET_ASCII) - } else - return 3; - } else - return 3; - } else if (eslen == 4) { - if ((*inbuf)[1] == '$' && (*inbuf)[3] == 'C') { - /* KS X 1001 */ - if ((*inbuf)[2] == '(') { - STATE_SETG0(state, CHARSET_KSX1001) - } else if ((*inbuf)[2] == ')') { - STATE_SETG1(state, CHARSET_KSX1001) - } else - return 4; - } else - return 4; - } else - return eslen; - NEXT_IN(eslen) - } else { - ... [truncated message content] |
From: Hye-Shik C. <pe...@us...> - 2003-07-05 20:09:22
|
perky 03/07/05 13:09:21 Added: tests test_multibytecodec.py Log: Add a unittest for StreamWriter Revision Changes Path 1.1 cjkcodecs/tests/test_multibytecodec.py Index: test_multibytecodec.py =================================================================== #!/usr/bin/env python # # test_multibytecodec.py: Unit test for multibytecodec itself # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: test_multibytecodec.py,v 1.1 2003/07/05 20:09:21 perky Exp $ # from test import test_support import test_multibytecodec_support import unittest, StringIO, codecs class Test_StreamWriter(unittest.TestCase): if len(u'\U00012345') == 2: # UCS2 def test_gb18030(self): s= StringIO.StringIO() c = codecs.lookup('cjkcodecs.gb18030')[3](s) c.write(u'123') self.assertEqual(s.getvalue(), '123') c.write(u'\U00012345') self.assertEqual(s.getvalue(), '123\x907\x959') c.write(u'\U00012345'[0]) self.assertEqual(s.getvalue(), '123\x907\x959') c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac') self.assertEqual(s.getvalue(), '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851') c.write(u'\U00012345'[0]) self.assertEqual(s.getvalue(), '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851') self.assertRaises(UnicodeError, c.reset) self.assertEqual(s.getvalue(), '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851') def test_utf_8(self): s= StringIO.StringIO() c = codecs.lookup('cjkcodecs.utf-8')[3](s) c.write(u'123') self.assertEqual(s.getvalue(), '123') c.write(u'\U00012345') self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85') c.write(u'\U00012345'[0]) self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85') c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac') self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85' '\xea\xb0\x80\xc2\xac') c.write(u'\U00012345'[0]) self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85' '\xea\xb0\x80\xc2\xac') c.reset() self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85' '\xea\xb0\x80\xc2\xac\xed\xa0\x88') c.write(u'\U00012345'[1]) self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85' '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85') else: # UCS4 pass def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(Test_StreamWriter)) test_support.run_suite(suite) if __name__ == "__main__": 
test_main() # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-07-05 19:53:48
|
perky 03/07/05 12:53:47 Modified: . setup.py Log: Check for mingw32 only on the win32 platform. Revision Changes Path 1.23 +2 -2 cjkcodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/setup.py,v retrieving revision 1.22 retrieving revision 1.23 diff -u -r1.22 -r1.23 --- setup.py 20 Jun 2003 17:33:24 -0000 1.22 +++ setup.py 5 Jul 2003 19:53:47 -0000 1.23 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: setup.py,v 1.22 2003/06/20 17:33:24 perky Exp $ +# $Id: setup.py,v 1.23 2003/07/05 19:53:47 perky Exp $ # import sys @@ -69,7 +69,7 @@ sys.argv.remove(arg) -if sys.platform and '--compiler=mingw32' in sys.argv: +if sys.platform == 'win32' and '--compiler=mingw32' in sys.argv: LIBDIRS.append('.') # libpython23.a and libpython23.def for loc in locales: |
From: Hye-Shik C. <pe...@us...> - 2003-07-05 19:49:04
|
perky 03/07/05 12:49:03 Modified: src _utf_8.c codeccommon.h multibytecodec.c multibytecodec.h Log: StreamWriter became to be able to buffer incomplete sequences. (this feature is used for surrogate-pair and mapping from unicode character with a following modifier) Revision Changes Path 1.9 +6 -24 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- _utf_8.c 1 Jul 2003 20:45:27 -0000 1.8 +++ _utf_8.c 5 Jul 2003 19:49:02 -0000 1.9 @@ -26,32 +26,11 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.8 2003/07/01 20:45:27 perky Exp $ + * $Id: _utf_8.c,v 1.9 2003/07/05 19:49:02 perky Exp $ */ #include "codeccommon.h" -#if Py_UNICODE_SIZE == 2 -#define HAVE_ENCODER_RESET -ENCODER_RESET(utf_8) -{ - assert(inleft == 0 || inleft == 1); - - if (inleft) { /* all pending characters are "high surrogate" */ - ucs4_t c = **inbuf; - - RESERVE_OUTBUF(3) - (*outbuf)[2] = 0x80 | ((c) & 0x3f); - (c) = (c) >> 6; (c) |= 0x800; - (*outbuf)[1] = 0x80 | ((c) & 0x3f); - (c) = (c) >> 6; (c) |= 0xc0; - (*outbuf)[0] = (c); - NEXT(1, 3) - } - return 0; -} -#endif /* Py_UNICODE_SIZE == 2 */ - ENCODER(utf_8) { while (inleft > 0) { @@ -63,8 +42,11 @@ else { #if Py_UNICODE_SIZE == 2 if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ - RESERVE_INBUF(2) - if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { /* low surrogate */ + if (inleft < 2) { + if (!(flags & MBENC_FLUSH)) + return MBERR_TOOFEW; + } else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { + /* low surrogate */ c = 0x10000 + ((c - 0xd800) << 10) + ((ucs4_t)((*inbuf)[1]) - 0xdc00); insize = 2; 1.15 +2 -3 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.14 retrieving 
revision 1.15 diff -u -r1.14 -r1.15 --- codeccommon.h 1 Jul 2003 19:33:43 -0000 1.14 +++ codeccommon.h 5 Jul 2003 19:49:02 -0000 1.15 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: codeccommon.h,v 1.14 2003/07/01 19:33:43 perky Exp $ + * $Id: codeccommon.h,v 1.15 2003/07/05 19:49:02 perky Exp $ */ #include "Python.h" @@ -46,11 +46,10 @@ static int encoding##_encode( \ MultibyteCodec_State *state, \ const Py_UNICODE **inbuf, size_t inleft, \ - unsigned char **outbuf, size_t outleft) + unsigned char **outbuf, size_t outleft, int flags) #define ENCODER_RESET(encoding) \ static int encoding##_encode_reset( \ MultibyteCodec_State *state, \ - const Py_UNICODE **inbuf, size_t inleft, \ unsigned char **outbuf, size_t outleft) #define DECODER_INIT(encoding) \ 1.20 +81 -66 cjkcodecs/src/multibytecodec.c Index: multibytecodec.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v retrieving revision 1.19 retrieving revision 1.20 diff -u -r1.19 -r1.20 --- multibytecodec.c 1 Jul 2003 20:45:27 -0000 1.19 +++ multibytecodec.c 5 Jul 2003 19:49:02 -0000 1.20 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.c,v 1.19 2003/07/01 20:45:27 perky Exp $ + * $Id: multibytecodec.c,v 1.20 2003/07/05 19:49:02 perky Exp $ */ #include "Python.h" @@ -73,13 +73,15 @@ static char *streamkwarglist[] = {"stream", "errors", NULL}; static PyObject *multibytecodec_encode(MultibyteCodec *, - MultibyteCodec_State *, const Py_UNICODE *, int, PyObject *, - int flush); + MultibyteCodec_State *, const Py_UNICODE **, size_t, + PyObject *, int); static PyObject *mbstreamreader_create(MultibyteCodec *, PyObject *, const char *); static PyObject *mbstreamwriter_create(MultibyteCodec *, PyObject *, const char *); +#define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ + static PyObject * make_tuple(PyObject *unicode, int len) { @@ -214,7 +216,7 @@ size_t outleft; outleft = (size_t)(buf->outbuf_end - buf->outbuf); - r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft); + r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft, 0); if (r == MBERR_TOOSMALL) { RESERVE_ENCODEBUFFER(buf, -1); continue; @@ -291,10 +293,14 @@ goto errorexit; } - retstr = multibytecodec_encode(codec, state, PyUnicode_AS_UNICODE(tobj), - PyUnicode_GET_SIZE(tobj), ERROR_STRICT, 0); - if (retstr == NULL) - goto errorexit; + { + const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj); + + retstr = multibytecodec_encode(codec, state, &uraw, + PyUnicode_GET_SIZE(tobj), ERROR_STRICT, MBENC_FLUSH); + if (retstr == NULL) + goto errorexit; + } retstrsize = PyString_GET_SIZE(retstr); RESERVE_ENCODEBUFFER(buf, retstrsize); @@ -448,8 +454,8 @@ static PyObject * multibytecodec_encode(MultibyteCodec *codec, MultibyteCodec_State *state, - const Py_UNICODE *data, int datalen, - PyObject *errors, int flush) + const Py_UNICODE **data, size_t datalen, + PyObject *errors, int flags) { MultibyteEncodeBuffer buf; int finalsize, r = 0; @@ -458,7 +464,7 @@ return PyString_FromString(""); buf.excobj = NULL; - buf.inbuf = buf.inbuf_top = data; + buf.inbuf = buf.inbuf_top = *data; buf.inbuf_end = 
buf.inbuf_top + datalen; buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16); if (buf.outobj == NULL) @@ -473,33 +479,28 @@ * error callbacks can relocate the cursor anywhere on buffer */ inleft = (size_t)(buf.inbuf_end - buf.inbuf); outleft = (size_t)(buf.outbuf_end - buf.outbuf); - r = codec->encode(state, &buf.inbuf, inleft, &buf.outbuf, outleft); - if (r == 0 || r == MBERR_TOOFEW) + r = codec->encode(state, &buf.inbuf, inleft, + &buf.outbuf, outleft, flags); + *data = buf.inbuf; + if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH))) break; else if (multibytecodec_encerror(codec, state, &buf, errors, r)) goto errorexit; + else if (r == MBERR_TOOFEW) + break; } - if (flush) { - if (codec->encreset == NULL) { - if (r == MBERR_TOOFEW) { - if (multibytecodec_encerror(codec, state, &buf, errors, r)) - goto errorexit; - } - } else for (;;) { - size_t inleft, outleft; + if (codec->encreset != NULL) + for (;;) { + size_t outleft; - /* inleft can be non-zero value when r == MBERR_TOOFEW */ - inleft = (size_t)(buf.inbuf_end - buf.inbuf); outleft = (size_t)(buf.outbuf_end - buf.outbuf); - r = codec->encreset(state, &buf.inbuf, inleft, - &buf.outbuf, outleft); + r = codec->encreset(state, &buf.outbuf, outleft); if (r == 0) break; else if (multibytecodec_encerror(codec, state, &buf, errors, r)) goto errorexit; } - } finalsize = (int)((char*)buf.outbuf - PyString_AS_STRING(buf.outobj)); @@ -536,7 +537,8 @@ if (self->codec->encinit != NULL && self->codec->encinit(&state) != 0) goto errorexit; - r = multibytecodec_encode(self->codec, &state, data, datalen, errorcb, 1); + r = multibytecodec_encode(self->codec, &state, (const Py_UNICODE **)&data, + datalen, errorcb, MBENC_FLUSH | MBENC_RESET); if (r == NULL) goto errorexit; @@ -977,6 +979,7 @@ PyObject *unistr) { PyObject *wr, *r = NULL; + Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; int rsize; if (!PyUnicode_Check(unistr)) { @@ -989,20 +992,49 @@ if (rsize == 0) return 0; + if (self->pendingsize > 0) { 
+ inbuf_tmp = PyMem_New(Py_UNICODE, rsize + self->pendingsize); + if (inbuf_tmp == NULL) + goto errorexit; + memcpy(inbuf_tmp, self->pending, Py_UNICODE_SIZE * self->pendingsize); + memcpy(inbuf_tmp + self->pendingsize, PyUnicode_AS_UNICODE(unistr), + Py_UNICODE_SIZE * rsize); + rsize += self->pendingsize; + self->pendingsize = 0; + inbuf = inbuf_tmp; + } else + inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); + + inbuf_end = inbuf + rsize; + r = multibytecodec_encode(self->codec, &self->state, - (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr), rsize, self->errors, 0); + (const Py_UNICODE **)&inbuf, rsize, self->errors, 0); if (r == NULL) goto errorexit; + if (inbuf < inbuf_end) { + self->pendingsize = (int)(inbuf_end - inbuf); + if (self->pendingsize > MAXENCPENDING) { + self->pendingsize = 0; + PyErr_SetString(PyExc_RuntimeError, "pending buffer overflow"); + goto errorexit; + } + memcpy(self->pending, inbuf, self->pendingsize * Py_UNICODE_SIZE); + } + wr = PyObject_CallMethod(self->stream, "write", "O", r); if (wr == NULL) goto errorexit; + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); Py_DECREF(r); Py_DECREF(wr); return 0; errorexit: + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); Py_XDECREF(r); return -1; } @@ -1056,48 +1088,30 @@ static PyObject * mbstreamwriter_reset(MultibyteStreamWriterObject *self) { - if (self->codec->encreset != NULL) { - PyObject *rsbuf = NULL; - size_t rsbufsiz, rsbufnc; - int r; - unsigned char *rsbuf_top, *rsbuf_cur; - - rsbufnc = 0; - for (rsbufsiz = 0;;rsbufsiz *= 2) { - if (rsbuf == NULL) { - rsbuf = PyString_FromStringAndSize(NULL, rsbufsiz); - if (rsbuf == NULL) - return NULL; - } else { - if (_PyString_Resize(&rsbuf, rsbufsiz)) - goto errorexit; - } - rsbuf_top = (unsigned char *)PyString_AS_STRING(rsbuf); - rsbuf_cur = rsbuf_top + rsbufnc; - - r = self->codec->encreset(&self->state, - NULL, 0, &rsbuf_cur, rsbufsiz - rsbufnc); - rsbufnc = (size_t)(rsbuf_cur - rsbuf_top); - if (r == MBERR_TOOSMALL) - continue; - else { - if (r 
!= 0) - goto errorexit; - else - break; - } - } + const Py_UNICODE *pending; + PyObject *pwrt; - if (_PyString_Resize(&rsbuf, rsbufnc)) { -errorexit: Py_DECREF(rsbuf); - return NULL; - } + pending = self->pending; + pwrt = multibytecodec_encode(self->codec, &self->state, + &pending, self->pendingsize, self->errors, + MBENC_FLUSH | MBENC_RESET); + /* some pending buffer can be truncated when UnicodeEncodeError is + * raised on 'strict' mode. but, 'reset' method is designed to + * reset the pending buffer or states so failed string sequence + * ought to be missed */ + self->pendingsize = 0; + if (pwrt == NULL) + return NULL; - r = mbstreamwriter_iwrite(self, rsbuf); - Py_DECREF(rsbuf); - if (r == -1) + if (PyString_Size(pwrt) > 0) { + PyObject *wr; + wr = PyObject_CallMethod(self->stream, "write", "O", pwrt); + if (wr == NULL) { + Py_DECREF(pwrt); return NULL; + } } + Py_DECREF(pwrt); Py_INCREF(Py_None); return Py_None; @@ -1232,6 +1246,7 @@ self->codec = codec; self->stream = stream; Py_INCREF(stream); + self->pendingsize = 0; self->errors = get_errorcallback(errors); if (self->errors == NULL) goto errorexit; 1.9 +8 -3 cjkcodecs/src/multibytecodec.h Index: multibytecodec.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.h,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- multibytecodec.h 1 Jul 2003 19:33:43 -0000 1.8 +++ multibytecodec.h 5 Jul 2003 19:49:02 -0000 1.9 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.h,v 1.8 2003/07/01 19:33:43 perky Exp $ + * $Id: multibytecodec.h,v 1.9 2003/07/05 19:49:02 perky Exp $ */ #ifndef _PYTHON_MULTIBYTECODEC_H_ @@ -35,17 +35,19 @@ extern "C" { #endif +#define MAXCHARSTATE 8 typedef union { unsigned long i; void *p; + unsigned char c[MAXCHARSTATE]; } MultibyteCodec_State; typedef int (*mbencode_func)(MultibyteCodec_State *state, const Py_UNICODE **inbuf, size_t inleft, - unsigned char **outbuf, size_t outleft); + unsigned char **outbuf, size_t outleft, + int flags); typedef int (*mbencodeinit_func)(MultibyteCodec_State *state); typedef int (*mbencodereset_func)(MultibyteCodec_State *state, - const Py_UNICODE **inbuf, size_t inleft, unsigned char **outbuf, size_t outleft); typedef int (*mbdecode_func)(MultibyteCodec_State *state, const unsigned char **inbuf, size_t inleft, @@ -97,6 +99,9 @@ #define ERROR_IGNORE (PyObject *)(2) #define ERROR_REPLACE (PyObject *)(3) #define ERROR_MAX ERROR_REPLACE + +#define MBENC_FLUSH 0x0001 /* encode all characters encodable */ +#define MBENC_MAX MBENC_FLUSH #ifdef __cplusplus } |
From: Hye-Shik C. <pe...@us...> - 2003-07-05 19:49:04
|
perky 03/07/05 12:49:02 Modified: . CHANGES Log: StreamWriter became to be able to buffer incomplete sequences. (this feature is used for surrogate-pair and mapping from unicode character with a following modifier) Revision Changes Path 1.8 +4 -0 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- CHANGES 1 Jul 2003 20:45:27 -0000 1.7 +++ CHANGES 5 Jul 2003 19:49:02 -0000 1.8 @@ -25,3 +25,7 @@ *) Fixed gb18030 codec's syntax error that disturbs compilation on python compiled with --with-unicode=ucs4 option. [Son, Kyung-uk] + *) StreamWriter became to be able to buffer incomplete sequences. + (this feature is used for surrogate-pair and mapping from unicode + character with a following modifier) + |
From: Hye-Shik C. <pe...@us...> - 2003-07-01 19:35:16
|
perky 03/07/01 12:33:43 Modified: src codeccommon.h multibytecodec.c multibytecodec.h Log: - Prepare buffering encoder framework for jisx0213 and surrogates - Set '\U+xxxxxxxx' instead of '\u..' on python versions under 2.2 Revision Changes Path 1.14 +2 -1 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.13 retrieving revision 1.14 diff -u -r1.13 -r1.14 --- codeccommon.h 20 Jun 2003 17:22:59 -0000 1.13 +++ codeccommon.h 1 Jul 2003 19:33:43 -0000 1.14 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: codeccommon.h,v 1.13 2003/06/20 17:22:59 perky Exp $ + * $Id: codeccommon.h,v 1.14 2003/07/01 19:33:43 perky Exp $ */ #include "Python.h" @@ -50,6 +50,7 @@ #define ENCODER_RESET(encoding) \ static int encoding##_encode_reset( \ MultibyteCodec_State *state, \ + const Py_UNICODE **inbuf, size_t inleft, \ unsigned char **outbuf, size_t outleft) #define DECODER_INIT(encoding) \ 1.18 +16 -9 cjkcodecs/src/multibytecodec.c Index: multibytecodec.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v retrieving revision 1.17 retrieving revision 1.18 diff -u -r1.17 -r1.18 --- multibytecodec.c 6 Jun 2003 06:56:01 -0000 1.17 +++ multibytecodec.c 1 Jul 2003 19:33:43 -0000 1.18 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.c,v 1.17 2003/06/06 06:56:01 perky Exp $ + * $Id: multibytecodec.c,v 1.18 2003/07/01 19:33:43 perky Exp $ */ #include "Python.h" @@ -236,11 +236,18 @@ end = start + esize; #ifdef NO_ERROR_CALLBACKS - if (esize == 1) - PyErr_Format(PyExc_UnicodeError, - "'%s' codec can't encode byte '\\u%04x' in position %d: %s", - codec->encoding, *buf->inbuf, start, reason); - else + if (esize == 1) { +#if Py_UNICODE_SIZE == 4 + if (*buf->inbuf >= 0x10000) + PyErr_Format(PyExc_UnicodeError, + "'%s' codec can't encode byte '\\U%08x' in position %d: %s", + codec->encoding, *buf->inbuf, start, reason); + else +#endif + PyErr_Format(PyExc_UnicodeError, + "'%s' codec can't encode byte '\\u%04x' in position %d: %s", + codec->encoding, *buf->inbuf, start, reason); + } else PyErr_Format(PyExc_UnicodeError, "'%s' codec can't encode bytes in position %d-%d: %s", codec->encoding, start, end, reason); @@ -480,7 +487,7 @@ size_t outleft; outleft = (size_t)(buf.outbuf_end - buf.outbuf); - r = codec->encreset(state, &buf.outbuf, outleft); + r = codec->encreset(state, NULL, 0, &buf.outbuf, outleft); if (r == 0) break; else if (multibytecodec_encerror(codec, state, &buf, errors, r)) @@ -783,7 +790,7 @@ /* we can't assume that pendingsize is still 0 here. 
because * this function can be called recursively from error callback */ npendings = (size_t)(buf.inbuf_end - buf.inbuf); - if (npendings + self->pendingsize > MAXPENDING) { + if (npendings + self->pendingsize > MAXDECPENDING) { PyErr_SetString(PyExc_RuntimeError, "pending buffer overflow"); goto errorexit; @@ -1062,7 +1069,7 @@ rsbuf_cur = rsbuf_top + rsbufnc; r = self->codec->encreset(&self->state, - &rsbuf_cur, rsbufsiz - rsbufnc); + NULL, 0, &rsbuf_cur, rsbufsiz - rsbufnc); rsbufnc = (size_t)(rsbuf_cur - rsbuf_top); if (r == MBERR_TOOSMALL) continue; 1.8 +7 -3 cjkcodecs/src/multibytecodec.h Index: multibytecodec.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.h,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- multibytecodec.h 31 May 2003 11:50:19 -0000 1.7 +++ multibytecodec.h 1 Jul 2003 19:33:43 -0000 1.8 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.h,v 1.7 2003/05/31 11:50:19 perky Exp $ + * $Id: multibytecodec.h,v 1.8 2003/07/01 19:33:43 perky Exp $ */ #ifndef _PYTHON_MULTIBYTECODEC_H_ @@ -45,6 +45,7 @@ unsigned char **outbuf, size_t outleft); typedef int (*mbencodeinit_func)(MultibyteCodec_State *state); typedef int (*mbencodereset_func)(MultibyteCodec_State *state, + const Py_UNICODE **inbuf, size_t inleft, unsigned char **outbuf, size_t outleft); typedef int (*mbdecode_func)(MultibyteCodec_State *state, const unsigned char **inbuf, size_t inleft, @@ -67,20 +68,23 @@ MultibyteCodec *codec; } MultibyteCodecObject; -#define MAXPENDING 8 +#define MAXDECPENDING 8 typedef struct { PyObject_HEAD MultibyteCodec *codec; MultibyteCodec_State state; - unsigned char pending[MAXPENDING]; + unsigned char pending[MAXDECPENDING]; int pendingsize; PyObject *stream, *errors; } MultibyteStreamReaderObject; +#define MAXENCPENDING 2 typedef struct { PyObject_HEAD MultibyteCodec *codec; MultibyteCodec_State state; + Py_UNICODE pending[MAXENCPENDING]; + int pendingsize; PyObject *stream, *errors; } MultibyteStreamWriterObject; |
From: Hye-Shik C. <pe...@us...> - 2003-07-01 08:55:13
|
perky 03/07/01 01:55:11 Modified: . CHANGES THANKS Log: *) Fixed gb18030 codec's syntax error that disturbs compilation on python compiled with --with-unicode=ucs4 option. Reported by: Son, Kyung-uk <vv...@ch...> Revision Changes Path 1.6 +3 -0 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- CHANGES 20 Jun 2003 17:22:59 -0000 1.5 +++ CHANGES 1 Jul 2003 08:55:11 -0000 1.6 @@ -22,3 +22,6 @@ *) Enable utf-8 codec encode and decode iso-10646-2 characters using surrogate pair. + *) Fixed gb18030 codec's syntax error that disturbs compilation on + python compiled with --with-unicode=ucs4 option. [Son, Kyung-uk] + 1.5 +1 -0 cjkcodecs/THANKS Index: THANKS =================================================================== RCS file: /cvsroot/koco/cjkcodecs/THANKS,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- THANKS 20 Jun 2003 09:46:51 -0000 1.4 +++ THANKS 1 Jul 2003 08:55:11 -0000 1.5 @@ -3,5 +3,6 @@ Jason R. Mastaler <ja...@ma...> MacOS X porting Kazuhiro ABE <abe...@ni...> advice on JIS X 0213 Martin v. Loewis <martin@v.loewis.de> comments on mapping differences +Son, Kyung-uk <vv...@ch...> reported a bug on ucs4 mode Yoshiki Ohshima <Yos...@ac...> advice on JIS X 0213 Young-Sik Won <mon...@dr...> mingw32 compilation fix |
From: Hye-Shik C. <pe...@us...> - 2003-07-01 08:55:13
|
perky 03/07/01 01:55:11 Modified: src _gb18030.c Log: *) Fixed gb18030 codec's syntax error that disturbs compilation on python compiled with --with-unicode=ucs4 option. Reported by: Son, Kyung-uk <vv...@ch...> Revision Changes Path 1.8 +3 -3 cjkcodecs/src/_gb18030.c Index: _gb18030.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_gb18030.c,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- _gb18030.c 9 Jun 2003 10:25:36 -0000 1.7 +++ _gb18030.c 1 Jul 2003 08:55:11 -0000 1.8 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _gb18030.c,v 1.7 2003/06/09 10:25:36 perky Exp $ + * $Id: _gb18030.c,v 1.8 2003/07/01 08:55:11 perky Exp $ */ #include "codeccommon.h" @@ -52,9 +52,9 @@ continue; } #if Py_UNICODE_SIZE == 4 - else if (nc > 0x10FFFF) + else if (c > 0x10FFFF) return 1; - else if (nc >= 0x10000) { + else if (c >= 0x10000) { Py_UNICODE tc = c; RESERVE_OUTBUF(4) |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:56:10
|
perky 03/06/20 10:56:08 Modified: src _utf_8.c Log: utf-8 is described on rfc2279 Revision Changes Path 1.7 +2 -2 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- _utf_8.c 20 Jun 2003 17:22:59 -0000 1.6 +++ _utf_8.c 20 Jun 2003 17:56:08 -0000 1.7 @@ -1,5 +1,5 @@ /* - * _utf_8.c: the UTF-8 codec + * _utf_8.c: the UTF-8 codec (RFC2279) * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.6 2003/06/20 17:22:59 perky Exp $ + * $Id: _utf_8.c,v 1.7 2003/06/20 17:56:08 perky Exp $ */ #include "codeccommon.h" |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:33:28
|
perky 03/06/20 10:33:24 Modified: . setup.py Log: Add '.' to library_dirs on mingw32 compilers to locate libpython*.{def,a} Revision Changes Path 1.22 +9 -3 cjkcodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/setup.py,v retrieving revision 1.21 retrieving revision 1.22 diff -u -r1.21 -r1.22 --- setup.py 10 Jun 2003 07:03:06 -0000 1.21 +++ setup.py 20 Jun 2003 17:33:24 -0000 1.22 @@ -27,13 +27,14 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: setup.py,v 1.21 2003/06/10 07:03:06 perky Exp $ +# $Id: setup.py,v 1.22 2003/06/20 17:33:24 perky Exp $ # import sys from distutils.core import setup, Extension from distutils.command.install import install +LIBDIRS = [] extensions = [] encodings = { 'ja_JP': ['shift_jis', 'cp932', 'euc_jp', 'iso_2022_jp', 'iso_2022_jp_1'], @@ -68,12 +69,16 @@ sys.argv.remove(arg) +if sys.platform and '--compiler=mingw32' in sys.argv: + LIBDIRS.append('.') # libpython23.a and libpython23.def + for loc in locales: if loc: extensions.append(Extension('cjkcodecs.mapdata_'+loc, ['src/maps/mapdata_%s.c'%loc])) for enc in encodings[loc]: - extensions.append(Extension('cjkcodecs._'+enc, ['src/_%s.c'%enc])) + extensions.append(Extension('cjkcodecs._'+enc, ['src/_%s.c'%enc], + library_dirs=LIBDIRS)) class Install(install): def initialize_options (self): @@ -97,7 +102,8 @@ cmdclass = {'install': Install}, packages = ['cjkcodecs'], ext_modules = - [Extension("cjkcodecs.multibytecodec", ["src/multibytecodec.c"])] + [Extension("cjkcodecs.multibytecodec", ["src/multibytecodec.c"], + library_dirs=LIBDIRS)] + extensions ) |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:23:01
|
perky 03/06/20 10:22:59 Modified: . CHANGES Log: Enable utf-8 codec encode and decode iso-10646-2 characters using surrogate pair. Revision Changes Path 1.5 +3 -0 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- CHANGES 20 Jun 2003 09:22:57 -0000 1.4 +++ CHANGES 20 Jun 2003 17:22:59 -0000 1.5 @@ -19,3 +19,6 @@ *) Added a workaround for PyObject_GenericGetAttr to enable compiling with mingw32. [Young-Sik Won] + *) Enable utf-8 codec encode and decode iso-10646-2 characters using + surrogate pair. + |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 17:23:01
|
perky 03/06/20 10:22:59 Modified: src _utf_8.c cjkcommon.h codeccommon.h Log: Enable utf-8 codec encode and decode iso-10646-2 characters using surrogate pair. Revision Changes Path 1.6 +86 -40 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- _utf_8.c 6 Jun 2003 06:26:59 -0000 1.5 +++ _utf_8.c 20 Jun 2003 17:22:59 -0000 1.6 @@ -26,32 +26,85 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.5 2003/06/06 06:26:59 perky Exp $ + * $Id: _utf_8.c,v 1.6 2003/06/20 17:22:59 perky Exp $ */ #include "codeccommon.h" + +#define ENCODESURROGATE(outbuf, c) \ + (*outbuf)[2] = 0x80 | ((c) & 0x3f); \ + (c) = (c) >> 6; (c) |= 0x800; \ + (*outbuf)[1] = 0x80 | ((c) & 0x3f); \ + (c) = (c) >> 6; (c) |= 0xc0; \ + (*outbuf)[0] = (c); + +#if Py_UNICODE_SIZE == 2 +#define HAVE_ENCODER_INIT +ENCODER_INIT(utf_8) +{ + state->i = 0; + return 0; +} + +#define HAVE_ENCODER_RESET +ENCODER_RESET(utf_8) +{ + if (state->i > 0) { + ucs4_t c = (ucs4_t)state->i; + + RESERVE_OUTBUF(3) + ENCODESURROGATE(outbuf, c) + state->i = 0; + NEXT_OUT(3) + } + return 0; +} +#endif /* Py_UNICODE_SIZE == 2 */ + ENCODER(utf_8) { while (inleft > 0) { - Py_UNICODE c = **inbuf; + ucs4_t c = **inbuf; int size; if (c < 0x80) size = 1; else if (c < 0x800) size = 2; + else { #if Py_UNICODE_SIZE == 2 - else size = 3; -#else - else if (c < 0x10000) size = 3; - else if (c < 0x200000) size = 4; - else if (c < 0x4000000) size = 5; - else size = 6; + if (c >> 10 == 0xd800 >> 10 && state->i == 0) { + /* high surrogate */ + state->i = (unsigned short)c; + NEXT_IN(1) + continue; + } else if (c >> 10 == 0xdc00 >> 10 && state->i != 0) { + /* low surrogate */ + c = 0x10000 + (((ucs4_t)(state->i) - 0xd800) << 10) + + (c - 0xdc00); + RESERVE_OUTBUF(6) /* preserve enough space not to lose 
state */ + state->i = 0; + } +#endif + if (c < 0x10000) size = 3; + else if (c < 0x200000) size = 4; + else if (c < 0x4000000) size = 5; + else size = 6; + } + +#if Py_UNICODE_SIZE == 2 + if (state->i > 0) { /* unmatched surrogates */ + ucs4_t sgc = (ucs4_t)state->i; + + RESERVE_OUTBUF(3) /* high surrogates are ..*/ + ENCODESURROGATE(outbuf, sgc) + state->i = 0; + NEXT_OUT(3) + } #endif RESERVE_OUTBUF(size) switch (size) { -#if Py_UNICODE_SIZE == 4 case 6: (*outbuf)[5] = 0x80 | (c & 0x3f); c = c >> 6; @@ -67,7 +120,6 @@ c = c >> 6; c |= 0x10000; /* FALLTHROUGH */ -#endif case 3: (*outbuf)[2] = 0x80 | (c & 0x3f); c = c >> 6; @@ -122,10 +174,8 @@ | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) } else if (c < 0xf8) { -#if Py_UNICODE_SIZE == 2 - return 4; -#else unsigned char c2, c3, c4; + ucs4_t code; RESERVE_INBUF(4) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -134,17 +184,15 @@ (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c >= 0xf1 || c2 >= 0x90))) return 4; - **outbuf = ((Py_UNICODE)(c & 0x07) << 18) - | ((Py_UNICODE)(c2 ^ 0x80) << 12) - | ((Py_UNICODE)(c3 ^ 0x80) << 6) - | (Py_UNICODE)(c4 ^ 0x80); - NEXT(4, 1) -#endif + code = ((ucs4_t)(c & 0x07) << 18) + | ((ucs4_t)(c2 ^ 0x80) << 12) + | ((ucs4_t)(c3 ^ 0x80) << 6) + | (ucs4_t)(c4 ^ 0x80); + PUTUCS4(code) + NEXT_IN(4) } else if (c < 0xfc) { -#if Py_UNICODE_SIZE == 2 - return 5; -#else unsigned char c2, c3, c4, c5; + ucs4_t code; RESERVE_INBUF(5) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -153,18 +201,16 @@ (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88))) return 5; - **outbuf = ((Py_UNICODE)(c & 0x03) << 24) - | ((Py_UNICODE)(c2 ^ 0x80) << 18) - | ((Py_UNICODE)(c3 ^ 0x80) << 12) - | ((Py_UNICODE)(c4 ^ 0x80) << 6) - | (Py_UNICODE)(c5 ^ 0x80); - NEXT(5, 1) -#endif + code = ((ucs4_t)(c & 0x03) << 24) + | ((ucs4_t)(c2 ^ 0x80) << 18) + | ((ucs4_t)(c3 ^ 0x80) << 12) + | ((ucs4_t)(c4 ^ 0x80) << 6) + | (ucs4_t)(c5 ^ 0x80); + PUTUCS4(code) + NEXT_IN(5) } else if (c < 0xff) { -#if Py_UNICODE_SIZE == 
2 - return 6; -#else unsigned char c2, c3, c4, c5, c6; + ucs4_t code; RESERVE_INBUF(6) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; @@ -175,14 +221,14 @@ (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 && (c >= 0xfd || c2 >= 0x84))) return 6; - **outbuf = ((Py_UNICODE)(c & 0x01) << 30) - | ((Py_UNICODE)(c2 ^ 0x80) << 24) - | ((Py_UNICODE)(c3 ^ 0x80) << 18) - | ((Py_UNICODE)(c4 ^ 0x80) << 12) - | ((Py_UNICODE)(c5 ^ 0x80) << 6) - | (Py_UNICODE)(c6 ^ 0x80); - NEXT(6, 1) -#endif + code = ((ucs4_t)(c & 0x01) << 30) + | ((ucs4_t)(c2 ^ 0x80) << 24) + | ((ucs4_t)(c3 ^ 0x80) << 18) + | ((ucs4_t)(c4 ^ 0x80) << 12) + | ((ucs4_t)(c5 ^ 0x80) << 6) + | (ucs4_t)(c6 ^ 0x80); + PUTUCS4(code) + NEXT_IN(6) } else return 1; } 1.9 +7 -1 cjkcodecs/src/cjkcommon.h Index: cjkcommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/cjkcommon.h,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- cjkcommon.h 19 May 2003 23:07:12 -0000 1.8 +++ cjkcommon.h 20 Jun 2003 17:22:59 -0000 1.9 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: cjkcommon.h,v 1.8 2003/05/19 23:07:12 perky Exp $ + * $Id: cjkcommon.h,v 1.9 2003/06/20 17:22:59 perky Exp $ */ #ifndef _CJKCOMMON_H_ @@ -54,6 +54,12 @@ const struct unim_index *encmap; const struct dbcs_index *decmap; }; + +#ifdef uint32_t +typedef uint32_t ucs4_t; +#else +typedef unsigned int ucs4_t; +#endif #endif 1.13 +14 -1 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- codeccommon.h 6 Jun 2003 06:27:41 -0000 1.12 +++ codeccommon.h 20 Jun 2003 17:22:59 -0000 1.13 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: codeccommon.h,v 1.12 2003/06/06 06:27:41 perky Exp $ + * $Id: codeccommon.h,v 1.13 2003/06/20 17:22:59 perky Exp $ */ #include "Python.h" @@ -106,6 +106,19 @@ (*outbuf)[1] = (unsigned char)(c2); \ (*outbuf)[2] = (unsigned char)(c3); \ (*outbuf)[3] = (unsigned char)(c4); + +#if Py_UNICODE_SIZE == 2 +# define PUTUCS4(c) \ + RESERVE_OUTBUF(2) \ + (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ + (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ + NEXT_OUT(2) +#else +# define PUTUCS4(c) \ + RESERVE_OUTBUF(1) \ + **outbuf = (Py_UNICODE)(c); \ + NEXT_OUT(1) +#endif #define _TRYMAP_ENC(m, assi, val) \ if ((m)->map != NULL && (val) >= (m)->bottom && \ |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:46:53
|
perky 03/06/20 02:46:52 Modified: . THANKS Log: Add a table head. Revision Changes Path 1.4 +2 -0 cjkcodecs/THANKS Index: THANKS =================================================================== RCS file: /cvsroot/koco/cjkcodecs/THANKS,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- THANKS 20 Jun 2003 09:44:15 -0000 1.3 +++ THANKS 20 Jun 2003 09:46:51 -0000 1.4 @@ -1,3 +1,5 @@ + Thanks to for + Jason R. Mastaler <ja...@ma...> MacOS X porting Kazuhiro ABE <abe...@ni...> advice on JIS X 0213 Martin v. Loewis <martin@v.loewis.de> comments on mapping differences |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:44:16
|
perky 03/06/20 02:44:15 Modified: . THANKS Log: Thanks to Jason R. Mastaler and Martin v. Loewis for their favors. Revision Changes Path 1.3 +2 -0 cjkcodecs/THANKS Index: THANKS =================================================================== RCS file: /cvsroot/koco/cjkcodecs/THANKS,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- THANKS 11 Jun 2003 09:14:10 -0000 1.2 +++ THANKS 20 Jun 2003 09:44:15 -0000 1.3 @@ -1,3 +1,5 @@ +Jason R. Mastaler <ja...@ma...> MacOS X porting Kazuhiro ABE <abe...@ni...> advice on JIS X 0213 +Martin v. Loewis <martin@v.loewis.de> comments on mapping differences Yoshiki Ohshima <Yos...@ac...> advice on JIS X 0213 Young-Sik Won <mon...@dr...> mingw32 compilation fix |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:22:59
|
perky 03/06/20 02:22:58 Modified: . CHANGES Log: Add a workaround for PyObject_GenericGetAttr to enable compiling with mingw32. Revision Changes Path 1.4 +3 -0 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- CHANGES 20 Jun 2003 09:04:52 -0000 1.3 +++ CHANGES 20 Jun 2003 09:22:57 -0000 1.4 @@ -16,3 +16,6 @@ *) Fixed a bug that cp932 codec couldn't decode half-width katakana. + *) Added a workaround for PyObject_GenericGetAttr to enable compiling + with mingw32. [Young-Sik Won] + |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:22:59
|
perky 03/06/20 02:22:58 Modified: src multibytecodec_compat.h Log: Add a workaround for PyObject_GenericGetAttr to enable compiling with mingw32. Revision Changes Path 1.3 +10 -2 cjkcodecs/src/multibytecodec_compat.h Index: multibytecodec_compat.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec_compat.h,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- multibytecodec_compat.h 6 Jun 2003 06:27:42 -0000 1.2 +++ multibytecodec_compat.h 20 Jun 2003 09:22:58 -0000 1.3 @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: multibytecodec_compat.h,v 1.2 2003/06/06 06:27:42 perky Exp $ + * $Id: multibytecodec_compat.h,v 1.3 2003/06/20 09:22:58 perky Exp $ */ /* We don't support 2.0 and older */ @@ -62,7 +62,15 @@ #else # define OLD_GETATTR_DEF(prefix) # define GETATTR_FUNC(prefix) 0 -# define GETATTRO_FUNC(prefix) PyObject_GenericGetAttr +# ifdef __MINGW32__ +__inline static PyObject* __dummy_getattro(PyObject* self, PyObject* args) +{ + return PyObject_GenericGetAttr(self, args); +} +# define GETATTRO_FUNC(prefix) __dummy_getattro +# else +# define GETATTRO_FUNC(prefix) PyObject_GenericGetAttr +# endif #endif /* |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:12:22
|
perky 03/06/20 02:12:20 Modified: tests test_encoding_cp932.py Log: 0x80 is UNDEFINED valid single byte now. Revision Changes Path 1.5 +5 -5 cjkcodecs/tests/test_encoding_cp932.py Index: test_encoding_cp932.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_encoding_cp932.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_encoding_cp932.py 6 Jun 2003 05:55:30 -0000 1.4 +++ test_encoding_cp932.py 20 Jun 2003 09:12:20 -0000 1.5 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: test_encoding_cp932.py,v 1.4 2003/06/06 05:55:30 perky Exp $ +# $Id: test_encoding_cp932.py,v 1.5 2003/06/20 09:12:20 perky Exp $ # from test import test_support @@ -39,11 +39,11 @@ tstring = test_multibytecodec_support.load_teststring('shift_jis') errortests = ( # invalid bytes - ("abc\x80\x80\x82\x84", "strict", None), + ("abc\x81\x00\x81\x00\x82\x84", "strict", None), ("abc\xf8", "strict", None), - ("abc\x80\x80\x82\x84", "replace", u"abc\ufffd\uff44"), - ("abc\x80\x80\x82\x84\x88", "replace", u"abc\ufffd\uff44\ufffd"), - ("abc\x80\x80\x82\x84", "ignore", u"abc\uff44"), + ("abc\x81\x00\x82\x84", "replace", u"abc\ufffd\uff44"), + ("abc\x81\x00\x82\x84\x88", "replace", u"abc\ufffd\uff44\ufffd"), + ("abc\x81\x00\x82\x84", "ignore", u"abc\uff44"), # sjis vs cp932 ("\\\x7e", "replace", u"\\\x7e"), ("\x81\x5f\x81\x61\x81\x7c", "replace", u"\uff3c\u2225\uff0d"), |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:53 Modified: src/maps map_big5.h map_cp950ext.h Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.4 +0 -0 cjkcodecs/src/maps/map_big5.h Index: map_big5.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/maps/map_big5.h,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- map_big5.h 19 Jun 2003 17:49:01 -0000 1.3 +++ map_big5.h 20 Jun 2003 09:04:53 -0000 1.4 @@ -1,5 +1,5 @@ /* - * $Id: map_big5.h,v 1.3 2003/06/19 17:49:01 perky Exp $ + * $Id: map_big5.h,v 1.4 2003/06/20 09:04:53 perky Exp $ */ static const Py_UNICODE __big5_decmap[16702] = { 1.5 +14 -4 cjkcodecs/src/maps/map_cp950ext.h Index: map_cp950ext.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/maps/map_cp950ext.h,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- map_cp950ext.h 19 Jun 2003 17:49:01 -0000 1.4 +++ map_cp950ext.h 20 Jun 2003 09:04:53 -0000 1.5 @@ -1,5 +1,5 @@ /* - * $Id: map_cp950ext.h,v 1.4 2003/06/19 17:49:01 perky Exp $ + * $Id: map_cp950ext.h,v 1.5 2003/06/20 09:04:53 perky Exp $ */ static const Py_UNICODE __cp950ext_decmap[224] = { @@ -292,7 +292,7 @@ /* 0xFF */ {0, 0, 0}, }; -static const DBCHAR __cp950ext_encmap[502] = { +static const DBCHAR __cp950ext_encmap[581] = { 0xa1c2, 0xa145, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, @@ -338,15 +338,17 @@ 0xf9dc, 0xf9da, 0xf9d6, 0xf9db, 0xf9d8, 
0xf9d7, 0xa14e, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, - NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, 0xa242, 0xa1e3, NOCHAR, + NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, 0xa242, 0xa1fe, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, + NOCHAR, NOCHAR, NOCHAR, 0xa240, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, + NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, 0xa1e3, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, @@ -355,7 +357,15 @@ NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, NOCHAR, - 0xa246, 0xa247, NOCHAR, NOCHAR, NOCHAR, 0xaxa246, + 0xa247, NOCHAR, NOCHAR, NOCHAR, 0xa244, }; static const struct unim_index cp950ext_encmap[256] = { @@ -614,6 +624,6 @@ /* 0xFC */ {0, 0, 0}, /* 0xFD */ {0, 0, 0}, /* 0xFE */ {__cp950ext_encmap+342, 0x51, 0x68}, -/* 0xFF */ {__cp950ext_encmap+366, 0x5e, 0xe5}, +/* 0xFF */ {__cp950ext_encmap+366, 0x0f, 0xe5}, }; |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:52 Modified: . CHANGES NOTES.big5 Added: . NOTES.cp932 Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.3 +11 -0 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- CHANGES 19 Jun 2003 19:12:58 -0000 1.2 +++ CHANGES 20 Jun 2003 09:04:52 -0000 1.3 @@ -5,3 +5,14 @@ *) Fixed a bug that JIS X 0201 routine doesn't encode and decode 0x7f. + *) Tweaked some mapping for cp932 and cp950 to make more consistency + with MS Windows. + - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, + 0xfe, 0xff (documented on NOTES.cp932) + - CP950: Changed encode mappings to another more popular for + duplicated unicode points: 5341 -> A451, 5345 -> A4CA + + *) A unittest for big5 mapping is added. + + *) Fixed a bug that cp932 codec couldn't decode half-width katakana. + 1.3 +11 -10 cjkcodecs/NOTES.big5 Index: NOTES.big5 =================================================================== RCS file: /cvsroot/koco/cjkcodecs/NOTES.big5,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- NOTES.big5 19 Jun 2003 18:02:11 -0000 1.2 +++ NOTES.big5 20 Jun 2003 09:04:52 -0000 1.3 @@ -1,15 +1,16 @@ big5 codec maps the following characters as cp950 does rather than conforming Unicode.org's that maps to 0xFFFD. 
-BIG5 Unicode Description + BIG5 Unicode Description -0xA15A 0x2574 SPACING UNDERSCORE -0xA1C3 0xFFE3 SPACING HEAVY OVERSCORE -0xA1C5 0x02CD SPACING HEAVY UNDERSCORE -0xA1FE 0xFF0F LT DIAG UP RIGHT TO LOW LEFT -0xA240 0xFF3C LT DIAG UP LEFT TO LOW RIGHT -0xA2CC 0x5341 HANGZHOU NUMERAL TEN -0xA2CE 0x5345 HANGZHOU NUMERAL THIRTY + 0xA15A 0x2574 SPACING UNDERSCORE + 0xA1C3 0xFFE3 SPACING HEAVY OVERSCORE + 0xA1C5 0x02CD SPACING HEAVY UNDERSCORE + 0xA1FE 0xFF0F LT DIAG UP RIGHT TO LOW LEFT + 0xA240 0xFF3C LT DIAG UP LEFT TO LOW RIGHT + 0xA2CC 0x5341 HANGZHOU NUMERAL TEN + 0xA2CE 0x5345 HANGZHOU NUMERAL THIRTY -Because unicode 0x5341, 0x5345 is mapped to another big5 codes already, -a roundtrip compatibility is not guaranteed for them. +Because unicode 0x5341, 0x5345, 0xFF0F, 0xFF3C is mapped to another +big5 codes already, a roundtrip compatibility is not guaranteed for +them. 1.1 cjkcodecs/NOTES.cp932 Index: NOTES.cp932 =================================================================== To conform to Windows's real mapping, cp932 codec maps the following codepoints in addition of the official cp932 mapping. CP932 Unicode Description 0x80 0x80 UNDEFINED 0xA0 0xF8F0 UNDEFINED 0xFD 0xF8F1 UNDEFINED 0xFE 0xF8F2 UNDEFINED 0xFF 0xF8F3 UNDEFINED |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:53 Modified: tests test_mapping_cp932.py test_mapping_cp950.py test_multibytecodec_support.py Added: tests test_mapping_big5.py Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.4 +10 -1 cjkcodecs/tests/test_mapping_cp932.py Index: test_mapping_cp932.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_mapping_cp932.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- test_mapping_cp932.py 6 Jun 2003 05:55:31 -0000 1.3 +++ test_mapping_cp932.py 20 Jun 2003 09:04:53 -0000 1.4 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# -# $Id: test_mapping_cp932.py,v 1.3 2003/06/06 05:55:31 perky Exp $ +# $Id: test_mapping_cp932.py,v 1.4 2003/06/20 09:04:53 perky Exp $ # from test import test_support @@ -43,6 +43,15 @@ unittest.TestCase): encoding = 'cjkcodecs.cp932' mapfilename = 'CP932.TXT' + supmaps = [ + ('\x80', u'\u0080'), + ('\xa0', u'\uf8f0'), + ('\xfd', u'\uf8f1'), + ('\xfe', u'\uf8f2'), + ('\xff', u'\uf8f3'), + ] + for i in range(0xa1, 0xe0): + supmaps.append((chr(i), unichr(i+0xfec0))) def test_main(): suite = unittest.TestSuite() 1.4 +5 -1 cjkcodecs/tests/test_mapping_cp950.py Index: test_mapping_cp950.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_mapping_cp950.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- test_mapping_cp950.py 6 Jun 2003 05:55:31 -0000 1.3 +++ test_mapping_cp950.py 20 Jun 2003 09:04:53 -0000 1.4 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: test_mapping_cp950.py,v 1.3 2003/06/06 05:55:31 perky Exp $ +# $Id: test_mapping_cp950.py,v 1.4 2003/06/20 09:04:53 perky Exp $ # from test import test_support @@ -44,6 +44,10 @@ unittest.TestCase): encoding = 'cjkcodecs.cp950' mapfilename = 'CP950.TXT' + pass_enctest = [ + ('\xa2\xcc', u'\u5341'), + ('\xa2\xce', u'\u5345'), + ] def test_main(): suite = unittest.TestSuite() 1.6 +16 -7 cjkcodecs/tests/test_multibytecodec_support.py Index: test_multibytecodec_support.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_multibytecodec_support.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- test_multibytecodec_support.py 29 May 2003 09:17:38 -0000 1.5 +++ test_multibytecodec_support.py 20 Jun 2003 09:04:53 -0000 1.6 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# -# $Id: test_multibytecodec_support.py,v 1.5 2003/05/29 09:17:38 perky Exp $ +# $Id: test_multibytecodec_support.py,v 1.6 2003/06/20 09:04:53 perky Exp $ # import sys, codecs, os.path @@ -155,8 +155,9 @@ class TestBase_Mapping(unittest.TestCase): pass_enctest = [] pass_dectest = [] + supmaps = [] - def test_mapping(self): + def test_mapping_file(self): unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+')))) urt_wa = {} @@ -180,15 +181,23 @@ csetch = chr(csetval >> 8) + chr(csetval & 0xff) else: continue - unich = unichrs(data[1]) + + unich = unichrs(data[1]) if ord(unich) == 0xfffd or urt_wa.has_key(unich): continue urt_wa[unich] = csetch - if (csetch, unich) not in self.pass_enctest: - self.assertEqual(unich.encode(self.encoding), csetch) - if (csetch, unich) not in self.pass_dectest: - self.assertEqual(unicode(csetch, self.encoding), unich) + self._testpoint(csetch, unich) + + def test_mapping_supplemental(self): + for mapping in self.supmaps: + self._testpoint(*mapping) + + def _testpoint(self, csetch, unich): + if (csetch, unich) not in self.pass_enctest: + self.assertEqual(unich.encode(self.encoding), csetch) + if (csetch, unich) not in self.pass_dectest: + self.assertEqual(unicode(csetch, self.encoding), unich) def load_teststring(encoding): etxt = open(os.path.join('sampletexts', encoding) + '.txt').read() 1.1 cjkcodecs/tests/test_mapping_big5.py Index: test_mapping_big5.py =================================================================== #!/usr/bin/env python # # test_mapping_big5.py: Mapping test for BIG5 codec # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # $Id: test_mapping_big5.py,v 1.1 2003/06/20 09:04:53 perky Exp $ # from test import test_support import test_multibytecodec_support import sys, codecs, os import unittest if not os.path.exists('BIG5.TXT'): raise test_support.TestSkipped( 'BIG5.TXT not found, download from http://www.unicode.' 'org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT') class TestBIG5Map(test_multibytecodec_support.TestBase_Mapping, unittest.TestCase): encoding = 'cjkcodecs.big5' mapfilename = 'BIG5.TXT' def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(TestBIG5Map)) test_support.run_suite(suite) if __name__ == "__main__": test_main() # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:53 Modified: src _cp932.c Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.4 +24 -3 cjkcodecs/src/_cp932.c Index: _cp932.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_cp932.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- _cp932.c 9 Jun 2003 10:25:36 -0000 1.3 +++ _cp932.c 20 Jun 2003 09:04:52 -0000 1.4 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _cp932.c,v 1.3 2003/06/09 10:25:36 perky Exp $ + * $Id: _cp932.c,v 1.4 2003/06/20 09:04:52 perky Exp $ */ #include "codeccommon.h" @@ -43,7 +43,7 @@ DBCHAR code; unsigned char c1, c2; - if (c < 0x80) { + if (c <= 0x80) { RESERVE_OUTBUF(1) **outbuf = (unsigned char)c; NEXT(1, 1) @@ -53,6 +53,15 @@ **outbuf = (unsigned char)(c - 0xfec0); NEXT(1, 1) continue; + } else if (c >= 0xf8f0 && c <= 0xf8f3) { + /* Windows compatibility */ + RESERVE_OUTBUF(1) + if (c == 0xf8f0) + **outbuf = 0xa0; + else + **outbuf = (unsigned char)(c - 0xfef1 + 0xfd); + NEXT(1, 1) + continue; } UCS4INVALID(c) @@ -93,8 +102,20 @@ unsigned char c = **inbuf, c2; RESERVE_OUTBUF(1) - if (c < 0x80) { + if (c <= 0x80) { **outbuf = c; + NEXT(1, 1) + continue; + } else if (c >= 0xa0 && c <= 0xdf) { + if (c == 0xa0) + **outbuf = 0xf8f0; /* half-width katakana */ + else + **outbuf = 0xfec0 + c; + NEXT(1, 1) + continue; + } else if (c >= 0xfd/* && c <= 0xff*/) { + /* Windows compatibility */ + **outbuf = 0xf8f1 - 0xfd + c; NEXT(1, 1) continue; } |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:53 Modified: tools genmap_tchinese.py Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.5 +21 -14 cjkcodecs/tools/genmap_tchinese.py Index: genmap_tchinese.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tools/genmap_tchinese.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- genmap_tchinese.py 19 Jun 2003 17:49:01 -0000 1.4 +++ genmap_tchinese.py 20 Jun 2003 09:04:53 -0000 1.5 @@ -26,7 +26,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: genmap_tchinese.py,v 1.4 2003/06/19 17:49:01 perky Exp $ +# $Id: genmap_tchinese.py,v 1.5 2003/06/20 09:04:53 perky Exp $ # from genmap_support import * @@ -68,28 +68,35 @@ big5decmap[bcode >> 8][bcode & 0xff] = ucode big5encmap, cp950encmap = {}, {} +for c1, m in big5decmap.items(): + for c2, code in m.items(): + big5encmap.setdefault(code >> 8, {}) + if not big5encmap[code >> 8].has_key(code & 0xff): + big5encmap[code >> 8][code & 0xff] = c1 << 8 | c2 for c1, m in cp950decmap.items(): for c2, code in m.items(): - if not (not big5decmap.has_key(c1) or not big5decmap[c1].has_key(c2) - or big5decmap[c1][c2] != code): - del cp950decmap[c1][c2] cp950encmap.setdefault(code >> 8, {}) if not cp950encmap[code >> 8].has_key(code & 0xff): cp950encmap[code >> 8][code & 0xff] = c1 << 8 | c2 -for c1, m in big5decmap.items(): - for c2, code in m.items(): - big5encmap.setdefault(code >> 8, {}) - big5encmap[code >> 8][code & 0xff] = c1 << 8 | c2 - if (cp950encmap.has_key(code >> 8) and - 
cp950encmap[code >> 8].has_key(code & 0xff) and - cp950encmap[code >> 8][code & 0xff] == c1 << 8 | c2): - del cp950encmap[code >> 8][code & 0xff] - if not cp950encmap[code >> 8]: - del cp950encmap[code >>8] # fix unicode->big5 duplicated mapping priority +big5encmap[0xFF][0x0F] = 0xA241 +big5encmap[0xFF][0x3C] = 0xA242 big5encmap[0x53][0x41] = 0xA451 big5encmap[0x53][0x45] = 0xA4CA +cp950encmap[0x53][0x41] = 0xA451 +cp950encmap[0x53][0x45] = 0xA4CA + +for c1, m in cp950encmap.items(): + for c2, code in m.items(): + if (big5encmap.has_key(c1) and big5encmap[c1].has_key(c2) + and big5encmap[c1][c2] == code): + del cp950encmap[c1][c2] +for c1, m in cp950decmap.items(): + for c2, code in m.items(): + if (big5decmap.has_key(c1) and big5decmap[c1].has_key(c2) + and big5decmap[c1][c2] == code): + del cp950decmap[c1][c2] omap = open('map_big5.h', 'w') printcopyright(omap) |
From: Hye-Shik C. <pe...@us...> - 2003-06-19 19:13:00
|
perky 03/06/19 12:12:58 Modified: . CHANGES Log: Fix a bug where the JIS X 0201 routine didn't encode or decode 0x7f. Revision Changes Path 1.2 +3 -1 cjkcodecs/CHANGES Index: CHANGES =================================================================== RCS file: /cvsroot/koco/cjkcodecs/CHANGES,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- CHANGES 19 Jun 2003 17:49:01 -0000 1.1 +++ CHANGES 19 Jun 2003 19:12:58 -0000 1.2 @@ -1,5 +1,7 @@ Changes with CJKCodecs 1.0 - *) Changes a few characters of a big5 codepoint mapping to cp950's + *) Changed a few characters of a big5 codepoint mapping to cp950's rather than 0xfffd. (documented on NOTES.big5) + + *) Fixed a bug that JIS X 0201 routine doesn't encode and decode 0x7f. |
From: Hye-Shik C. <pe...@us...> - 2003-06-19 19:13:00
|
perky 03/06/19 12:12:59 Modified: src/maps alg_jisx0201.h Log: Fix a bug where the JIS X 0201 routine didn't encode or decode 0x7f. Revision Changes Path 1.5 +4 -3 cjkcodecs/src/maps/alg_jisx0201.h Index: alg_jisx0201.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/maps/alg_jisx0201.h,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- alg_jisx0201.h 5 Jun 2003 10:20:25 -0000 1.4 +++ alg_jisx0201.h 19 Jun 2003 19:12:59 -0000 1.5 @@ -1,7 +1,7 @@ -/* $Id: alg_jisx0201.h,v 1.4 2003/06/05 10:20:25 perky Exp $ */ +/* $Id: alg_jisx0201.h,v 1.5 2003/06/19 19:12:59 perky Exp $ */ #define JISX0201_R_ENCODE(c, assi) \ - if ((c) < 0x5c) (assi) = (c); \ + if ((c) < 0x5c || (c) == 0x7f) (assi) = (c);\ else if ((c) > 0x5c && (c) < 0x7e) \ (assi) = (c); \ else if ((c) == 0x00a5) (assi) = 0x5c; \ @@ -17,7 +17,8 @@ if ((c) < 0x5c) (assi) = (c); \ else if ((c) == 0x5c) (assi) = 0x00a5; \ else if ((c) < 0x7e) (assi) = (c); \ - else if ((c) == 0x7e) (assi) = 0x203e; + else if ((c) == 0x7e) (assi) = 0x203e; \ + else if ((c) == 0x7f) (assi) = 0x7f; #define JISX0201_K_DECODE(c, assi) \ if ((c) >= 0xa1 && (c) <= 0xdf) \ (assi) = 0xfec0 + (c); |
From: Hye-Shik C. <pe...@us...> - 2003-06-19 18:02:14
|
perky 03/06/19 11:02:12 Modified: . NOTES.big5 Log: Describe more clearly Revision Changes Path 1.2 +1 -1 cjkcodecs/NOTES.big5 Index: NOTES.big5 =================================================================== RCS file: /cvsroot/koco/cjkcodecs/NOTES.big5,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- NOTES.big5 19 Jun 2003 17:49:01 -0000 1.1 +++ NOTES.big5 19 Jun 2003 18:02:11 -0000 1.2 @@ -1,5 +1,5 @@ big5 codec maps the following characters as cp950 does rather than -following Unicode.org's mapping. +conforming Unicode.org's that maps to 0xFFFD. BIG5 Unicode Description |