koco-cvs Mailing List for Python Korean Codecs (Page 7)
Brought to you by:
perky
You can subscribe to this list here.
2002 | Jan | Feb | Mar | Apr (88) | May (5) | Jun | Jul (27) | Aug | Sep | Oct (5) | Nov | Dec |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2003 | Jan (77) | Feb (3) | Mar | Apr (22) | May (123) | Jun (80) | Jul (83) | Aug | Sep | Oct | Nov | Dec |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 09:33:31
|
perky 03/06/02 02:25:17 Modified: src/maps alg_jisx0201.h Log: Split the Roman and Katakana halves. Revision Changes Path 1.2 +14 -6 cjkcodecs/src/maps/alg_jisx0201.h Index: alg_jisx0201.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/maps/alg_jisx0201.h,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- alg_jisx0201.h 20 May 2003 11:33:32 -0000 1.1 +++ alg_jisx0201.h 2 Jun 2003 09:25:17 -0000 1.2 @@ -1,18 +1,26 @@ -/* $Id: alg_jisx0201.h,v 1.1 2003/05/20 11:33:32 perky Exp $ */ +/* $Id: alg_jisx0201.h,v 1.2 2003/06/02 09:25:17 perky Exp $ */ -#define JISX0201_ENCODE(c, assi) \ +#define JISX0201_R_ENCODE(c, assi) \ if ((c) < 0x5c) (assi) = (c); \ else if ((c) > 0x5c && (c) < 0x7e) \ (assi) = (c); \ else if ((c) == 0x00a5) (assi) = 0x5c; \ - else if ((c) == 0x203e) (assi) = 0x7e; \ - else if ((c) >= 0xff61 && (c) <= 0xff9f) \ + else if ((c) == 0x203e) (assi) = 0x7e; +#define JISX0201_K_ENCODE(c, assi) \ + if ((c) >= 0xff61 && (c) <= 0xff9f) \ (assi) = (c) - 0xfec0; +#define JISX0201_ENCODE(c, assi) \ + JISX0201_R_ENCODE(c, assi) \ + else JISX0201_K_ENCODE(c, assi) -#define JISX0201_DECODE(c, assi) \ +#define JISX0201_R_DECODE(c, assi) \ if ((c) < 0x5c) (assi) = (c); \ else if ((c) == 0x5c) (assi) = 0x00a5; \ else if ((c) < 0x7e) (assi) = (c); \ - else if ((c) == 0x7e) (assi) = 0x203e; \ + else if ((c) == 0x7e) (assi) = 0x203e; +#define JISX0201_K_DECODE(c, assi) \ else if ((c) >= 0xa1 && (c) <= 0xdf) \ (assi) = 0xfec0 + (c); +#define JISX0201_DECODE(c, assi) \ + JISX0201_R_ENCODE(c, assi) \ + else JISX0201_K_ENCODE(c, assi) |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 09:27:26
|
perky 03/06/02 02:27:25 Modified: src _iso_2022_jp.c Log: Correct a comment. Revision Changes Path 1.2 +2 -2 cjkcodecs/src/_iso_2022_jp.c Index: _iso_2022_jp.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_jp.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _iso_2022_jp.c 2 Jun 2003 09:25:59 -0000 1.1 +++ _iso_2022_jp.c 2 Jun 2003 09:27:25 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _iso_2022_jp.c,v 1.1 2003/06/02 09:25:59 perky Exp $ + * $Id: _iso_2022_jp.c,v 1.2 2003/06/02 09:27:25 perky Exp $ */ #include "codeccommon.h" @@ -45,7 +45,7 @@ return 0; } -/* ISO-2022-JP don't changes designations instead of shifting-out */ +/* ISO-2022-JP changes designations instead of shifting-out */ ENCODER(iso_2022_jp) { |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 09:26:00
|
perky 03/06/02 02:25:59 Added: src _iso_2022_jp.c Log: Add iso-2022-jp codec. Revision Changes Path 1.1 cjkcodecs/src/_iso_2022_jp.c Index: _iso_2022_jp.c =================================================================== /* * _iso_2022_jp.c: the ISO-2022-JP codec (RFC1468) * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _iso_2022_jp.c,v 1.1 2003/06/02 09:25:59 perky Exp $ */ #include "codeccommon.h" #include "iso2022common.h" #include "maps/alg_jisx0201.h" ENCMAP(jisxcommon) DECMAP(jisx0208) #define HAVE_ENCODER_INIT ENCODER_INIT(iso_2022_jp) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } /* ISO-2022-JP don't changes designations instead of shifting-out */ ENCODER(iso_2022_jp) { while (inleft > 0) { Py_UNICODE c = **inbuf; DBCHAR code; if (c < 0x80) { switch (STATE_GETG0(state)) { case CHARSET_ASCII: PAVE1(c) NEXT(1, 1) break; case CHARSET_JISX0201_R: JISX0201_R_ENCODE(c, code) else { /* FALLTHROUGH (yay!) */ default: PAVE3(ESC, '(', 'B') NEXT_OUT(3) STATE_SETG0(state, CHARSET_ASCII) code = c; } PAVE1(code) NEXT(1, 1) break; } if (c == '\n') STATE_CLEARFLAG(state, F_SHIFTED) } else UCS4INVALID(c) else { unsigned char charset; charset = STATE_GETG0(state); if (charset == CHARSET_JISX0201_R) { code = DBCINV; JISX0201_R_ENCODE(c, code) if (code != DBCINV) { PAVE1(code) NEXT(1, 1) continue; } } TRYMAP_ENC(jisxcommon, code, c) { if (code & 0x8000) /* MSB set: JIS X 0212 */ return 1; if (charset != CHARSET_JISX0208) { PAVE3(ESC, '$', 'B') STATE_SETG0(state, CHARSET_JISX0208) NEXT_OUT(3) } PAVE2(code >> 8, code & 0xff) NEXT(1, 2) } else { JISX0201_R_ENCODE(c, code) else return 1; /* if (charset == CHARSET_JISX0201_R) : already checked */ PAVE4(ESC, '(', 'J', code) STATE_SETG0(state, CHARSET_JISX0201_R) NEXT(1, 4) } } } return 0; } #define HAVE_DECODER_INIT DECODER_INIT(iso_2022_jp) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } #define HAVE_DECODER_RESET DECODER_RESET(iso_2022_jp) { STATE_CLEARFLAG(state, F_SHIFTED) return 0; } DECODER(iso_2022_jp) { while (inleft > 0) { unsigned char c = **inbuf; if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { /* ESC throughout mode: for non-iso2022 escape sequences */ RESERVE_OUTBUF(1) **outbuf = c; /* assume as ISO-8859-1 */ NEXT(1, 1) if 
(IS_ESCEND(c)) { STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) } continue; } switch (c) { case ESC: RESERVE_INBUF(2) if (IS_ISO2022ESC((*inbuf)[1])) { int eslen; eslen = iso2022esclen(*inbuf, inleft); if (eslen < 0) return eslen == MBERR_INTERNAL ? 1 : eslen; if (eslen == 3) { unsigned char charset; if ((*inbuf)[1] == '$') { if ((*inbuf)[2] == '@' || (*inbuf)[2] == 'B') { charset = (*inbuf)[2] | CHARSET_DOUBLEBYTE; STATE_SETG0(state, charset); } else return 3; } else { if ((*inbuf)[2] == 'B' || (*inbuf)[2] == 'J') charset = (*inbuf)[2]; else return 3; if ((*inbuf)[1] == '(') { STATE_SETG0(state, charset) } else if ((*inbuf)[1] == ')') { STATE_SETG1(state, charset) } else return 3; } } else return eslen; NEXT_IN(eslen) } else { STATE_SETFLAG(state, F_ESCTHROUGHOUT) **outbuf = ESC; NEXT(1, 1) } break; case SI: STATE_CLEARFLAG(state, F_SHIFTED) NEXT_IN(1) break; case SO: STATE_SETFLAG(state, F_SHIFTED) NEXT_IN(1) break; case '\n': STATE_CLEARFLAG(state, F_SHIFTED) /* FALLTHROUGH */ case SP: /* FALLTHROUGH */ case DEL: RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) break; default: if ((c & 0x7f) < 0x20) { /* C0 and C1 */ RESERVE_OUTBUF(1) **outbuf = c & 0x7f; NEXT(1, 1) } else { unsigned char charset; if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ charset = STATE_GETG0(state); else /* G1 */ charset = STATE_GETG1(state); if (charset & CHARSET_DOUBLEBYTE) { /* all double byte character sets are in JIS X 0208 here. * this means that we don't distinguish :1978 from :1983. 
*/ RESERVE_OUTBUF(1) TRYMAP_DEC(jisx0208, **outbuf, c & 0x7f, (*inbuf)[1] & 0x7f) { NEXT(2, 1) } else return 2; } else if (charset == CHARSET_ASCII) { RESERVE_OUTBUF(1) **outbuf = c & 0x7f; NEXT(1, 1) } else if (charset == CHARSET_JISX0201_R) { RESERVE_OUTBUF(1) JISX0201_R_DECODE(c & 0x7f, **outbuf) else return 1; NEXT(1, 1) } else return MBERR_INTERNAL; } } } return 0; } #include "codecentry.h" BEGIN_CODEC_REGISTRY(iso_2022_jp) MAPOPEN(ja_JP) IMPORTMAP_DEC(jisx0208) IMPORTMAP_ENC(jisxcommon) MAPCLOSE() END_CODEC_REGISTRY(iso_2022_jp) /* * ex: ts=8 sts=4 et */ |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 09:25:59
|
perky 03/06/02 02:25:59 Modified: . setup.py Log: Add iso-2022-jp codec. Revision Changes Path 1.18 +2 -2 cjkcodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/setup.py,v retrieving revision 1.17 retrieving revision 1.18 diff -u -r1.17 -r1.18 --- setup.py 2 Jun 2003 07:39:20 -0000 1.17 +++ setup.py 2 Jun 2003 09:25:58 -0000 1.18 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: setup.py,v 1.17 2003/06/02 07:39:20 perky Exp $ +# $Id: setup.py,v 1.18 2003/06/02 09:25:58 perky Exp $ # import sys @@ -36,7 +36,7 @@ extensions = [] encodings = { -'ja_JP': ['shift_jis', 'cp932', 'euc_jp'], +'ja_JP': ['shift_jis', 'cp932', 'euc_jp', 'iso_2022_jp'], 'ko_KR': ['euc_kr', 'cp949', 'johab', 'iso_2022_kr'], 'zh_CN': ['gb2312', 'gbk', 'gb18030', 'hz'], 'zh_TW': ['big5', 'cp950'], |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 09:24:52
|
perky 03/06/02 02:24:51 Modified: src codeccommon.h Log: Add PAVE* macros for encoders. Revision Changes Path 1.11 +20 -1 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- codeccommon.h 31 May 2003 11:50:19 -0000 1.10 +++ codeccommon.h 2 Jun 2003 09:24:50 -0000 1.11 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: codeccommon.h,v 1.10 2003/05/31 11:50:19 perky Exp $ + * $Id: codeccommon.h,v 1.11 2003/06/02 09:24:50 perky Exp $ */ #include "Python.h" @@ -86,6 +86,25 @@ #define RESERVE_OUTBUF(n) \ if (outleft < (n)) \ return MBERR_TOOSMALL; + +#define PAVE1(c1) \ + RESERVE_OUTBUF(1) \ + (*outbuf)[0] = (unsigned char)(c1); +#define PAVE2(c1, c2) \ + RESERVE_OUTBUF(2) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); +#define PAVE3(c1, c2, c3) \ + RESERVE_OUTBUF(3) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); \ + (*outbuf)[2] = (unsigned char)(c3); +#define PAVE4(c1, c2, c3, c4) \ + RESERVE_OUTBUF(4) \ + (*outbuf)[0] = (unsigned char)(c1); \ + (*outbuf)[1] = (unsigned char)(c2); \ + (*outbuf)[2] = (unsigned char)(c3); \ + (*outbuf)[3] = (unsigned char)(c4); #define _TRYMAP_ENC(m, assi, val) \ if ((m)->map != NULL && (val) >= (m)->bottom && \ |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 08:43:22
|
perky 03/06/02 01:43:21 Modified: src _iso_2022_kr.c Log: Decode correctly for ASCII charset when G1-assigned. Revision Changes Path 1.3 +2 -2 cjkcodecs/src/_iso_2022_kr.c Index: _iso_2022_kr.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_kr.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- _iso_2022_kr.c 2 Jun 2003 08:16:29 -0000 1.2 +++ _iso_2022_kr.c 2 Jun 2003 08:43:20 -0000 1.3 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _iso_2022_kr.c,v 1.2 2003/06/02 08:16:29 perky Exp $ + * $Id: _iso_2022_kr.c,v 1.3 2003/06/02 08:43:20 perky Exp $ */ #include "codeccommon.h" @@ -221,7 +221,7 @@ return 2; } else { RESERVE_OUTBUF(1) - **outbuf = c; + **outbuf = c & 0x7f; NEXT(1, 1) } } |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 08:16:30
|
perky 03/06/02 01:16:29 Modified: src _iso_2022_kr.c Log: Mention RFC1557. Revision Changes Path 1.2 +2 -2 cjkcodecs/src/_iso_2022_kr.c Index: _iso_2022_kr.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_iso_2022_kr.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _iso_2022_kr.c 2 Jun 2003 07:39:22 -0000 1.1 +++ _iso_2022_kr.c 2 Jun 2003 08:16:29 -0000 1.2 @@ -1,5 +1,5 @@ /* - * _iso_2022_kr.c: the ISO-2022-KR codec + * _iso_2022_kr.c: the ISO-2022-KR codec (RFC1557) * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _iso_2022_kr.c,v 1.1 2003/06/02 07:39:22 perky Exp $ + * $Id: _iso_2022_kr.c,v 1.2 2003/06/02 08:16:29 perky Exp $ */ #include "codeccommon.h" |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 08:07:46
|
perky 03/06/02 01:07:43 Modified: cjkcodecs Makefile aliases.py Removed: cjkcodecs iso_2022_cn.py Log: Remove iso-2022-cn. We can't implement it soon due to the lack of a CNS 11643 mapping. Revision Changes Path 1.8 +2 -2 cjkcodecs/cjkcodecs/Makefile Index: Makefile =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/Makefile,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- Makefile 2 Jun 2003 07:48:18 -0000 1.7 +++ Makefile 2 Jun 2003 08:07:40 -0000 1.8 @@ -25,10 +25,10 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: Makefile,v 1.7 2003/06/02 07:48:18 perky Exp $ +# $Id: Makefile,v 1.8 2003/06/02 08:07:40 perky Exp $ # -GENERIC_ENCODINGS= gb2312 gbk gb18030 hz iso_2022_cn \ +GENERIC_ENCODINGS= gb2312 gbk gb18030 hz \ big5 cp950 \ cp932 shift_jis euc_jp \ iso_2022_jp iso_2022_jp_1 iso_2022_jp_2 \ 1.8 +1 -6 cjkcodecs/cjkcodecs/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/aliases.py,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- aliases.py 2 Jun 2003 07:44:41 -0000 1.7 +++ aliases.py 2 Jun 2003 08:07:42 -0000 1.8 @@ -26,7 +26,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: aliases.py,v 1.7 2003/06/02 07:44:41 perky Exp $ +# $Id: aliases.py,v 1.8 2003/06/02 08:07:42 perky Exp $ # from encodings.aliases import aliases @@ -95,11 +95,6 @@ 'hzgb' : 'cjkcodecs.hz', 'hz_gb' : 'cjkcodecs.hz', 'hz_gb_2312' : 'cjkcodecs.hz', - - # iso_2022_cn codec - 'iso_2022_cn' : 'cjkcodecs.iso_2022_cn', - 'iso2022_cn' : 'cjkcodecs.iso_2022_cn', - 'iso2022cn' : 'cjkcodecs.iso_2022_cn', # iso_2022_jp codec 'iso_2022_jp' : 'cjkcodecs.iso_2022_jp', |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:48:20
|
perky 03/06/02 00:48:19 Modified: cjkcodecs Makefile Log: Add iso-2022-cjk codecs. Revision Changes Path 1.7 +3 -2 cjkcodecs/cjkcodecs/Makefile Index: Makefile =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/Makefile,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- Makefile 2 Jun 2003 07:39:22 -0000 1.6 +++ Makefile 2 Jun 2003 07:48:18 -0000 1.7 @@ -25,12 +25,13 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: Makefile,v 1.6 2003/06/02 07:39:22 perky Exp $ +# $Id: Makefile,v 1.7 2003/06/02 07:48:18 perky Exp $ # -GENERIC_ENCODINGS= gb2312 gbk gb18030 hz \ +GENERIC_ENCODINGS= gb2312 gbk gb18030 hz iso_2022_cn \ big5 cp950 \ cp932 shift_jis euc_jp \ + iso_2022_jp iso_2022_jp_1 iso_2022_jp_2 \ cp949 euc_kr johab iso_2022_kr \ utf_8 |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:44:43
|
perky 03/06/02 00:44:41 Modified: cjkcodecs aliases.py Added: cjkcodecs iso_2022_cn.py iso_2022_jp.py iso_2022_jp_1.py iso_2022_jp_2.py Log: Add iso-2022 codecs which will be implemented in a near future. Revision Changes Path 1.7 +21 -1 cjkcodecs/cjkcodecs/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/aliases.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- aliases.py 2 Jun 2003 07:39:22 -0000 1.6 +++ aliases.py 2 Jun 2003 07:44:41 -0000 1.7 @@ -26,7 +26,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: aliases.py,v 1.6 2003/06/02 07:39:22 perky Exp $ +# $Id: aliases.py,v 1.7 2003/06/02 07:44:41 perky Exp $ # from encodings.aliases import aliases @@ -95,6 +95,26 @@ 'hzgb' : 'cjkcodecs.hz', 'hz_gb' : 'cjkcodecs.hz', 'hz_gb_2312' : 'cjkcodecs.hz', + + # iso_2022_cn codec + 'iso_2022_cn' : 'cjkcodecs.iso_2022_cn', + 'iso2022_cn' : 'cjkcodecs.iso_2022_cn', + 'iso2022cn' : 'cjkcodecs.iso_2022_cn', + + # iso_2022_jp codec + 'iso_2022_jp' : 'cjkcodecs.iso_2022_jp', + 'iso2022_jp' : 'cjkcodecs.iso_2022_jp', + 'iso2022jp' : 'cjkcodecs.iso_2022_jp', + + # iso_2022_jp_1 codec + 'iso_2022_jp_1' : 'cjkcodecs.iso_2022_jp_1', + 'iso2022_jp_1' : 'cjkcodecs.iso_2022_jp_1', + 'iso2022jp_1' : 'cjkcodecs.iso_2022_jp_1', + + # iso_2022_jp_2 codec + 'iso_2022_jp_2' : 'cjkcodecs.iso_2022_jp_2', + 'iso2022_jp_2' : 'cjkcodecs.iso_2022_jp_2', + 'iso2022jp_2' : 'cjkcodecs.iso_2022_jp_2', # iso_2022_kr codec 'iso_2022_kr' : 'cjkcodecs.iso_2022_kr', 1.1 cjkcodecs/cjkcodecs/iso_2022_cn.py Index: iso_2022_cn.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. # # iso_2022_cn.py: Python Unicode Codec for ISO_2022_CN # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: iso_2022_cn.py,v 1.1 2003/06/02 07:44:41 perky Exp $ # from cjkcodecs._iso_2022_cn import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et 1.1 cjkcodecs/cjkcodecs/iso_2022_jp.py Index: iso_2022_jp.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. # # iso_2022_jp.py: Python Unicode Codec for ISO_2022_JP # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # $Id: iso_2022_jp.py,v 1.1 2003/06/02 07:44:41 perky Exp $ # from cjkcodecs._iso_2022_jp import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et 1.1 cjkcodecs/cjkcodecs/iso_2022_jp_1.py Index: iso_2022_jp_1.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. # # iso_2022_jp_1.py: Python Unicode Codec for ISO_2022_JP_1 # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. 
Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: iso_2022_jp_1.py,v 1.1 2003/06/02 07:44:41 perky Exp $ # from cjkcodecs._iso_2022_jp_1 import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et 1.1 cjkcodecs/cjkcodecs/iso_2022_jp_2.py Index: iso_2022_jp_2.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. # # iso_2022_jp_2.py: Python Unicode Codec for ISO_2022_JP_2 # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # $Id: iso_2022_jp_2.py,v 1.1 2003/06/02 07:44:41 perky Exp $ # from cjkcodecs._iso_2022_jp_2 import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:43:20
|
perky 03/06/02 00:39:21 Modified: . setup.py Log: Add iso-2022-kr codec finally! Revision Changes Path 1.17 +2 -2 cjkcodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/setup.py,v retrieving revision 1.16 retrieving revision 1.17 diff -u -r1.16 -r1.17 --- setup.py 29 May 2003 09:12:28 -0000 1.16 +++ setup.py 2 Jun 2003 07:39:20 -0000 1.17 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: setup.py,v 1.16 2003/05/29 09:12:28 perky Exp $ +# $Id: setup.py,v 1.17 2003/06/02 07:39:20 perky Exp $ # import sys @@ -37,7 +37,7 @@ extensions = [] encodings = { 'ja_JP': ['shift_jis', 'cp932', 'euc_jp'], -'ko_KR': ['euc_kr', 'cp949', 'johab'], +'ko_KR': ['euc_kr', 'cp949', 'johab', 'iso_2022_kr'], 'zh_CN': ['gb2312', 'gbk', 'gb18030', 'hz'], 'zh_TW': ['big5', 'cp950'], '': ['utf_8'], |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:43:19
|
perky 03/06/02 00:39:22 Modified: cjkcodecs Makefile aliases.py Added: cjkcodecs iso_2022_kr.py Log: Add iso-2022-kr codec finally! Revision Changes Path 1.6 +2 -2 cjkcodecs/cjkcodecs/Makefile Index: Makefile =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/Makefile,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- Makefile 29 May 2003 09:12:29 -0000 1.5 +++ Makefile 2 Jun 2003 07:39:22 -0000 1.6 @@ -25,13 +25,13 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: Makefile,v 1.5 2003/05/29 09:12:29 perky Exp $ +# $Id: Makefile,v 1.6 2003/06/02 07:39:22 perky Exp $ # GENERIC_ENCODINGS= gb2312 gbk gb18030 hz \ big5 cp950 \ cp932 shift_jis euc_jp \ - cp949 euc_kr johab \ + cp949 euc_kr johab iso_2022_kr \ utf_8 all: 1.6 +6 -1 cjkcodecs/cjkcodecs/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/aliases.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- aliases.py 29 May 2003 09:12:29 -0000 1.5 +++ aliases.py 2 Jun 2003 07:39:22 -0000 1.6 @@ -26,7 +26,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: aliases.py,v 1.5 2003/05/29 09:12:29 perky Exp $ +# $Id: aliases.py,v 1.6 2003/06/02 07:39:22 perky Exp $ # from encodings.aliases import aliases @@ -95,6 +95,11 @@ 'hzgb' : 'cjkcodecs.hz', 'hz_gb' : 'cjkcodecs.hz', 'hz_gb_2312' : 'cjkcodecs.hz', + + # iso_2022_kr codec + 'iso_2022_kr' : 'cjkcodecs.iso_2022_kr', + 'iso2022_kr' : 'cjkcodecs.iso_2022_kr', + 'iso2022kr' : 'cjkcodecs.iso_2022_kr', # johab codec 'johab' : 'cjkcodecs.johab', 1.1 cjkcodecs/cjkcodecs/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. 
# # iso_2022_kr.py: Python Unicode Codec for ISO_2022_KR # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: iso_2022_kr.py,v 1.1 2003/06/02 07:39:22 perky Exp $ # from cjkcodecs._iso_2022_kr import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:39:24
|
perky 03/06/02 00:39:23 Added: src _iso_2022_kr.c iso2022common.h Log: Add iso-2022-kr codec finally! Revision Changes Path 1.1 cjkcodecs/src/_iso_2022_kr.c Index: _iso_2022_kr.c =================================================================== /* * _iso_2022_kr.c: the ISO-2022-KR codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _iso_2022_kr.c,v 1.1 2003/06/02 07:39:22 perky Exp $ */ #include "codeccommon.h" #include "iso2022common.h" ENCMAP(cp949) DECMAP(ksx1001) #define HAVE_ENCODER_INIT ENCODER_INIT(iso_2022_kr) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } #define HAVE_ENCODER_RESET ENCODER_RESET(iso_2022_kr) { if (STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(1) **outbuf = SI; NEXT_OUT(1) STATE_CLEARFLAG(state, F_SHIFTED) } return 0; } ENCODER(iso_2022_kr) { while (inleft > 0) { Py_UNICODE c = **inbuf; DBCHAR code; if (c < 0x80) { if (STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(2) STATE_CLEARFLAG(state, F_SHIFTED) (*outbuf)[0] = SI; (*outbuf)[1] = c; NEXT(1, 2) } else { RESERVE_OUTBUF(1) (*outbuf)[0] = c; NEXT(1, 1) } if (c == '\n') STATE_CLEARFLAG(state, F_SHIFTED) } else UCS4INVALID(c) else { if (STATE_GETG1(state) != CHARSET_KSX1001) { RESERVE_OUTBUF(4) STATE_SETG1(state, CHARSET_KSX1001) (*outbuf)[0] = ESC; (*outbuf)[1] = '$'; (*outbuf)[2] = ')'; (*outbuf)[3] = 'C'; NEXT_OUT(4) } if (!STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(1) STATE_SETFLAG(state, F_SHIFTED) (*outbuf)[0] = SO; NEXT_OUT(1) } TRYMAP_ENC(cp949, code, c) { if (code & 0x8000) /* MSB set: CP949 */ return 1; RESERVE_OUTBUF(1) (*outbuf)[0] = code >> 8; (*outbuf)[1] = code & 0xff; NEXT(1, 2) } else return 1; } } return 0; } #define HAVE_DECODER_INIT DECODER_INIT(iso_2022_kr) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } #define HAVE_DECODER_RESET DECODER_RESET(iso_2022_kr) { STATE_CLEARFLAG(state, F_SHIFTED) return 0; } DECODER(iso_2022_kr) { while (inleft > 0) { unsigned char c = **inbuf; if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { /* ESC throughout mode: for non-iso2022 escape sequences */ RESERVE_OUTBUF(1) **outbuf = c; /* assume as ISO-8859-1 */ NEXT(1, 1) if (IS_ESCEND(c)) { STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) } continue; } switch (c) { case ESC: RESERVE_INBUF(2) if 
(IS_ISO2022ESC((*inbuf)[1])) { int eslen; eslen = iso2022esclen(*inbuf, inleft); if (eslen < 0) return eslen == MBERR_INTERNAL ? 1 : eslen; if (eslen == 3) { if ((*inbuf)[2] == 'B') { /* ASCII */ if ((*inbuf)[1] == '(') { STATE_SETG0(state, CHARSET_ASCII) } else if ((*inbuf)[1] == ')') { STATE_SETG1(state, CHARSET_ASCII) } else return 3; } else return 3; } else if (eslen == 4) { if ((*inbuf)[1] == '$' && (*inbuf)[3] == 'C') { /* KS X 1001 */ if ((*inbuf)[2] == '(') { STATE_SETG0(state, CHARSET_KSX1001) } else if ((*inbuf)[2] == ')') { STATE_SETG1(state, CHARSET_KSX1001) } else return 4; } else return 4; } else return eslen; NEXT_IN(eslen) } else { STATE_SETFLAG(state, F_ESCTHROUGHOUT) **outbuf = ESC; NEXT(1, 1) } break; case SI: STATE_CLEARFLAG(state, F_SHIFTED) NEXT_IN(1) break; case SO: STATE_SETFLAG(state, F_SHIFTED) NEXT_IN(1) break; case '\n': STATE_CLEARFLAG(state, F_SHIFTED) /* FALLTHROUGH */ case SP: /* FALLTHROUGH */ case DEL: RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) break; default: if ((c & 0x7f) < 0x20) { /* C0 and C1 */ RESERVE_OUTBUF(1) **outbuf = c & 0x7f; NEXT(1, 1) } else { unsigned char charset; if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ charset = STATE_GETG0(state); else /* G1 */ charset = STATE_GETG1(state); if (charset & CHARSET_DOUBLEBYTE) { /* all double byte character sets are in KS X 1001 here */ RESERVE_OUTBUF(1) TRYMAP_DEC(ksx1001, **outbuf, c & 0x7f, (*inbuf)[1] & 0x7f){ NEXT(2, 1) } else return 2; } else { RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) } } } } return 0; } #include "codecentry.h" BEGIN_CODEC_REGISTRY(iso_2022_kr) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) IMPORTMAP_ENC(cp949) MAPCLOSE() END_CODEC_REGISTRY(iso_2022_kr) /* * ex: ts=8 sts=4 et */ 1.1 cjkcodecs/src/iso2022common.h Index: iso2022common.h =================================================================== /* * iso2022common.h: Common Codec Routines for ISO-2022 codecs. * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: iso2022common.h,v 1.1 2003/06/02 07:39:22 perky Exp $ */ /* This ISO-2022 implementation is intended to comply ECMA-43 Level 1 * rather than RFCs itself */ #define ESC 0x1b #define SO 0x0e #define SI 0x0f #define SP 0x20 #define DEL 0x7f #define MAX_ESCSEQLEN 16 #define IS_ESCEND(c) ((c) >= 'A' && (c) <= 'Z') #define IS_ISO2022ESC(c2) ((c2) == '(' || (c2) == ')' || (c2) == '$') /* this is not a full list of ISO-2022 escape sequence headers. * but, it's enough to implement CJK instances of iso-2022. */ /* STATE 00000000 00000000 00000000 || ||^^^^^| ||^^^^^| || || | |+-----+---- G0 Character Set || || | +----------- Is G0 double byte? || |+-----+------------- G1 Character Set || +-------------------- Is G1 double byte? |+---------------------- Shifted in? 
+----------------------- ESC Throughout */ #define CHARSET_DOUBLEBYTE 0x80 #define CHARSET_ASCII 'B' #define CHARSET_KSX1001 ('C'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0201_R 'J' #define CHARSET_JISX0201_K 'I' #define CHARSET_JISX0208 ('B'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0208_O ('@'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0212 ('D'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0213_1 ('O'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0213_2 ('P'|CHARSET_DOUBLEBYTE) #define CHARSET_GB2312 ('A'|CHARSET_DOUBLEBYTE) #define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE) #define CHARSET_DESIGN(c) ((c) & 0x7f) #define CHARSET_ISDBCS(c) ((c) & 0x80) #define F_SHIFTED 0x010000 #define F_ESCTHROUGHOUT 0x020000 #define STATE_SETG0(s, v) ((s)->i) = (((s)->i) & ~0x0000ff) | (v); #define STATE_GETG0(s) ((s)->i & 0x0000ff) #define STATE_SETG1(s, v) ((s)->i) = (((s)->i) & ~0x00ff00) | ((v) << 8); #define STATE_GETG1(s) (((s)->i & 0x00ff00) >> 8) #define STATE_SETFLAG(s, f) ((s)->i) |= (f); #define STATE_GETFLAG(s, f) ((s)->i & (f)) #define STATE_CLEARFLAG(s, f) ((s)->i) &= ~(f); static int iso2022esclen(const unsigned char *s, size_t len) { int i; for (i = 1;i < MAX_ESCSEQLEN;i++) { if (i >= len) return MBERR_TOOFEW; if (IS_ESCEND(s[i])) return i + 1; } return MBERR_INTERNAL; /* unterminated escape sequence */ } /* * ex: ts=8 sts=4 et */ |
From: Hye-Shik C. <pe...@us...> - 2003-05-31 12:10:29
|
perky 03/05/31 04:50:20 Modified: src _big5.c _cp932.c _cp949.c _cp950.c _euc_jp.c _euc_kr.c _gb18030.c _gb2312.c _gbk.c _hz.c _johab.c _shift_jis.c _utf_8.c codeccommon.h multibytecodec.c multibytecodec.h Added: src codecentry.h Log: Activate init, reset methods for the codecs. (these features are mainly iso-2022-* and mac_* codecs.) Revision Changes Path 1.5 +2 -1 cjkcodecs/src/_big5.c Index: _big5.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_big5.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- _big5.c 26 May 2003 07:37:09 -0000 1.4 +++ _big5.c 31 May 2003 11:50:18 -0000 1.5 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _big5.c,v 1.4 2003/05/26 07:37:09 perky Exp $ + * $Id: _big5.c,v 1.5 2003/05/31 11:50:18 perky Exp $ */ #include "codeccommon.h" @@ -86,6 +86,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(big5) MAPOPEN(zh_TW) IMPORTMAP_ENCDEC(big5) 1.2 +2 -1 cjkcodecs/src/_cp932.c Index: _cp932.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_cp932.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _cp932.c 26 May 2003 07:57:52 -0000 1.1 +++ _cp932.c 31 May 2003 11:50:19 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _cp932.c,v 1.1 2003/05/26 07:57:52 perky Exp $ + * $Id: _cp932.c,v 1.2 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -131,6 +131,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(cp932) MAPOPEN(ja_JP) IMPORTMAP_DEC(jisx0208) 1.9 +2 -1 cjkcodecs/src/_cp949.c Index: _cp949.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_cp949.c,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- _cp949.c 26 May 2003 07:37:09 -0000 1.8 +++ _cp949.c 31 May 2003 11:50:19 -0000 1.9 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _cp949.c,v 1.8 2003/05/26 07:37:09 perky Exp $ + * $Id: _cp949.c,v 1.9 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -90,6 +90,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(cp949) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) 1.5 +2 -1 cjkcodecs/src/_cp950.c Index: _cp950.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_cp950.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- _cp950.c 26 May 2003 07:37:09 -0000 1.4 +++ _cp950.c 31 May 2003 11:50:19 -0000 1.5 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _cp950.c,v 1.4 2003/05/26 07:37:09 perky Exp $ + * $Id: _cp950.c,v 1.5 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -90,6 +90,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(cp950) MAPOPEN(zh_TW) IMPORTMAP_ENCDEC(big5) 1.3 +2 -1 cjkcodecs/src/_euc_jp.c Index: _euc_jp.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_euc_jp.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- _euc_jp.c 26 May 2003 07:37:09 -0000 1.2 +++ _euc_jp.c 31 May 2003 11:50:19 -0000 1.3 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _euc_jp.c,v 1.2 2003/05/26 07:37:09 perky Exp $ + * $Id: _euc_jp.c,v 1.3 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -158,6 +158,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(euc_jp) MAPOPEN(ja_JP) IMPORTMAP_DEC(jisx0208) 1.12 +2 -1 cjkcodecs/src/_euc_kr.c Index: _euc_kr.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_euc_kr.c,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- _euc_kr.c 26 May 2003 07:37:09 -0000 1.11 +++ _euc_kr.c 31 May 2003 11:50:19 -0000 1.12 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _euc_kr.c,v 1.11 2003/05/26 07:37:09 perky Exp $ + * $Id: _euc_kr.c,v 1.12 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -89,6 +89,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(euc_kr) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) 1.6 +2 -1 cjkcodecs/src/_gb18030.c Index: _gb18030.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_gb18030.c,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- _gb18030.c 26 May 2003 07:37:09 -0000 1.5 +++ _gb18030.c 31 May 2003 11:50:19 -0000 1.6 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _gb18030.c,v 1.5 2003/05/26 07:37:09 perky Exp $ + * $Id: _gb18030.c,v 1.6 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -191,6 +191,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(gb18030) MAPOPEN(zh_CN) IMPORTMAP_DEC(gb2312) 1.7 +2 -1 cjkcodecs/src/_gb2312.c Index: _gb2312.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_gb2312.c,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- _gb2312.c 26 May 2003 07:37:09 -0000 1.6 +++ _gb2312.c 31 May 2003 11:50:19 -0000 1.7 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _gb2312.c,v 1.6 2003/05/26 07:37:09 perky Exp $ + * $Id: _gb2312.c,v 1.7 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -88,6 +88,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(gb2312) MAPOPEN(zh_CN) IMPORTMAP_DEC(gb2312) 1.6 +2 -1 cjkcodecs/src/_gbk.c Index: _gbk.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_gbk.c,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- _gbk.c 26 May 2003 07:37:09 -0000 1.5 +++ _gbk.c 31 May 2003 11:50:19 -0000 1.6 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _gbk.c,v 1.5 2003/05/26 07:37:09 perky Exp $ + * $Id: _gbk.c,v 1.6 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -95,6 +95,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(gbk) MAPOPEN(zh_CN) IMPORTMAP_DEC(gb2312) 1.2 +2 -1 cjkcodecs/src/_hz.c Index: _hz.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_hz.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _hz.c 28 May 2003 09:18:14 -0000 1.1 +++ _hz.c 31 May 2003 11:50:19 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _hz.c,v 1.1 2003/05/28 09:18:14 perky Exp $ + * $Id: _hz.c,v 1.2 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -129,6 +129,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(hz) MAPOPEN(zh_CN) IMPORTMAP_DEC(gb2312) 1.4 +2 -1 cjkcodecs/src/_johab.c Index: _johab.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_johab.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- _johab.c 31 May 2003 03:05:38 -0000 1.3 +++ _johab.c 31 May 2003 11:50:19 -0000 1.4 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _johab.c,v 1.3 2003/05/31 03:05:38 perky Exp $ + * $Id: _johab.c,v 1.4 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -239,6 +239,7 @@ #undef NONE #undef FILL +#include "codecentry.h" BEGIN_CODEC_REGISTRY(johab) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) 1.5 +2 -1 cjkcodecs/src/_shift_jis.c Index: _shift_jis.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_shift_jis.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- _shift_jis.c 26 May 2003 15:06:45 -0000 1.4 +++ _shift_jis.c 31 May 2003 11:50:19 -0000 1.5 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: _shift_jis.c,v 1.4 2003/05/26 15:06:45 perky Exp $ + * $Id: _shift_jis.c,v 1.5 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -131,6 +131,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(shift_jis) MAPOPEN(ja_JP) IMPORTMAP_DEC(jisx0208) 1.3 +2 -1 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- _utf_8.c 31 May 2003 03:41:14 -0000 1.2 +++ _utf_8.c 31 May 2003 11:50:19 -0000 1.3 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.2 2003/05/31 03:41:14 perky Exp $ + * $Id: _utf_8.c,v 1.3 2003/05/31 11:50:19 perky Exp $ */ #include "codeccommon.h" @@ -190,6 +190,7 @@ return 0; } +#include "codecentry.h" BEGIN_CODEC_REGISTRY(utf_8) /* no maps */ END_CODEC_REGISTRY(utf_8) 1.10 +19 -102 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- codeccommon.h 26 May 2003 07:37:09 -0000 1.9 +++ codeccommon.h 31 May 2003 11:50:19 -0000 1.10 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: codeccommon.h,v 1.9 2003/05/26 07:37:09 perky Exp $ + * $Id: codeccommon.h,v 1.10 2003/05/31 11:50:19 perky Exp $ */ #include "Python.h" @@ -38,16 +38,30 @@ #define DECMAP(encoding) \ const static decode_map *encoding##decmap; +#define ENCODER_INIT(encoding) \ + static int encoding##_encode_init( \ + MultibyteCodec_State *state) #define ENCODER(encoding) \ static int encoding##_encode( \ MultibyteCodec_State *state, \ const Py_UNICODE **inbuf, size_t inleft, \ unsigned char **outbuf, size_t outleft) +#define ENCODER_RESET(encoding) \ + static int encoding##_encode_reset( \ + MultibyteCodec_State *state, \ + unsigned char **outbuf, size_t outleft) + +#define DECODER_INIT(encoding) \ + static int encoding##_decode_init( \ + MultibyteCodec_State *state) #define DECODER(encoding) \ static int encoding##_decode( \ MultibyteCodec_State *state, \ const unsigned char **inbuf, size_t inleft, \ Py_UNICODE **outbuf, size_t outleft) +#define DECODER_RESET(encoding) \ + static int encoding##_decode_reset( \ + MultibyteCodec_State *state) #if Py_UNICODE_SIZE == 4 #define UCS4INVALID(code) \ @@ -58,70 +72,13 @@ if (0) ; #endif -#define BEGIN_CODEC_REGISTRY(encoding) \ - static MultibyteCodec __codec = { \ - #encoding, encoding##_encode, encoding##_decode \ - }; \ - \ - static struct PyMethodDef __methods[] = { \ - {NULL, NULL}, \ - }; \ - \ - void \ - init_##encoding(void) \ - { \ - PyObject *codec; \ - PyObject *m = NULL, *mod = NULL, *o = NULL; \ - \ - m = Py_InitModule("_" #encoding, __methods); - -#define MAPOPEN(locale) \ - mod = PyImport_ImportModule("mapdata_" #locale); \ - if (mod == NULL) goto errorexit; \ - if ( -#define IMPORTMAP_ENCDEC(charset) \ - importmap(mod, "__map_" #charset, &charset##encmap, \ - &charset##decmap) || -#define IMPORTMAP_ENC(charset) \ - importmap(mod, "__map_" #charset, &charset##encmap, \ - NULL) || -#define IMPORTMAP_DEC(charset) \ - importmap(mod, "__map_" #charset, NULL, \ - &charset##decmap) || -#define MAPCLOSE() \ - 0) goto 
errorexit; \ - Py_DECREF(mod); - -#define END_CODEC_REGISTRY(encoding) \ - mod = PyImport_ImportModule("multibytecodec"); \ - if (mod == NULL) goto errorexit; \ - o = PyObject_GetAttrString(mod, "__create_codec"); \ - if (o == NULL || !PyCallable_Check(o)) \ - goto errorexit; \ - \ - codec = createcodec(o, &__codec); \ - if (codec == NULL) \ - goto errorexit; \ - PyModule_AddObject(m, "codec", codec); \ - Py_DECREF(o); Py_DECREF(mod); \ - \ - if (PyErr_Occurred()) \ - Py_FatalError("can't initialize the _" #encoding \ - " module"); \ - \ - return; \ - \ -errorexit: \ - Py_XDECREF(m); \ - Py_XDECREF(mod); \ - Py_XDECREF(o); \ -} - -#define NEXT(i, o) \ +#define NEXT_IN(i) \ (*inbuf) += (i); \ - (inleft) -= (i); \ + (inleft) -= (i); +#define NEXT_OUT(o) \ (*outbuf) += (o); \ (outleft) -= (o); +#define NEXT(i, o) NEXT_IN(i) NEXT_OUT(o) #define RESERVE_INBUF(n) \ if (inleft < (n)) \ @@ -142,46 +99,6 @@ (m)->bottom]) != UNIINV) #define TRYMAP_DEC(charset, assi, c1, c2) \ _TRYMAP_DEC(&charset##decmap[c1], assi, c2) - -static int -importmap(PyObject *mod, const char *symbol, - const struct unim_index **encmap, const struct dbcs_index **decmap) -{ - PyObject *o; - - o = PyObject_GetAttrString(mod, (char*)symbol); - if (o == NULL) - return -1; - else if (!PyCObject_Check(o)) { - PyErr_SetString(PyExc_ValueError, "map data must be a CObject."); - return -1; - } else { - struct dbcs_map *map; - map = PyCObject_AsVoidPtr(o); - if (encmap != NULL) - *encmap = map->encmap; - if (decmap != NULL) - *decmap = map->decmap; - Py_DECREF(o); - } - - return 0; -} - -static PyObject * -createcodec(PyObject *cofunc, MultibyteCodec *codec) -{ - PyObject *args, *r; - - args = PyTuple_New(1); - if (args == NULL) return NULL; - PyTuple_SET_ITEM(args, 0, PyCObject_FromVoidPtr(codec, NULL)); - - r = PyObject_CallObject(cofunc, args); - Py_DECREF(args); - - return r; -} /* * ex: ts=8 sts=4 et 1.13 +67 -15 cjkcodecs/src/multibytecodec.c Index: multibytecodec.c 
=================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- multibytecodec.c 27 May 2003 06:32:36 -0000 1.12 +++ multibytecodec.c 31 May 2003 11:50:19 -0000 1.13 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: multibytecodec.c,v 1.12 2003/05/27 06:32:36 perky Exp $ + * $Id: multibytecodec.c,v 1.13 2003/05/31 11:50:19 perky Exp $ */ #include "Python.h" @@ -466,7 +466,8 @@ if (errorcb == NULL) return NULL; - state.p = NULL; + if (self->codec->encinit != NULL && self->codec->encinit(&state) != 0) + goto errorexit; r = multibytecodec_encode(self->codec, &state, data, datalen, errorcb); if (r == NULL) goto errorexit; @@ -516,7 +517,9 @@ buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj); - state.p = NULL; + if (self->codec->decinit != NULL && self->codec->decinit(&state) != 0) + goto errorexit; + while (buf.inbuf < buf.inbuf_end) { size_t inleft, outleft; int r; @@ -829,7 +832,9 @@ static PyObject * mbstreamreader_reset(MultibyteStreamReaderObject *self) { - self->state.p = NULL; + if (self->codec->decreset != NULL && + self->codec->decreset(&self->state) != 0) + return NULL; self->pendingsize = 0; Py_INCREF(Py_None); @@ -975,7 +980,48 @@ static PyObject * mbstreamwriter_reset(MultibyteStreamWriterObject *self) { - self->state.p = NULL; + if (self->codec->encreset != NULL) { + PyObject *rsbuf = NULL; + size_t rsbufsiz, rsbufnc; + int r; + unsigned char *rsbuf_top, *rsbuf_cur; + + rsbufnc = 0; + for (rsbufsiz = NULL;;rsbufsiz *= 2) { + if (rsbuf == NULL) { + rsbuf = PyString_FromStringAndSize(NULL, rsbufsiz); + if (rsbuf == NULL) + return NULL; + } else { + if (_PyString_Resize(&rsbuf, rsbufsiz)) + goto errorexit; + } + rsbuf_top = (unsigned char *)PyString_AS_STRING(rsbuf); + rsbuf_cur = rsbuf_top 
+ rsbufnc; + + r = self->codec->encreset(&self->state, + &rsbuf_cur, rsbufsiz - rsbufnc); + rsbufnc = (size_t)(rsbuf_cur - rsbuf_top); + if (r == MBERR_TOOSMALL) + continue; + else { + if (r != 0) + goto errorexit; + else + break; + } + } + + if (_PyString_Resize(&rsbuf, rsbufnc)) { +errorexit: Py_DECREF(rsbuf); + return NULL; + } + + r = mbstreamwriter_iwrite(self, rsbuf); + Py_DECREF(rsbuf); + if (r == -1) + return NULL; + } Py_INCREF(Py_None); return Py_None; @@ -1067,15 +1113,18 @@ self->codec = codec; self->stream = stream; Py_INCREF(stream); - self->state.p = NULL; self->pendingsize = 0; self->errors = get_errorcallback(errors); - if (self->errors == NULL) { - Py_DECREF(self); - return NULL; - } + if (self->errors == NULL) + goto errorexit; + if (self->codec->decinit != NULL && self->codec->decinit(&self->state) != 0) + goto errorexit; return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + return NULL; } static PyObject * @@ -1092,14 +1141,17 @@ self->codec = codec; self->stream = stream; Py_INCREF(stream); - self->state.p = NULL; self->errors = get_errorcallback(errors); - if (self->errors == NULL) { - Py_DECREF(self); - return NULL; - } + if (self->errors == NULL) + goto errorexit; + if (self->codec->encinit != NULL && self->codec->encinit(&self->state) != 0) + goto errorexit; return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + return NULL; } static struct PyMethodDef __methods[] = { 1.7 +13 -4 cjkcodecs/src/multibytecodec.h Index: multibytecodec.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.h,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- multibytecodec.h 22 May 2003 00:01:50 -0000 1.6 +++ multibytecodec.h 31 May 2003 11:50:19 -0000 1.7 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.h,v 1.6 2003/05/22 00:01:50 perky Exp $ + * $Id: multibytecodec.h,v 1.7 2003/05/31 11:50:19 perky Exp $ */ #ifndef _PYTHON_MULTIBYTECODEC_H_ @@ -43,14 +43,23 @@ typedef int (*mbencode_func)(MultibyteCodec_State *state, const Py_UNICODE **inbuf, size_t inleft, unsigned char **outbuf, size_t outleft); +typedef int (*mbencodeinit_func)(MultibyteCodec_State *state); +typedef int (*mbencodereset_func)(MultibyteCodec_State *state, + unsigned char **outbuf, size_t outleft); typedef int (*mbdecode_func)(MultibyteCodec_State *state, const unsigned char **inbuf, size_t inleft, Py_UNICODE **outbuf, size_t outleft); +typedef int (*mbdecodeinit_func)(MultibyteCodec_State *state); +typedef int (*mbdecodereset_func)(MultibyteCodec_State *state); typedef struct { - const char *encoding; - mbencode_func encode; - mbdecode_func decode; + const char *encoding; + mbencode_func encode; + mbencodeinit_func encinit; + mbencodereset_func encreset; + mbdecode_func decode; + mbdecodeinit_func decinit; + mbdecodereset_func decreset; } MultibyteCodec; typedef struct { 1.1 cjkcodecs/src/codecentry.h Index: codecentry.h ===================================================================
/*
 * codecentry.h: Common Codec Entry Routines
 *
 * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id: codecentry.h,v 1.1 2003/05/31 11:50:19 perky Exp $
 */

/*
 * A codec source file defines HAVE_ENCODER_INIT, HAVE_ENCODER_RESET,
 * HAVE_DECODER_INIT and/or HAVE_DECODER_RESET before including this
 * header; each *_FUNC macro then names the matching function, or NULL
 * when the codec does not provide it.
 */
#ifdef HAVE_ENCODER_INIT
#define ENCODER_INIT_FUNC(encoding)     encoding##_encode_init
#else
#define ENCODER_INIT_FUNC(encoding)     NULL
#endif

#ifdef HAVE_ENCODER_RESET
#define ENCODER_RESET_FUNC(encoding)    encoding##_encode_reset
#else
#define ENCODER_RESET_FUNC(encoding)    NULL
#endif

#ifdef HAVE_DECODER_INIT
#define DECODER_INIT_FUNC(encoding)     encoding##_decode_init
#else
#define DECODER_INIT_FUNC(encoding)     NULL
#endif

#ifdef HAVE_DECODER_RESET
#define DECODER_RESET_FUNC(encoding)    encoding##_decode_reset
#else
#define DECODER_RESET_FUNC(encoding)    NULL
#endif

/*
 * BEGIN_CODEC_REGISTRY opens the module init function init_<encoding>;
 * MAPOPEN / IMPORTMAP_* / MAPCLOSE go in between, and
 * END_CODEC_REGISTRY closes it.  A failed Py_InitModule() returns
 * immediately so that `m` is never used while NULL.
 */
#define BEGIN_CODEC_REGISTRY(encoding) \
    static MultibyteCodec __codec = { \
        #encoding, \
        encoding##_encode, \
        ENCODER_INIT_FUNC(encoding), \
        ENCODER_RESET_FUNC(encoding), \
        encoding##_decode, \
        DECODER_INIT_FUNC(encoding), \
        DECODER_RESET_FUNC(encoding), \
    }; \
    \
    static struct PyMethodDef __methods[] = { \
        {NULL, NULL}, \
    }; \
    \
    void \
    init_##encoding(void) \
    { \
        PyObject *codec; \
        PyObject *m = NULL, *mod = NULL, *o = NULL; \
        \
        m = Py_InitModule("_" #encoding, __methods); \
        if (m == NULL) \
            return;

/* MAPOPEN starts an `if (` condition that the IMPORTMAP_* macros extend
 * with `||` terms and MAPCLOSE terminates. */
#define MAPOPEN(locale) \
    mod = PyImport_ImportModule("mapdata_" #locale); \
    if (mod == NULL) goto errorexit; \
    if (
#define IMPORTMAP_ENCDEC(charset) \
    importmap(mod, "__map_" #charset, &charset##encmap, \
        &charset##decmap) ||
#define IMPORTMAP_ENC(charset) \
    importmap(mod, "__map_" #charset, &charset##encmap, \
        NULL) ||
#define IMPORTMAP_DEC(charset) \
    importmap(mod, "__map_" #charset, NULL, \
        &charset##decmap) ||
#define MAPCLOSE() \
    0) goto errorexit; \
    Py_DECREF(mod);

/*
 * Creates the codec object through multibytecodec.__create_codec and
 * stores it in the module as "codec".  The error path releases only
 * `mod` and `o`: `m` comes from Py_InitModule(), which hands back a
 * borrowed reference, so it must not be DECREF'ed here.
 */
#define END_CODEC_REGISTRY(encoding) \
    mod = PyImport_ImportModule("multibytecodec"); \
    if (mod == NULL) goto errorexit; \
    o = PyObject_GetAttrString(mod, "__create_codec"); \
    if (o == NULL || !PyCallable_Check(o)) \
        goto errorexit; \
    \
    codec = createcodec(o, &__codec); \
    if (codec == NULL) \
        goto errorexit; \
    PyModule_AddObject(m, "codec", codec); \
    Py_DECREF(o); Py_DECREF(mod); \
    \
    if (PyErr_Occurred()) \
        Py_FatalError("can't initialize the _" #encoding \
                      " module"); \
    \
    return; \
    \
errorexit: \
    Py_XDECREF(mod); \
    Py_XDECREF(o); \
}

/*
 * Fetch the CObject named `symbol` from `mod` and copy its encode and/or
 * decode map pointers into *encmap / *decmap (either may be NULL to skip
 * it).  Returns 0 on success, -1 with a Python exception set on failure.
 */
static int
importmap(PyObject *mod, const char *symbol,
          const struct unim_index **encmap, const struct dbcs_index **decmap)
{
    PyObject *o;
    struct dbcs_map *map;

    o = PyObject_GetAttrString(mod, (char*)symbol);
    if (o == NULL)
        return -1;

    if (!PyCObject_Check(o)) {
        PyErr_SetString(PyExc_ValueError, "map data must be a CObject.");
        Py_DECREF(o);   /* do not leak the attribute on the error path */
        return -1;
    }

    map = PyCObject_AsVoidPtr(o);
    if (encmap != NULL)
        *encmap = map->encmap;
    if (decmap != NULL)
        *decmap = map->decmap;
    Py_DECREF(o);

    return 0;
}

/*
 * Call `cofunc` with a single argument: a CObject wrapping `codec`.
 * Returns the call's result (a new reference), or NULL on failure.
 */
static PyObject *
createcodec(PyObject *cofunc, MultibyteCodec *codec)
{
    PyObject *args, *cobj, *r;

    cobj = PyCObject_FromVoidPtr(codec, NULL);
    if (cobj == NULL)
        return NULL;

    args = PyTuple_New(1);
    if (args == NULL) {
        Py_DECREF(cobj);
        return NULL;
    }
    PyTuple_SET_ITEM(args, 0, cobj);    /* the tuple now owns cobj */

    r = PyObject_CallObject(cofunc, args);
    Py_DECREF(args);

    return r;
}

/*
 * ex: ts=8 sts=4 et
 */
From: Hye-Shik C. <pe...@us...> - 2003-05-31 03:51:34
|
perky 03/05/30 20:41:14 Modified: src _utf_8.c Log: Detect utf-8 lengths > 3 correctly even on --with-unicode=ucs2 Revision Changes Path 1.2 +15 -8 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _utf_8.c 29 May 2003 09:12:30 -0000 1.1 +++ _utf_8.c 31 May 2003 03:41:14 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.1 2003/05/29 09:12:30 perky Exp $ + * $Id: _utf_8.c,v 1.2 2003/05/31 03:41:14 perky Exp $ */ #include "codeccommon.h" @@ -98,7 +98,7 @@ if (c < 0x80) { (*outbuf)[0] = (unsigned char)c; NEXT(1, 1) - } else if (c < 0xc2 || c == 0xff) { + } else if (c < 0xc2) { return 1; } else if (c < 0xe0) { unsigned char c2; @@ -121,12 +121,10 @@ | ((Py_UNICODE)(c2 ^ 0x80) << 6) | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) - } + } else if (c < 0xf8) { #if Py_UNICODE_SIZE == 2 - else - return 3; + return 4; #else - } else if (c < 0xf8) { unsigned char c2, c3, c4; RESERVER_INBUF(4) @@ -141,7 +139,11 @@ | ((Py_UNICODE)(c3 ^ 0x80) << 6) | (Py_UNICODE)(c4 ^ 0x80); NEXT(4, 1) +#endif } else if (c < 0xfc) { +#if Py_UNICODE_SIZE == 2 + return 5; +#else unsigned char c2, c3, c4, c5; RESERVER_INBUF(5) @@ -157,7 +159,11 @@ | ((Py_UNICODE)(c4 ^ 0x80) << 6) | (Py_UNICODE)(c5 ^ 0x80); NEXT(5, 1) - } else { /* 0xff is excluded above */ +#endif + } else if (c < 0xff) { +#if Py_UNICODE_SIZE == 2 + return 6; +#else unsigned char c2, c3, c4, c5, c6; RESERVER_INBUF(6) @@ -176,8 +182,9 @@ | ((Py_UNICODE)(c5 ^ 0x80) << 6) | (Py_UNICODE)(c6 ^ 0x80); NEXT(6, 1) - } #endif + } else + return 1; } return 0; |
From: Hye-Shik C. <pe...@us...> - 2003-05-31 03:05:41
|
perky 03/05/30 20:05:39 Modified: src _johab.c Log: Damn my fat fingers! Revision Changes Path 1.3 +2 -2 cjkcodecs/src/_johab.c Index: _johab.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_johab.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- _johab.c 29 May 2003 07:31:07 -0000 1.2 +++ _johab.c 31 May 2003 03:05:38 -0000 1.3 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _johab.c,v 1.2 2003/05/29 07:31:07 perky Exp $ + * $Id: _johab.c,v 1.3 2003/05/31 03:05:38 perky Exp $ */ #include "codeccommon.h" @@ -89,7 +89,7 @@ unsigned char c1, c2, t2; unsigned short t1; - assert(code & 0x8000 = 0); + assert(code & 0x8000 == 0); c1 = code >> 8; c2 = code & 0xff; if (((c1 >= 0x21 && c1 <= 0x2c) || (c1 >= 0x4a && c1 <= 0x7d)) |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 09:17:39
|
perky 03/05/29 02:17:38 Modified: tests test_multibytecodec_support.py Log: We have a working 'utf-8' StreamReader now. Revision Changes Path 1.5 +2 -3 cjkcodecs/tests/test_multibytecodec_support.py Index: test_multibytecodec_support.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_multibytecodec_support.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_multibytecodec_support.py 27 May 2003 05:17:48 -0000 1.4 +++ test_multibytecodec_support.py 29 May 2003 09:17:38 -0000 1.5 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: test_multibytecodec_support.py,v 1.4 2003/05/27 05:17:48 perky Exp $ +# $Id: test_multibytecodec_support.py,v 1.5 2003/05/29 09:17:38 perky Exp $ # import sys, codecs, os.path @@ -134,8 +134,7 @@ # We can't test with the real utf-8 StreamReader here. # The standard SR.readline{,s} are mostly broken for multibyte seqs. #UTF8Reader = codecs.lookup('utf-8')[2] - return - UTF8Reader = iconv_codec.lookup('utf-8')[2] + UTF8Reader = codecs.lookup('cjkcodecs.utf-8')[2] for name in ["read", "readline", "readlines"]: for sizehint in [None, -1] + range(1, 33) + \ [64, 128, 256, 512, 1024]: |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 09:12:30
|
perky 03/05/29 02:12:30 Added: src _utf_8.c Log: Add utf-8 codec. (we need this to have a 'sane' UTF-8 StreamReader.) Revision Changes Path 1.1 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== /* * _utf_8.c: the UTF-8 codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _utf_8.c,v 1.1 2003/05/29 09:12:30 perky Exp $ */ #include "codeccommon.h" ENCODER(utf_8) { while (inleft > 0) { Py_UNICODE c = **inbuf; int size; if (c < 0x80) size = 1; else if (c < 0x800) size = 2; #if Py_UNICODE_SIZE == 2 else size = 3; #else else if (c < 0x10000) size = 3; else if (c < 0x200000) size = 4; else if (c < 0x4000000) size = 5; else size = 6; #endif RESERVE_OUTBUF(size) switch (size) { #if Py_UNICODE_SIZE == 4 case 6: (*outbuf)[5] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x4000000; /* FALLTHROUGH */ case 5: (*outbuf)[4] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x200000; /* FALLTHROUGH */ case 4: (*outbuf)[3] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x10000; /* FALLTHROUGH */ #endif case 3: (*outbuf)[2] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x800; /* FALLTHROUGH */ case 2: (*outbuf)[1] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0xc0; /* FALLTHROUGH */ case 1: (*outbuf)[0] = c; } NEXT(1, size) } return 0; } DECODER(utf_8) { while (inleft > 0) { unsigned char c = **inbuf; RESERVE_OUTBUF(1) if (c < 0x80) { (*outbuf)[0] = (unsigned char)c; NEXT(1, 1) } else if (c < 0xc2 || c == 0xff) { return 1; } else if (c < 0xe0) { unsigned char c2; RESERVE_INBUF(2) c2 = (*inbuf)[1]; if (!((c2 ^ 0x80) < 0x40)) return 2; **outbuf = ((Py_UNICODE)(c & 0x1f) << 6) | (Py_UNICODE)(c2 ^ 0x80); NEXT(2, 1) } else if (c < 0xf0) { unsigned char c2, c3; RESERVE_INBUF(3) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c >= 0xe1 || c2 >= 0xa0))) return 3; **outbuf = ((Py_UNICODE)(c & 0x0f) << 12) | ((Py_UNICODE)(c2 ^ 0x80) << 6) | (Py_UNICODE)(c3 ^ 0x80); NEXT(3, 1) } #if Py_UNICODE_SIZE == 2 else return 3; #else } else if (c < 0xf8) { unsigned char c2, c3, c4; RESERVER_INBUF(4) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c >= 0xf1 || c2 >= 0x90))) return 4; **outbuf = ((Py_UNICODE)(c & 0x07) << 18) | ((Py_UNICODE)(c2 ^ 0x80) << 12) | ((Py_UNICODE)(c3 ^ 0x80) << 6) | 
(Py_UNICODE)(c4 ^ 0x80); NEXT(4, 1) } else if (c < 0xfc) { unsigned char c2, c3, c4, c5; RESERVER_INBUF(5) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88))) return 5; **outbuf = ((Py_UNICODE)(c & 0x03) << 24) | ((Py_UNICODE)(c2 ^ 0x80) << 18) | ((Py_UNICODE)(c3 ^ 0x80) << 12) | ((Py_UNICODE)(c4 ^ 0x80) << 6) | (Py_UNICODE)(c5 ^ 0x80); NEXT(5, 1) } else { /* 0xff is excluded above */ unsigned char c2, c3, c4, c5, c6; RESERVER_INBUF(6) c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; c6 = (*inbuf)[5]; if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 && (c >= 0xfd || c2 >= 0x84))) return 6; **outbuf = ((Py_UNICODE)(c & 0x01) << 30) | ((Py_UNICODE)(c2 ^ 0x80) << 24) | ((Py_UNICODE)(c3 ^ 0x80) << 18) | ((Py_UNICODE)(c4 ^ 0x80) << 12) | ((Py_UNICODE)(c5 ^ 0x80) << 6) | (Py_UNICODE)(c6 ^ 0x80); NEXT(6, 1) } #endif } return 0; } BEGIN_CODEC_REGISTRY(utf_8) /* no maps */ END_CODEC_REGISTRY(utf_8) /* * ex: ts=8 sts=4 et */ |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 09:12:30
|
perky 03/05/29 02:12:29 Modified: cjkcodecs Makefile aliases.py Added: cjkcodecs utf_8.py Log: Add utf-8 codec. (we need this to have a 'sane' UTF-8 StreamReader.) Revision Changes Path 1.5 +3 -2 cjkcodecs/cjkcodecs/Makefile Index: Makefile =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/Makefile,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- Makefile 29 May 2003 07:22:37 -0000 1.4 +++ Makefile 29 May 2003 09:12:29 -0000 1.5 @@ -25,13 +25,14 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: Makefile,v 1.4 2003/05/29 07:22:37 perky Exp $ +# $Id: Makefile,v 1.5 2003/05/29 09:12:29 perky Exp $ # GENERIC_ENCODINGS= gb2312 gbk gb18030 hz \ big5 cp950 \ cp932 shift_jis euc_jp \ - cp949 euc_kr johab + cp949 euc_kr johab \ + utf_8 all: for cset in ${GENERIC_ENCODINGS}; do \ 1.5 +5 -1 cjkcodecs/cjkcodecs/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/cjkcodecs/aliases.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- aliases.py 29 May 2003 07:22:38 -0000 1.4 +++ aliases.py 29 May 2003 09:12:29 -0000 1.5 @@ -26,7 +26,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: aliases.py,v 1.4 2003/05/29 07:22:38 perky Exp $ +# $Id: aliases.py,v 1.5 2003/05/29 09:12:29 perky Exp $ # from encodings.aliases import aliases @@ -107,6 +107,10 @@ 'shift_jis' : 'cjkcodecs.shift_jis', 'sjis' : 'cjkcodecs.shift_jis', 's_jis' : 'cjkcodecs.shift_jis', + + # utf_8 codec + #'utf8' : 'cjkcodecs.utf8', + #'utf_8' : 'cjkcodecs.utf_8', }) del aliases 1.1 cjkcodecs/cjkcodecs/utf_8.py Index: utf_8.py =================================================================== # ACHTUNG: This file is generated automatically. Please do not edit. 
# # utf_8.py: Python Unicode Codec for UTF_8 # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: utf_8.py,v 1.1 2003/05/29 09:12:29 perky Exp $ # from cjkcodecs._utf_8 import codec import codecs class Codec(codecs.Codec): encode = codec.encode decode = codec.decode class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) __codec = codec.StreamReader(stream, errors) self.read = __codec.read self.readline = __codec.readline self.readlines = __codec.readlines self.reset = __codec.reset class StreamWriter(Codec, codecs.StreamWriter): def __init__(self, stream, errors='strict'): codecs.StreamWriter.__init__(self, stream, errors) __codec = codec.StreamWriter(stream, errors) self.write = __codec.write self.writelines = __codec.writelines self.reset = __codec.reset def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 09:12:29
|
perky 03/05/29 02:12:29 Modified: . setup.py Log: Add utf-8 codec. (we need this to have a 'sane' UTF-8 StreamReader.) Revision Changes Path 1.16 +7 -5 cjkcodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/setup.py,v retrieving revision 1.15 retrieving revision 1.16 diff -u -r1.15 -r1.16 --- setup.py 29 May 2003 07:22:37 -0000 1.15 +++ setup.py 29 May 2003 09:12:28 -0000 1.16 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: setup.py,v 1.15 2003/05/29 07:22:37 perky Exp $ +# $Id: setup.py,v 1.16 2003/05/29 09:12:28 perky Exp $ # import sys @@ -39,9 +39,10 @@ 'ja_JP': ['shift_jis', 'cp932', 'euc_jp'], 'ko_KR': ['euc_kr', 'cp949', 'johab'], 'zh_CN': ['gb2312', 'gbk', 'gb18030', 'hz'], -'zh_TW': ['big5', 'cp950'] +'zh_TW': ['big5', 'cp950'], +'': ['utf_8'], } -locales = ['ja_JP', 'ko_KR', 'zh_CN', 'zh_TW'] +locales = ['ja_JP', 'ko_KR', 'zh_CN', 'zh_TW', ''] for arg in sys.argv[1:]: # don't use getopt to ignore arguments for distutils args = arg.split('=', 1) @@ -68,8 +69,9 @@ sys.argv.remove(arg) for loc in locales: - extensions.append(Extension('cjkcodecs.mapdata_'+loc, - ['src/maps/mapdata_%s.c'%loc])) + if loc: + extensions.append(Extension('cjkcodecs.mapdata_'+loc, + ['src/maps/mapdata_%s.c'%loc])) for enc in encodings[loc]: extensions.append(Extension('cjkcodecs._'+enc, ['src/_%s.c'%enc])) |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 08:33:56
|
perky 03/05/29 00:58:14 Modified: tests test_mapping_johab.py Log: Explain why we ignore the WON SIGN mapping here. Revision Changes Path 1.2 +5 -1 cjkcodecs/tests/test_mapping_johab.py Index: test_mapping_johab.py =================================================================== RCS file: /cvsroot/koco/cjkcodecs/tests/test_mapping_johab.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- test_mapping_johab.py 29 May 2003 07:33:44 -0000 1.1 +++ test_mapping_johab.py 29 May 2003 07:58:13 -0000 1.2 @@ -27,7 +27,7 @@ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # -# $Id: test_mapping_johab.py,v 1.1 2003/05/29 07:33:44 perky Exp $ +# $Id: test_mapping_johab.py,v 1.2 2003/05/29 07:58:13 perky Exp $ # from test import test_support @@ -44,6 +44,10 @@ unittest.TestCase): encoding = 'johab' mapfilename = 'JOHAB.TXT' + # The KS X 1001 standard assigns 0x5c to WON SIGN, + # but in the early 90s, the only era in which johab was widely used, + # most software implemented it as REVERSE SOLIDUS. + # So, we ignore the standard here. pass_enctest = [('\\', u'\u20a9')] pass_dectest = [('\\', u'\u20a9')] |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 07:40:34
|
perky 03/05/29 00:31:08 Modified: src _johab.c Log: Remove debug routines. Revision Changes Path 1.2 +2 -6 cjkcodecs/src/_johab.c Index: _johab.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_johab.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- _johab.c 29 May 2003 07:22:38 -0000 1.1 +++ _johab.c 29 May 2003 07:31:07 -0000 1.2 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _johab.c,v 1.1 2003/05/29 07:22:38 perky Exp $ + * $Id: _johab.c,v 1.2 2003/05/29 07:31:07 perky Exp $ */ #include "codeccommon.h" @@ -83,7 +83,6 @@ (u2johabidx_choseong[c / 588] << 10) | (u2johabidx_jungseong[(c / 28) % 21] << 5) | u2johabidx_jongseong[c % 28]; - printf("cho %d jung %d jong %d\n", c / 588, (c / 28) % 21, c % 28); } else if (c >= 0x3131 && c <= 0x3163) code = u2johabjamo[c - 0x3131]; else TRYMAP_ENC(cp949, code, c) { @@ -206,13 +205,11 @@ **outbuf = 0x3100 | johabjamo_choseong[c_cho]; else return 2; - } else { - printf("cho %d jung %d jong %d\n", i_cho, i_jung, i_jong); + } else **outbuf = 0xac00 + i_cho * 588 + i_jung * 28 + (i_jong == FILL ? 0 : i_jong); - } } NEXT(2, 1) } else { @@ -230,7 +227,6 @@ t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21; t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21; - printf("t1 %02x t2 %02x\n", t1, t2); TRYMAP_DEC(ksx1001, **outbuf, t1, t2); else return 2; NEXT(2, 1) |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 07:33:45
|
perky 03/05/29 00:33:44 Added: tests/sampletexts johab.txt johab.utf8 Log: Add unittests for johab encoding. Revision Changes Path 1.1 cjkcodecs/tests/sampletexts/johab.txt <<Binary file>> 1.1 cjkcodecs/tests/sampletexts/johab.utf8 <<Binary file>> |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 07:33:45
|
perky 03/05/29 00:33:44 Added: tests test_encoding_johab.py test_mapping_johab.py Log: Add unittests for johab encoding. Revision Changes Path 1.1 cjkcodecs/tests/test_encoding_johab.py Index: test_encoding_johab.py =================================================================== #!/usr/bin/env python # # test_encoding_johab.py: Encoding test for the JOHAB codec # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# # $Id: test_encoding_johab.py,v 1.1 2003/05/29 07:33:44 perky Exp $ # from test import test_support import test_multibytecodec_support import unittest class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase): encoding = 'johab' tstring = test_multibytecodec_support.load_teststring('johab') errortests = ( # invalid bytes ("abc\x80\x80\xc1\xc4", "strict", None), ("abc\xc8", "strict", None), ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\ucd27"), ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\ucd27\ufffd"), ("abc\x80\x80\xc1\xc4", "ignore", u"abc\ucd27"), ) def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(Test_JOHAB)) test_support.run_suite(suite) if __name__ == "__main__": test_main() # ex: ts=8 sts=4 et 1.1 cjkcodecs/tests/test_mapping_johab.py Index: test_mapping_johab.py =================================================================== #!/usr/bin/env python # # test_mapping_johab.py: Mapping test for JOHAB codec # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # $Id: test_mapping_johab.py,v 1.1 2003/05/29 07:33:44 perky Exp $ # from test import test_support import test_multibytecodec_support import sys, codecs, os import unittest if not os.path.exists('JOHAB.TXT'): raise test_support.TestSkipped( 'JOHAB.TXT not found, download from ftp://ftp.unicode.' 'org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/JOHAB.TXT') class TestJOHABMap(test_multibytecodec_support.TestBase_Mapping, unittest.TestCase): encoding = 'johab' mapfilename = 'JOHAB.TXT' pass_enctest = [('\\', u'\u20a9')] pass_dectest = [('\\', u'\u20a9')] def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(TestJOHABMap)) test_support.run_suite(suite) if __name__ == "__main__": test_main() # ex: ts=8 sts=4 et |
From: Hye-Shik C. <pe...@us...> - 2003-05-29 07:22:39
|
perky 03/05/29 00:22:38 Added: src _johab.c Log: Add 'johab' codec. Revision Changes Path 1.1 cjkcodecs/src/_johab.c Index: _johab.c =================================================================== /* * _johab.c: the Johab codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _johab.c,v 1.1 2003/05/29 07:22:38 perky Exp $ */ #include "codeccommon.h" ENCMAP(cp949) DECMAP(ksx1001) static const unsigned char u2johabidx_choseong[32] = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, }; static const unsigned char u2johabidx_jungseong[32] = { 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1a, 0x1b, 0x1c, 0x1d, }; static const unsigned char u2johabidx_jongseong[32] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, }; static const DBCHAR u2johabjamo[] = { 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, 0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, 0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, 0x8741, 0x8761, 0x8781, 0x87a1, }; ENCODER(johab) { while (inleft > 0) { Py_UNICODE c = **inbuf; DBCHAR code; if (c < 0x80) { RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) continue; } UCS4INVALID(c) RESERVE_OUTBUF(2) if (c >= 0xac00 && c <= 0xd7a3) { c -= 0xac00; code = 0x8000 | (u2johabidx_choseong[c / 588] << 10) | (u2johabidx_jungseong[(c / 28) % 21] << 5) | u2johabidx_jongseong[c % 28]; printf("cho %d jung %d jong %d\n", c / 588, (c / 28) % 21, c % 28); } else if (c >= 0x3131 && c <= 0x3163) code = u2johabjamo[c - 0x3131]; else TRYMAP_ENC(cp949, code, c) { unsigned char c1, c2, t2; unsigned short t1; assert(code & 0x8000 = 0); c1 = code >> 8; c2 = code & 0xff; if (((c1 >= 0x21 && c1 <= 0x2c) || (c1 >= 0x4a && c1 <= 0x7d)) && (c2 >= 0x21 && c2 <= 0x7e)) { t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) : (c1 - 0x21 + 0x197)); t2 = ((t1 & 1) ? 
0x5e : 0) + (c2 - 0x21); (*outbuf)[0] = t1 >> 1; (*outbuf)[1] = (t2 < 0x4e ? t2 + 0x31 : t2 + 0x43); NEXT(1, 2) continue; } else return 1; } else return 1; (*outbuf)[0] = code >> 8; (*outbuf)[1] = code & 0xFF; NEXT(1, 2) } return 0; } #define FILL 0xfd #define NONE 0xff static const unsigned char johabidx_choseong[32] = { NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, }; static const unsigned char johabidx_jungseong[32] = { NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE, }; static const unsigned char johabidx_jongseong[32] = { NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE, }; static const unsigned char johabjamo_choseong[32] = { NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39, 0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, }; static const unsigned char johabjamo_jungseong[32] = { NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53, NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE, }; static const unsigned char johabjamo_jongseong[32] = { NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47, 0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, }; DECODER(johab) { while (inleft > 0) { unsigned char c = **inbuf, c2; RESERVE_OUTBUF(1) if (c < 0x80) { **outbuf = c; NEXT(1, 1) continue; } RESERVE_INBUF(2) c2 = (*inbuf)[1]; if (c < 0xd8) { /* johab hangul */ unsigned char c_cho, 
c_jung, c_jong; unsigned char i_cho, i_jung, i_jong; c_cho = (c >> 2) & 0x1f; c_jung = ((c << 3) | c2 >> 5) & 0x1f; c_jong = c2 & 0x1f; i_cho = johabidx_choseong[c_cho]; i_jung = johabidx_jungseong[c_jung]; i_jong = johabidx_jongseong[c_jong]; if (i_cho == NONE || i_jung == NONE || i_jong == NONE) return 2; /* we don't use U+1100 hangul jomo yet. */ if (i_cho == FILL) { if (i_jung == FILL) { if (i_jong == FILL) **outbuf = 0x3000; else **outbuf = 0x3100 | johabjamo_jongseong[c_jong]; } else { if (i_jong == FILL) **outbuf = 0x3100 | johabjamo_jungseong[c_jung]; else return 2; } } else { if (i_jung == FILL) { if (i_jong == FILL) **outbuf = 0x3100 | johabjamo_choseong[c_cho]; else return 2; } else { printf("cho %d jung %d jong %d\n", i_cho, i_jung, i_jong); **outbuf = 0xac00 + i_cho * 588 + i_jung * 28 + (i_jong == FILL ? 0 : i_jong); } } NEXT(2, 1) } else { /* KS X 1001 except hangul jamos and syllables */ if (c == 0xdf || c > 0xf9 || c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) || (c2 & 0x7f) == 0x7f || (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3))) return 2; else { unsigned char t1, t2; t1 = (c < 0xe0 ? 2 * (c - 0xd9) : 2 * c - 0x197); t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43); t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21; t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21; printf("t1 %02x t2 %02x\n", t1, t2); TRYMAP_DEC(ksx1001, **outbuf, t1, t2); else return 2; NEXT(2, 1) } } } return 0; } #undef NONE #undef FILL BEGIN_CODEC_REGISTRY(johab) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) IMPORTMAP_ENC(cp949) MAPCLOSE() END_CODEC_REGISTRY(johab) /* * ex: ts=8 sts=4 et */ |