Thread: [KoCo-CVS] [Commit] cjkcodecs/src _iso_2022_kr.c iso2022common.h
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-06-02 07:39:24
|
perky 03/06/02 00:39:23 Added: src _iso_2022_kr.c iso2022common.h Log: Add iso-2022-kr codec finally! Revision Changes Path 1.1 cjkcodecs/src/_iso_2022_kr.c Index: _iso_2022_kr.c =================================================================== /* * _iso_2022_kr.c: the ISO-2022-KR codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: _iso_2022_kr.c,v 1.1 2003/06/02 07:39:22 perky Exp $ */ #include "codeccommon.h" #include "iso2022common.h" ENCMAP(cp949) DECMAP(ksx1001) #define HAVE_ENCODER_INIT ENCODER_INIT(iso_2022_kr) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } #define HAVE_ENCODER_RESET ENCODER_RESET(iso_2022_kr) { if (STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(1) **outbuf = SI; NEXT_OUT(1) STATE_CLEARFLAG(state, F_SHIFTED) } return 0; } ENCODER(iso_2022_kr) { while (inleft > 0) { Py_UNICODE c = **inbuf; DBCHAR code; if (c < 0x80) { if (STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(2) STATE_CLEARFLAG(state, F_SHIFTED) (*outbuf)[0] = SI; (*outbuf)[1] = c; NEXT(1, 2) } else { RESERVE_OUTBUF(1) (*outbuf)[0] = c; NEXT(1, 1) } if (c == '\n') STATE_CLEARFLAG(state, F_SHIFTED) } else UCS4INVALID(c) else { if (STATE_GETG1(state) != CHARSET_KSX1001) { RESERVE_OUTBUF(4) STATE_SETG1(state, CHARSET_KSX1001) (*outbuf)[0] = ESC; (*outbuf)[1] = '$'; (*outbuf)[2] = ')'; (*outbuf)[3] = 'C'; NEXT_OUT(4) } if (!STATE_GETFLAG(state, F_SHIFTED)) { RESERVE_OUTBUF(1) STATE_SETFLAG(state, F_SHIFTED) (*outbuf)[0] = SO; NEXT_OUT(1) } TRYMAP_ENC(cp949, code, c) { if (code & 0x8000) /* MSB set: CP949 */ return 1; RESERVE_OUTBUF(1) (*outbuf)[0] = code >> 8; (*outbuf)[1] = code & 0xff; NEXT(1, 2) } else return 1; } } return 0; } #define HAVE_DECODER_INIT DECODER_INIT(iso_2022_kr) { state->i = 0; STATE_SETG0(state, CHARSET_ASCII) STATE_SETG1(state, CHARSET_ASCII) return 0; } #define HAVE_DECODER_RESET DECODER_RESET(iso_2022_kr) { STATE_CLEARFLAG(state, F_SHIFTED) return 0; } DECODER(iso_2022_kr) { while (inleft > 0) { unsigned char c = **inbuf; if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { /* ESC throughout mode: for non-iso2022 escape sequences */ RESERVE_OUTBUF(1) **outbuf = c; /* assume as ISO-8859-1 */ NEXT(1, 1) if (IS_ESCEND(c)) { STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) } continue; } switch (c) { case ESC: RESERVE_INBUF(2) if (IS_ISO2022ESC((*inbuf)[1])) { int eslen; eslen = iso2022esclen(*inbuf, inleft); if (eslen < 0) return eslen == MBERR_INTERNAL ? 1 : eslen; if (eslen == 3) { if ((*inbuf)[2] == 'B') { /* ASCII */ if ((*inbuf)[1] == '(') { STATE_SETG0(state, CHARSET_ASCII) } else if ((*inbuf)[1] == ')') { STATE_SETG1(state, CHARSET_ASCII) } else return 3; } else return 3; } else if (eslen == 4) { if ((*inbuf)[1] == '$' && (*inbuf)[3] == 'C') { /* KS X 1001 */ if ((*inbuf)[2] == '(') { STATE_SETG0(state, CHARSET_KSX1001) } else if ((*inbuf)[2] == ')') { STATE_SETG1(state, CHARSET_KSX1001) } else return 4; } else return 4; } else return eslen; NEXT_IN(eslen) } else { STATE_SETFLAG(state, F_ESCTHROUGHOUT) **outbuf = ESC; NEXT(1, 1) } break; case SI: STATE_CLEARFLAG(state, F_SHIFTED) NEXT_IN(1) break; case SO: STATE_SETFLAG(state, F_SHIFTED) NEXT_IN(1) break; case '\n': STATE_CLEARFLAG(state, F_SHIFTED) /* FALLTHROUGH */ case SP: /* FALLTHROUGH */ case DEL: RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) break; default: if ((c & 0x7f) < 0x20) { /* C0 and C1 */ RESERVE_OUTBUF(1) **outbuf = c & 0x7f; NEXT(1, 1) } else { unsigned char charset; if (!STATE_GETFLAG(state, F_SHIFTED) && c < 0x80) /* G0 */ charset = STATE_GETG0(state); else /* G1 */ charset = STATE_GETG1(state); if (charset & CHARSET_DOUBLEBYTE) { /* all double byte character sets are in KS X 1001 here */ RESERVE_OUTBUF(1) TRYMAP_DEC(ksx1001, **outbuf, c & 0x7f, (*inbuf)[1] & 0x7f){ NEXT(2, 1) } else return 2; } else { RESERVE_OUTBUF(1) **outbuf = c; NEXT(1, 1) } } } } return 0; } #include "codecentry.h" BEGIN_CODEC_REGISTRY(iso_2022_kr) MAPOPEN(ko_KR) IMPORTMAP_DEC(ksx1001) IMPORTMAP_ENC(cp949) MAPCLOSE() END_CODEC_REGISTRY(iso_2022_kr) /* * ex: ts=8 sts=4 et */ 1.1 cjkcodecs/src/iso2022common.h Index: iso2022common.h =================================================================== /* * iso2022common.h: Common Codec Routines for ISO-2022 codecs. * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: iso2022common.h,v 1.1 2003/06/02 07:39:22 perky Exp $ */ /* This ISO-2022 implementation is intended to comply ECMA-43 Level 1 * rather than RFCs itself */ #define ESC 0x1b #define SO 0x0e #define SI 0x0f #define SP 0x20 #define DEL 0x7f #define MAX_ESCSEQLEN 16 #define IS_ESCEND(c) ((c) >= 'A' && (c) <= 'Z') #define IS_ISO2022ESC(c2) ((c2) == '(' || (c2) == ')' || (c2) == '$') /* this is not a full list of ISO-2022 escape sequence headers. * but, it's enough to implement CJK instances of iso-2022. */ /* STATE 00000000 00000000 00000000 || ||^^^^^| ||^^^^^| || || | |+-----+---- G0 Character Set || || | +----------- Is G0 double byte? || |+-----+------------- G1 Character Set || +-------------------- Is G1 double byte? |+---------------------- Shifted in? +----------------------- ESC Throughout */ #define CHARSET_DOUBLEBYTE 0x80 #define CHARSET_ASCII 'B' #define CHARSET_KSX1001 ('C'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0201_R 'J' #define CHARSET_JISX0201_K 'I' #define CHARSET_JISX0208 ('B'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0208_O ('@'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0212 ('D'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0213_1 ('O'|CHARSET_DOUBLEBYTE) #define CHARSET_JISX0213_2 ('P'|CHARSET_DOUBLEBYTE) #define CHARSET_GB2312 ('A'|CHARSET_DOUBLEBYTE) #define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE) #define CHARSET_DESIGN(c) ((c) & 0x7f) #define CHARSET_ISDBCS(c) ((c) & 0x80) #define F_SHIFTED 0x010000 #define F_ESCTHROUGHOUT 0x020000 #define STATE_SETG0(s, v) ((s)->i) = (((s)->i) & ~0x0000ff) | (v); #define STATE_GETG0(s) ((s)->i & 0x0000ff) #define STATE_SETG1(s, v) ((s)->i) = (((s)->i) & ~0x00ff00) | ((v) << 8); #define STATE_GETG1(s) (((s)->i & 0x00ff00) >> 8) #define STATE_SETFLAG(s, f) ((s)->i) |= (f); #define STATE_GETFLAG(s, f) ((s)->i & (f)) #define STATE_CLEARFLAG(s, f) ((s)->i) &= ~(f); static int iso2022esclen(const unsigned char *s, size_t len) { int i; for (i = 1;i < MAX_ESCSEQLEN;i++) { if (i >= len) return MBERR_TOOFEW; if (IS_ESCEND(s[i])) return i + 1; } return MBERR_INTERNAL; /* unterminated escape sequence */ } /* * ex: ts=8 sts=4 et */ |