Thread: [KoCo-CVS] [Commit] cjkcodecs/src _cp932.c
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-05-26 07:57:53
|
perky 03/05/26 00:57:52 Added: src _cp932.c Log: Add cp932 codec. Revision Changes Path 1.1 cjkcodecs/src/_cp932.c Index: _cp932.c =================================================================== /* * _cp932.c: the CP932 codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: _cp932.c,v 1.1 2003/05/26 07:57:52 perky Exp $ */ #include "codeccommon.h" ENCMAP(jisxcommon) ENCMAP(cp932ext) DECMAP(jisx0208) DECMAP(cp932ext) ENCODER(cp932) { while (inleft > 0) { Py_UNICODE c = **inbuf; DBCHAR code; unsigned char c1, c2; if (c < 0x80) { RESERVE_OUTBUF(1) **outbuf = (unsigned char)c; NEXT(1, 1) continue; } else if (c >= 0xff61 && c <= 0xff9f) { RESERVE_OUTBUF(1) **outbuf = (unsigned char)(c - 0xfec0); NEXT(1, 1) continue; } UCS4INVALID(c) RESERVE_OUTBUF(2) TRYMAP_ENC(cp932ext, code, c) { (*outbuf)[0] = code >> 8; (*outbuf)[1] = code & 0xff; } else TRYMAP_ENC(jisxcommon, code, c) { if (code & 0x8000) /* MSB set: JIS X 0212 */ return 1; /* JIS X 0208 */ c1 = code >> 8; c2 = code & 0xff; c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); c1 = (c1 - 0x21) >> 1; (*outbuf)[0] = c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1; (*outbuf)[1] = c2 < 0x3f ? c2 + 0x40 : c2 + 0x41; } else if (c >= 0xe000 && c < 0xe758) { /* User-defined area */ c1 = (Py_UNICODE)(c - 0xe000) / 188; c2 = (Py_UNICODE)(c - 0xe000) % 188; (*outbuf)[0] = c1 + 0xf0; (*outbuf)[1] = (c2 < 0x3f ? c2 + 0x40 : c2 + 0x41); } else return 1; NEXT(1, 2) } return 0; } DECODER(cp932) { while (inleft > 0) { unsigned char c = **inbuf, c2; Py_UNICODE code; RESERVE_OUTBUF(1) if (c < 0x80) { **outbuf = c; NEXT(1, 1) continue; } RESERVE_INBUF(2) c2 = (*inbuf)[1]; TRYMAP_DEC(cp932ext, code, c, c2); else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)) { if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) return 2; c = (c < 0xe0 ? c - 0x81 : c - 0xc1); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21); c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; TRYMAP_DEC(jisx0208, code, c, c2); else return 2; } else if (c >= 0xf0 && c <= 0xf9) { if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc)) code = 0xe000 + 188 * (c - 0xf0) + (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); else return 2; } else return 2; **outbuf = code; NEXT(2, 1) } return 0; } BEGIN_CODEC_REGISTRY(cp932) MAPOPEN(ja_JP) IMPORTMAP_DEC(jisx0208) IMPORTMAP_ENCDEC(cp932ext) IMPORTMAP_ENC(jisxcommon) MAPCLOSE() END_CODEC_REGISTRY(cp932) /* * ex: ts=8 sts=4 et */ |
From: Hye-Shik C. <pe...@us...> - 2003-06-20 09:04:54
|
perky 03/06/20 02:04:53 Modified: src _cp932.c Log: - Tweaked some mapping for cp932 and cp950 to make more consistency with MS Windows. - CP932: Added single byte "UNDEFINED" characters 0x80, 0xa0, 0xfd, 0xfe, 0xff (documented on NOTES.cp932) - CP950: Changed encode mappings to another more popular for duplicated unicode points: 5341 -> A451, 5345 -> A4CA - A unittest for big5 mapping is added. - Fixed a bug that cp932 codec couldn't decode half-width katakana. Revision Changes Path 1.4 +24 -3 cjkcodecs/src/_cp932.c Index: _cp932.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_cp932.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- _cp932.c 9 Jun 2003 10:25:36 -0000 1.3 +++ _cp932.c 20 Jun 2003 09:04:52 -0000 1.4 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _cp932.c,v 1.3 2003/06/09 10:25:36 perky Exp $ + * $Id: _cp932.c,v 1.4 2003/06/20 09:04:52 perky Exp $ */ #include "codeccommon.h" @@ -43,7 +43,7 @@ DBCHAR code; unsigned char c1, c2; - if (c < 0x80) { + if (c <= 0x80) { RESERVE_OUTBUF(1) **outbuf = (unsigned char)c; NEXT(1, 1) @@ -53,6 +53,15 @@ **outbuf = (unsigned char)(c - 0xfec0); NEXT(1, 1) continue; + } else if (c >= 0xf8f0 && c <= 0xf8f3) { + /* Windows compatability */ + RESERVE_OUTBUF(1) + if (c == 0xf8f0) + **outbuf = 0xa0; + else + **outbuf = (unsigned char)(c - 0xfef1 + 0xfd); + NEXT(1, 1) + continue; } UCS4INVALID(c) @@ -93,8 +102,20 @@ unsigned char c = **inbuf, c2; RESERVE_OUTBUF(1) - if (c < 0x80) { + if (c <= 0x80) { **outbuf = c; + NEXT(1, 1) + continue; + } else if (c >= 0xa0 && c <= 0xdf) { + if (c == 0xa0) + **outbuf = 0xf8f0; /* half-width katakana */ + else + **outbuf = 0xfec0 + c; + NEXT(1, 1) + continue; + } else if (c >= 0xfd/* && c <= 0xff*/) { + /* Windows compatibility */ + **outbuf = 0xf8f1 - 0xfd + c; NEXT(1, 1) continue; } |