Thread: [KoCo-CVS] [Commit] KoreanCodecs/src koco_stream.h
Brought to you by:
perky
From: Chang <pe...@us...> - 2002-04-28 19:43:38
|
perky 02/04/28 01:02:32

Modified: src koco_stream.h
Log:
- Add StreamReader for CP949 encoding

Revision Changes Path
1.3 +145 -3 KoreanCodecs/src/koco_stream.h

Index: koco_stream.h
===================================================================
RCS file: /cvsroot/koco/KoreanCodecs/src/koco_stream.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- koco_stream.h 28 Apr 2002 06:54:12 -0000 1.2
+++ koco_stream.h 28 Apr 2002 08:02:32 -0000 1.3
@@ -4,10 +4,15 @@
  * KoreanCodecs EUC-KR StreamReader C Implementation
  *
  * Author : Hye-Shik Chang <pe...@fa...>
- * Date : $Date: 2002/04/28 06:54:12 $
+ * Date : $Date: 2002/04/28 08:02:32 $
  * Created : 28 April 2002
  *
- * $Revision: 1.2 $
+ * $Revision: 1.3 $
+ */
+
+/*
+ * TODO:
+ * __euc_kr_decode and __cp949_decode has so many big duplicated codes, now.
  */
 
 static PyObject *
@@ -103,6 +108,143 @@
     return r;
 }
 
+static PyObject *
+__cp949_decode(
+    state_t *state, char *s, int slen, int errtype,
+    PyObject* (*finalizer)(const Py_UNICODE *, int)
+) {
+    unsigned char *srccur, *srcend;
+    Py_UNICODE *destptr, *destcur, *codemap, code;
+    PyObject *r;
+
+    destcur = destptr = PyMem_New(Py_UNICODE, slen+1);
+    srccur = s;
+    srcend = s + slen;
+
+    if (HAS_STATE(*state)) {
+        unsigned char c = GET_STATE(*state);
+
+        if (c & 0x80) {
+            if (slen > 0) {
+                if (uhc_decode_hint[c]) { /* UHC page0 region */
+                    codemap = uhc_decode_map[c & 0x7F];
+
+                    if (uhc_page0_bottom <= *srccur && *srccur <= uhc_page0_top) {
+                        code = codemap[*srccur - uhc_page0_bottom];
+                        if (code == UNIFIL)
+                            goto invalid;
+                        *(destcur++) = code;
+                        srccur++;
+                    } else
+                        goto invalid_state;
+                } else if (uhc_decode_hint[*srccur]) { /* UHC page1 region */
+                    codemap = uhc_decode_map[c & 0x7F];
+                    if (!codemap)
+                        goto invalid;
+
+                    code = codemap[*srccur - uhc_page1_bottom];
+                    if (code == UNIFIL)
+                        goto invalid;
+                    *(destcur++) = code;
+                    srccur++;
+                } else { /* KSC5601 */
+                    codemap = ksc5601_decode_map[c & 0x7F];
+
+                    if (!codemap)
+                        goto invalid_state;
+                    if (ksc5601_decode_bottom <= *srccur && *srccur <= ksc5601_decode_top) {
+                        code = codemap[*srccur - ksc5601_decode_bottom];
+                        if (code == UNIFIL)
+                            goto invalid_state;
+                        *(destcur++) = code;
+                        srccur++;
+                    } else {
+invalid_state:          switch (errtype) {
+                        case error_strict:
+                            PyErr_Format(PyExc_UnicodeError,
+                                "CP949 decoding error: invalid character \\x%02x%02x",
+                                c, *srccur);
+                            r = NULL;
+                            goto out;
+                        case error_replace:
+                            *(destcur++) = UNIFIL;
+                            break;
+                        case error_ignore: break;
+                        }
+                        srccur++;
+                    }
+                }
+            } else { /* keep state */
+                r = PyUnicode_FromUnicode(NULL, 0);
+                goto out;
+            }
+        } else
+            *(destcur++) = c;
+
+        RESET_STATE(*state);
+    }
+
+    for (; srccur < srcend; srccur++) {
+        if (*srccur & 0x80) {
+            if (srccur+1 >= srcend) /* state out */
+                SET_STATE(*state, *srccur);
+            else {
+                if (uhc_decode_hint[*srccur]) { /* UHC page0 region */
+                    codemap = uhc_decode_map[*srccur & 0x7F];
+                    if (uhc_page0_bottom <= srccur[1] && srccur[1] <= uhc_page0_top) {
+                        code = codemap[srccur[1] - uhc_page0_bottom];
+                        if (code == UNIFIL)
+                            goto invalid;
+                        *(destcur++) = code;
+                        srccur++;
+                    } else
+                        goto invalid;
+                } else if (uhc_decode_hint[srccur[1]]) { /* UHC page1 region */
+                    codemap = uhc_decode_map[*srccur & 0x7F];
+                    if (!codemap)
+                        goto invalid;
+                    code = codemap[srccur[1] - uhc_page1_bottom];
+                    if (code == UNIFIL)
+                        goto invalid;
+                    *(destcur++) = code;
+                    srccur++;
+                } else {
+                    codemap = ksc5601_decode_map[*srccur & 0x7F];
+                    if (!codemap)
+                        goto invalid;
+                    if (ksc5601_decode_bottom <= srccur[1] && srccur[1] <= ksc5601_decode_top) {
+                        code = codemap[srccur[1] - ksc5601_decode_bottom];
+                        if (code == UNIFIL)
+                            goto invalid;
+                        *(destcur++) = code;
+                        srccur++;
+                    } else {
+invalid:                switch (errtype) {
+                        case error_strict:
+                            PyErr_Format(PyExc_UnicodeError,
+                                "CP949 decoding error: invalid character \\x%02x%02x",
+                                srccur[0], srccur[1]);
+                            r = NULL;
+                            goto out;
+                        case error_replace:
+                            *(destcur++) = UNIFIL;
+                            break;
+                        case error_ignore: break;
+                        }
+                        srccur++;
+                    }
+                }
+            }
+        } else
+            *(destcur++) = *srccur;
+    }
+
+    r = finalizer(destptr, destcur-destptr);
+out:
+    PyMem_Del(destptr);
+    return r;
+}
+
 PyObject*
 readline_finalizer(const Py_UNICODE *data, int datalen)
 {
@@ -166,7 +308,7 @@
     if (!strcmp(encoding, "euc-kr"))
         stnfo->decoder = __euc_kr_decode;
     else if (!strcmp(encoding, "cp949"))
-        stnfo->decoder = __euc_kr_decode;
+        stnfo->decoder = __cp949_decode;
     else {
         PyMem_Del(stnfo);
         PyErr_Format(PyExc_UnicodeError,
|
From: Hye-Shik C. <pe...@us...> - 2003-01-02 07:44:41
|
perky 03/01/01 23:44:40

Modified: src koco_stream.h
Log:
StreamReader_methods is static, too.

Revision Changes Path
1.12 +4 -4 KoreanCodecs/src/koco_stream.h

Index: koco_stream.h
===================================================================
RCS file: /cvsroot/koco/KoreanCodecs/src/koco_stream.h,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- koco_stream.h 2 Jan 2003 07:41:23 -0000 1.11
+++ koco_stream.h 2 Jan 2003 07:44:40 -0000 1.12
@@ -1,10 +1,10 @@
 /*
- * euckr_stream.c - $Revision: 1.11 $
+ * euckr_stream.c - $Revision: 1.12 $
  *
  * KoreanCodecs EUC-KR StreamReader C Implementation
  *
  * Author : Hye-Shik Chang <pe...@Fr...>
- * Date : $Date: 2003/01/02 07:41:23 $
+ * Date : $Date: 2003/01/02 07:44:40 $
  * Created : 28 April 2002
  *
  * This file is part of KoreanCodecs.
@@ -593,7 +593,7 @@
     return Py_None;
 }
 
-struct PyMethodDef StreamReader_methods[] = {
+static struct PyMethodDef StreamReader_methods[] = {
     {"__init__", (PyCFunction) StreamReader___init__,
         METH_VARARGS | METH_KEYWORDS, StreamReader___init____doc__},
     {"read", (PyCFunction) StreamReader_read,
@@ -608,6 +608,6 @@
 };
 
 /*
- * $Id: koco_stream.h,v 1.11 2003/01/02 07:41:23 perky Exp $
+ * $Id: koco_stream.h,v 1.12 2003/01/02 07:44:40 perky Exp $
  * ex: ts=8 sts=4 et
  */
|