Thread: [KoCo-CVS] [Commit] cjkcodecs/src _cp949.c _euc_kr.c multibytecodec.c
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-05-19 10:38:08
|
perky 03/05/19 03:38:08 Modified: src _euc_kr.c multibytecodec.c Added: src _cp949.c Log: Add decoder implementation and cp949 codec. Revision Changes Path 1.4 +32 -3 cjkcodecs/src/_euc_kr.c Index: _euc_kr.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_euc_kr.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- _euc_kr.c 19 May 2003 08:12:23 -0000 1.3 +++ _euc_kr.c 19 May 2003 10:38:08 -0000 1.4 @@ -26,14 +26,13 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _euc_kr.c,v 1.3 2003/05/19 08:12:23 perky Exp $ + * $Id: _euc_kr.c,v 1.4 2003/05/19 10:38:08 perky Exp $ */ #include "codeccommon.h" ENCMAP(cp949) DECMAP(ksx1001) -DECMAP(cp949ext) ENCODER(euc_kr) { @@ -72,6 +71,37 @@ DECODER(euc_kr) { + while (inleft > 0) { + const struct dbcs_index *map; + unsigned char c = **inbuf, c2; + Py_UNICODE code; + + if (outleft < 1) + return MBERR_TOOSMALL; + + if (c < 0x80) { + **outbuf = c; + (*inbuf)++; inleft--; + (*outbuf)++; outleft--; + continue; + } + + if (inleft < 2) + return MBERR_TOOFEW; + if ((c2 = (*inbuf)[1]) < 0x80) + return 2; + else + c2 &= 0x7f; + map = &ksx1001decmap[c & 0x7f]; + if (map->map == NULL || c2 < map->bottom || c2 > map->top || + (code = map->map[c2 - map->bottom]) == UNIINV) + return 1; + + **outbuf = code; + (*outbuf)++; outleft--; + (*inbuf) += 2; inleft -= 2; + } + return 0; } @@ -89,7 +119,6 @@ /* Import mapdata */ MAPOPEN(mod, "ko_KR") if (IMPORTMAP(mod, ksx1001, NULL, &ksx1001decmap) || - IMPORTMAP(mod, cp949ext, NULL, &cp949extdecmap) || IMPORTMAP(mod, cp949, &cp949encmap, NULL)) goto errorexit; MAPCLOSE(mod) 1.5 +250 -20 cjkcodecs/src/multibytecodec.c Index: multibytecodec.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- multibytecodec.c 19 May 2003 06:32:17 -0000 1.4 +++ multibytecodec.c 19 May 2003 10:38:08 -0000 1.5 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: multibytecodec.c,v 1.4 2003/05/19 06:32:17 perky Exp $ + * $Id: multibytecodec.c,v 1.5 2003/05/19 10:38:08 perky Exp $ */ #include "Python.h" @@ -38,6 +38,12 @@ PyObject *excobj, *outobj; } MultibyteEncodeBuffer; +typedef struct { + const unsigned char *inbuf, *inbuf_top, *inbuf_end; + Py_UNICODE *outbuf, *outbuf_end; + PyObject *excobj, *outobj; +} MultibyteDecodeBuffer; + PyDoc_STRVAR(MultibyteCodec_Encode__doc__, "I.encode(unicode, [,errors]) -> (string, length consumed)\n\ \n\ @@ -47,7 +53,16 @@ 'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\ registered with codecs.register_error that can handle UnicodeEncodeErrors."); -static char *kwarglist[] = {"input", "errors", NULL}; +PyDoc_STRVAR(MultibyteCodec_Decode__doc__, +"I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\ +\n\ +Decodes `string' using I, an MultibyteCodec instance. errors may be given\n\ +to set a different error handling scheme. Default is 'strict' meaning\n\ +that encoding errors raise a UnicodeDecodeError. Other possible values\n\ +are 'ignore' and 'replace' as well as any other name registerd with\n\ +codecs.register_error that is able to handle UnicodeDecodeErrors."); + +static char *codeckwarglist[] = {"input", "errors", NULL}; static PyObject *multibytecodec_encode(PyMultibyteCodec *, PyMultibyteCodec_State *, const Py_UNICODE *, int, PyObject *); @@ -113,15 +128,39 @@ goto errorexit; \ } +static int +expand_decodebuffer(MultibyteDecodeBuffer *buf, int esize) +{ + int orgpos, orgsize; + + orgpos = (int)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj)); + orgsize = PyUnicode_GET_SIZE(buf->outobj); + if (PyUnicode_Resize(&buf->outobj, orgsize + ( + esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1) + return -1; + + buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos; + buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj) + + PyUnicode_GET_SIZE(buf->outobj); + + return 0; +} +#define RESERVE_DECODEBUFFER(buf, s) { \ + if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \ + if (expand_decodebuffer(buf, s) == -1) \ + goto errorexit; \ +} + static int -multibytecodec_error(PyMultibyteCodec *codec, - PyMultibyteCodec_State *state, - MultibyteEncodeBuffer *buf, - PyObject *errors, int e) +multibytecodec_encerror(PyMultibyteCodec *codec, + PyMultibyteCodec_State *state, + MultibyteEncodeBuffer *buf, + PyObject *errors, int e) { PyObject *retobj = NULL, *retstr = NULL, *argsobj, *tobj; const char *reason; - int retstrsize, newpos, start, end, esize; + size_t esize; + int retstrsize, newpos, start, end; if (e == MBERR_TOOSMALL) { RESERVE_ENCODEBUFFER(buf, -1); @@ -133,7 +172,7 @@ switch (e) { case MBERR_TOOFEW: reason = "incomplete multibyte sequence"; - esize = (int)(buf->inbuf_end - buf->inbuf); + esize = (size_t)(buf->inbuf_end - buf->inbuf); break; case MBERR_INTERNAL: PyErr_SetString(PyExc_RuntimeError, "internal codec error"); @@ -180,14 +219,11 @@ start, end, reason); if (buf->excobj == NULL) goto errorexit; - } else { - if (PyUnicodeEncodeError_SetStart(buf->excobj, start) != 0) - goto errorexit; - if (PyUnicodeEncodeError_SetEnd(buf->excobj, end) != 0) - goto errorexit; - if (PyUnicodeEncodeError_SetReason(buf->excobj, reason) != 0) + } else + if (PyUnicodeEncodeError_SetStart(buf->excobj, start) != 0 || + PyUnicodeEncodeError_SetEnd(buf->excobj, end) != 0 || + PyUnicodeEncodeError_SetReason(buf->excobj, reason) != 0) goto errorexit; - } if (errors == ERROR_STRICT) { PyCodec_StrictErrors(buf->excobj); @@ -245,6 +281,112 @@ } static int +multibytecodec_decerror(PyMultibyteCodec *codec, + PyMultibyteCodec_State *state, + MultibyteDecodeBuffer *buf, + PyObject *errors, int e) +{ + PyObject *argsobj, *retobj = NULL, *retuni = NULL; + const char *reason; + size_t esize; + int start, end, retunisize, newpos; + + if (e == MBERR_TOOSMALL) { + RESERVE_DECODEBUFFER(buf, -1); + return 0; /* retry it */ + } else if (e > 0) { + reason = "illegal multibyte sequence"; + esize = e; + } else { + switch (e) { + case MBERR_TOOFEW: + reason = "incomplete multibyte sequence"; + esize = (size_t)(buf->inbuf_end - buf->inbuf); + break; + case MBERR_INTERNAL: + PyErr_SetString(PyExc_RuntimeError, "internal codec error"); + return -1; + default: + PyErr_SetString(PyExc_RuntimeError, "unknown runtime error"); + return -1; + } + } + + if (errors == ERROR_REPLACE) { + RESERVE_DECODEBUFFER(buf, 1); + *buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER; + } + if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { + buf->inbuf += esize; + return 0; + } + + start = (int)(buf->inbuf - buf->inbuf_top); + end = start + esize; + + /* use cached exception object if available */ + if (buf->excobj == NULL) { + buf->excobj = PyUnicodeDecodeError_Create(codec->encoding, + buf->inbuf_top, (int)(buf->inbuf_end - buf->inbuf_top), + start, end, reason); + if (buf->excobj == NULL) + goto errorexit; + } else + if (PyUnicodeDecodeError_SetStart(buf->excobj, start) || + PyUnicodeDecodeError_SetEnd(buf->excobj, end) || + PyUnicodeDecodeError_SetReason(buf->excobj, reason)) + goto errorexit; + + if (errors == ERROR_STRICT) { + PyCodec_StrictErrors(buf->excobj); + goto errorexit; + } + + argsobj = PyTuple_New(1); + if (argsobj == NULL) + goto errorexit; + + PyTuple_SET_ITEM(argsobj, 0, buf->excobj); + Py_INCREF(buf->excobj); + retobj = PyObject_CallObject(errors, argsobj); + Py_DECREF(argsobj); + if (retobj == NULL) + goto errorexit; + + if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || + !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || + !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) { + PyErr_SetString(PyExc_ValueError, + "decoding error handler must return (unicode, int) tuple"); + goto errorexit; + } + + retunisize = PyUnicode_GET_SIZE(retuni); + if (retunisize > 0) { + RESERVE_DECODEBUFFER(buf, retunisize); + memcpy((char *)buf->outbuf, PyUnicode_AS_DATA(retuni), + retunisize * Py_UNICODE_SIZE); + buf->outbuf += retunisize; + } + + newpos = (int)PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1)); + if (newpos < 0) + newpos += (int)(buf->inbuf_end - buf->inbuf_top); + if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { + PyErr_Format(PyExc_IndexError, + "position %d from error handler out of bounds", newpos); + goto errorexit; + } + buf->inbuf = buf->inbuf_top + newpos; + Py_DECREF(retobj); + return 0; + +errorexit: + Py_XDECREF(retobj); + return -1; +} + +static int multibytecodec_iencode(PyMultibyteCodec *codec, PyMultibyteCodec_State *state, MultibyteEncodeBuffer *buf, @@ -257,13 +399,12 @@ /* we don't reuse inleft and outleft here. * error callbacks can relocate the cursor anywhere on buffer */ inleft = (size_t)(buf->inbuf_end - buf->inbuf); + if (inleft == 0) return 0; outleft = (size_t)(buf->outbuf_end - buf->outbuf); - r = codec->encode(state, &buf->inbuf, inleft, - &buf->outbuf, outleft); - + r = codec->encode(state, &buf->inbuf, inleft, &buf->outbuf, outleft); if (r == 0) return 0; - else if (multibytecodec_error(codec, state, buf, errors, r)) + else if (multibytecodec_encerror(codec, state, buf, errors, r)) return -1; else if (buf->inbuf >= buf->inbuf_end) return 0; @@ -303,6 +444,7 @@ if (multibytecodec_iencode(codec, state, &buf, errors) == -1) goto errorexit; + /* XXX: FLUSH IT! */ finalsize = (int)((char*)buf.outbuf - PyString_AS_STRING(buf.outobj)); @@ -330,7 +472,7 @@ int datalen; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|z:encode", - kwarglist, &data, &datalen, &errors)) + codeckwarglist, &data, &datalen, &errors)) return NULL; errorcb = get_errorcallback(errors); @@ -354,10 +496,98 @@ return NULL; } +static int +multibytecodec_idecode(PyMultibyteCodec *codec, + PyMultibyteCodec_State *state, + MultibyteDecodeBuffer *buf, + PyObject *errors) +{ + for (;;) { + size_t inleft, outleft; + int r; + + inleft = (size_t)(buf->inbuf_end - buf->inbuf); + if (inleft == 0) return 0; + outleft = (size_t)(buf->outbuf_end - buf->outbuf); + + r = codec->decode(state, &buf->inbuf, inleft, &buf->outbuf, outleft); + if (r == 0) + return 0; + else if (multibytecodec_decerror(codec, state, buf, errors, r)) + return -1; + else if (buf->inbuf >= buf->inbuf_end) + return 0; + } + + return 0; +} + +static PyObject * +MultibyteCodec_Decode(PyMultibyteCodecObject *self, + PyObject *args, PyObject *kwargs) +{ + PyMultibyteCodec_State state; + MultibyteDecodeBuffer buf; + PyObject *errorcb; + const char *data, *errors = NULL; + int datalen, finalsize, r; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|z:decode", + codeckwarglist, &data, &datalen, &errors)) + return NULL; + + errorcb = get_errorcallback(errors); + if (errorcb == NULL) + return NULL; + + if (datalen == 0) { + if (errorcb > ERROR_MAX) + {Py_DECREF(errorcb);} + return PyUnicode_FromUnicode(NULL, 0); + } + + buf.outobj = buf.excobj = NULL; + buf.inbuf = buf.inbuf_top = (unsigned char *)data; + buf.inbuf_end = buf.inbuf_top + datalen; + buf.outobj = PyUnicode_FromUnicode(NULL, datalen); + if (buf.outobj == NULL) + goto errorexit; + buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); + buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj); + + state.p = NULL; + r = multibytecodec_idecode(self->codec, &state, &buf, errorcb); + /* XXX: FLUSH IT! */ + if (r != 0) + goto errorexit; + + finalsize = (int)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj)); + + if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) + if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) + goto errorexit; + + Py_XDECREF(buf.excobj); + if (errorcb > ERROR_MAX) + {Py_DECREF(errorcb);} + return make_tuple(buf.outobj, datalen); + +errorexit: + if (errorcb > ERROR_MAX) + {Py_DECREF(errorcb);} + Py_XDECREF(buf.excobj); + Py_XDECREF(buf.outobj); + + return NULL; +} + static struct PyMethodDef multibytecodec_methods[] = { {"encode", (PyCFunction)MultibyteCodec_Encode, METH_VARARGS | METH_KEYWORDS, MultibyteCodec_Encode__doc__}, + {"decode", (PyCFunction)MultibyteCodec_Decode, + METH_VARARGS | METH_KEYWORDS, + MultibyteCodec_Decode__doc__}, {NULL, NULL}, }; 1.1 cjkcodecs/src/_cp949.c Index: _cp949.c =================================================================== /* * _cp949.c: the CP949 codec * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $Id: _cp949.c,v 1.1 2003/05/19 10:38:08 perky Exp $ */ #include "codeccommon.h" ENCMAP(cp949) DECMAP(ksx1001) DECMAP(cp949ext) ENCODER(cp949) { while (inleft > 0) { const struct unim_index *map; Py_UNICODE c = **inbuf, clow; DBCHAR code; if (c < 0x80) { if (outleft < 1) return MBERR_TOOSMALL; **outbuf = c; (*inbuf)++; inleft--; (*outbuf)++; outleft--; continue; } if (outleft < 2) return MBERR_TOOSMALL; map = &cp949encmap[c >> 8]; clow = c & 0xff; if (map->map == NULL || clow < map->bottom || clow > map->top || (code = map->map[clow - map->bottom]) == UNIINV) return 1; (*outbuf)[0] = (code >> 8) | 0x80; if (code & 0x8000) (*outbuf)[1] = (code & 0xFF); /* MSB set: CP949 */ else (*outbuf)[1] = (code & 0xFF) | 0x80; /* MSB unset: ks x 1001 */ (*outbuf) += 2; outleft -= 2; (*inbuf)++; inleft--; } return 0; } DECODER(cp949) { while (inleft > 0) { const struct dbcs_index *map; unsigned char c = **inbuf, c2; Py_UNICODE code; if (outleft < 1) return MBERR_TOOSMALL; if (c < 0x80) { **outbuf = c; (*inbuf)++; inleft--; (*outbuf)++; outleft--; continue; } if (inleft < 2) return MBERR_TOOFEW; if ((*inbuf)[1] < 0x80) goto cp949dec; c2 = (*inbuf)[1] & 0x7f; map = &ksx1001decmap[c & 0x7f]; if (map->map == NULL || c2 < map->bottom || c2 > map->top || (code = map->map[c2 - map->bottom]) == UNIINV) { cp949dec: c2 = (*inbuf)[1]; map = &cp949extdecmap[c]; if (map->map == NULL || c2 < map->bottom || c2 > map->top || (code = map->map[c2 - map->bottom]) == UNIINV) return 2; } **outbuf = code; (*outbuf)++; outleft--; (*inbuf) += 2; inleft -= 2; } return 0; } CODECDEF(cp949) NOMETHODS(__methods) void init_cp949(void) { PyObject *codec; PyObject *m = NULL, *mod = NULL, *o = NULL; m = Py_InitModule("_cp949", __methods); /* Import mapdata */ MAPOPEN(mod, "ko_KR") if (IMPORTMAP(mod, ksx1001, NULL, &ksx1001decmap) || IMPORTMAP(mod, cp949ext, NULL, &cp949extdecmap) || IMPORTMAP(mod, cp949, &cp949encmap, NULL)) goto errorexit; MAPCLOSE(mod) /* Create Codec Instances */ MULTIBYTECODEC_OPEN(mod, o) REGISTERCODEC(m, o, codec) MULTIBYTECODEC_CLOSE(mod, o) if (PyErr_Occurred()) Py_FatalError("can't initialize the _cp949 module"); return; errorexit: Py_XDECREF(m); Py_XDECREF(mod); Py_XDECREF(o); } /* * ex: ts=8 sts=4 et */ |