[KoCo-CVS] [Commit] cjkcodecs/src _utf_8.c codeccommon.h multibytecodec.c multibytecodec.h
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-07-05 19:49:04
|
perky 03/07/05 12:49:03 Modified: src _utf_8.c codeccommon.h multibytecodec.c multibytecodec.h Log: StreamWriter is now able to buffer incomplete sequences. (This feature is used for surrogate pairs and for mapping a Unicode character together with a following modifier.) Revision Changes Path 1.9 +6 -24 cjkcodecs/src/_utf_8.c Index: _utf_8.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- _utf_8.c 1 Jul 2003 20:45:27 -0000 1.8 +++ _utf_8.c 5 Jul 2003 19:49:02 -0000 1.9 @@ -26,32 +26,11 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: _utf_8.c,v 1.8 2003/07/01 20:45:27 perky Exp $ + * $Id: _utf_8.c,v 1.9 2003/07/05 19:49:02 perky Exp $ */ #include "codeccommon.h" -#if Py_UNICODE_SIZE == 2 -#define HAVE_ENCODER_RESET -ENCODER_RESET(utf_8) -{ - assert(inleft == 0 || inleft == 1); - - if (inleft) { /* all pending characters are "high surrogate" */ - ucs4_t c = **inbuf; - - RESERVE_OUTBUF(3) - (*outbuf)[2] = 0x80 | ((c) & 0x3f); - (c) = (c) >> 6; (c) |= 0x800; - (*outbuf)[1] = 0x80 | ((c) & 0x3f); - (c) = (c) >> 6; (c) |= 0xc0; - (*outbuf)[0] = (c); - NEXT(1, 3) - } - return 0; -} -#endif /* Py_UNICODE_SIZE == 2 */ - ENCODER(utf_8) { while (inleft > 0) { @@ -63,8 +42,11 @@ else { #if Py_UNICODE_SIZE == 2 if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ - RESERVE_INBUF(2) - if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { /* low surrogate */ + if (inleft < 2) { + if (!(flags & MBENC_FLUSH)) + return MBERR_TOOFEW; + } else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { + /* low surrogate */ c = 0x10000 + ((c - 0xd800) << 10) + ((ucs4_t)((*inbuf)[1]) - 0xdc00); insize = 2; 1.15 +2 -3 cjkcodecs/src/codeccommon.h Index: codeccommon.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v retrieving revision 1.14 retrieving 
revision 1.15 diff -u -r1.14 -r1.15 --- codeccommon.h 1 Jul 2003 19:33:43 -0000 1.14 +++ codeccommon.h 5 Jul 2003 19:49:02 -0000 1.15 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * $Id: codeccommon.h,v 1.14 2003/07/01 19:33:43 perky Exp $ + * $Id: codeccommon.h,v 1.15 2003/07/05 19:49:02 perky Exp $ */ #include "Python.h" @@ -46,11 +46,10 @@ static int encoding##_encode( \ MultibyteCodec_State *state, \ const Py_UNICODE **inbuf, size_t inleft, \ - unsigned char **outbuf, size_t outleft) + unsigned char **outbuf, size_t outleft, int flags) #define ENCODER_RESET(encoding) \ static int encoding##_encode_reset( \ MultibyteCodec_State *state, \ - const Py_UNICODE **inbuf, size_t inleft, \ unsigned char **outbuf, size_t outleft) #define DECODER_INIT(encoding) \ 1.20 +81 -66 cjkcodecs/src/multibytecodec.c Index: multibytecodec.c =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v retrieving revision 1.19 retrieving revision 1.20 diff -u -r1.19 -r1.20 --- multibytecodec.c 1 Jul 2003 20:45:27 -0000 1.19 +++ multibytecodec.c 5 Jul 2003 19:49:02 -0000 1.20 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.c,v 1.19 2003/07/01 20:45:27 perky Exp $ + * $Id: multibytecodec.c,v 1.20 2003/07/05 19:49:02 perky Exp $ */ #include "Python.h" @@ -73,13 +73,15 @@ static char *streamkwarglist[] = {"stream", "errors", NULL}; static PyObject *multibytecodec_encode(MultibyteCodec *, - MultibyteCodec_State *, const Py_UNICODE *, int, PyObject *, - int flush); + MultibyteCodec_State *, const Py_UNICODE **, size_t, + PyObject *, int); static PyObject *mbstreamreader_create(MultibyteCodec *, PyObject *, const char *); static PyObject *mbstreamwriter_create(MultibyteCodec *, PyObject *, const char *); +#define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ + static PyObject * make_tuple(PyObject *unicode, int len) { @@ -214,7 +216,7 @@ size_t outleft; outleft = (size_t)(buf->outbuf_end - buf->outbuf); - r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft); + r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft, 0); if (r == MBERR_TOOSMALL) { RESERVE_ENCODEBUFFER(buf, -1); continue; @@ -291,10 +293,14 @@ goto errorexit; } - retstr = multibytecodec_encode(codec, state, PyUnicode_AS_UNICODE(tobj), - PyUnicode_GET_SIZE(tobj), ERROR_STRICT, 0); - if (retstr == NULL) - goto errorexit; + { + const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj); + + retstr = multibytecodec_encode(codec, state, &uraw, + PyUnicode_GET_SIZE(tobj), ERROR_STRICT, MBENC_FLUSH); + if (retstr == NULL) + goto errorexit; + } retstrsize = PyString_GET_SIZE(retstr); RESERVE_ENCODEBUFFER(buf, retstrsize); @@ -448,8 +454,8 @@ static PyObject * multibytecodec_encode(MultibyteCodec *codec, MultibyteCodec_State *state, - const Py_UNICODE *data, int datalen, - PyObject *errors, int flush) + const Py_UNICODE **data, size_t datalen, + PyObject *errors, int flags) { MultibyteEncodeBuffer buf; int finalsize, r = 0; @@ -458,7 +464,7 @@ return PyString_FromString(""); buf.excobj = NULL; - buf.inbuf = buf.inbuf_top = data; + buf.inbuf = buf.inbuf_top = *data; buf.inbuf_end = 
buf.inbuf_top + datalen; buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16); if (buf.outobj == NULL) @@ -473,33 +479,28 @@ * error callbacks can relocate the cursor anywhere on buffer */ inleft = (size_t)(buf.inbuf_end - buf.inbuf); outleft = (size_t)(buf.outbuf_end - buf.outbuf); - r = codec->encode(state, &buf.inbuf, inleft, &buf.outbuf, outleft); - if (r == 0 || r == MBERR_TOOFEW) + r = codec->encode(state, &buf.inbuf, inleft, + &buf.outbuf, outleft, flags); + *data = buf.inbuf; + if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH))) break; else if (multibytecodec_encerror(codec, state, &buf, errors, r)) goto errorexit; + else if (r == MBERR_TOOFEW) + break; } - if (flush) { - if (codec->encreset == NULL) { - if (r == MBERR_TOOFEW) { - if (multibytecodec_encerror(codec, state, &buf, errors, r)) - goto errorexit; - } - } else for (;;) { - size_t inleft, outleft; + if (codec->encreset != NULL) + for (;;) { + size_t outleft; - /* inleft can be non-zero value when r == MBERR_TOOFEW */ - inleft = (size_t)(buf.inbuf_end - buf.inbuf); outleft = (size_t)(buf.outbuf_end - buf.outbuf); - r = codec->encreset(state, &buf.inbuf, inleft, - &buf.outbuf, outleft); + r = codec->encreset(state, &buf.outbuf, outleft); if (r == 0) break; else if (multibytecodec_encerror(codec, state, &buf, errors, r)) goto errorexit; } - } finalsize = (int)((char*)buf.outbuf - PyString_AS_STRING(buf.outobj)); @@ -536,7 +537,8 @@ if (self->codec->encinit != NULL && self->codec->encinit(&state) != 0) goto errorexit; - r = multibytecodec_encode(self->codec, &state, data, datalen, errorcb, 1); + r = multibytecodec_encode(self->codec, &state, (const Py_UNICODE **)&data, + datalen, errorcb, MBENC_FLUSH | MBENC_RESET); if (r == NULL) goto errorexit; @@ -977,6 +979,7 @@ PyObject *unistr) { PyObject *wr, *r = NULL; + Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; int rsize; if (!PyUnicode_Check(unistr)) { @@ -989,20 +992,49 @@ if (rsize == 0) return 0; + if (self->pendingsize > 0) { 
+ inbuf_tmp = PyMem_New(Py_UNICODE, rsize + self->pendingsize); + if (inbuf_tmp == NULL) + goto errorexit; + memcpy(inbuf_tmp, self->pending, Py_UNICODE_SIZE * self->pendingsize); + memcpy(inbuf_tmp + self->pendingsize, PyUnicode_AS_UNICODE(unistr), + Py_UNICODE_SIZE * rsize); + rsize += self->pendingsize; + self->pendingsize = 0; + inbuf = inbuf_tmp; + } else + inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); + + inbuf_end = inbuf + rsize; + r = multibytecodec_encode(self->codec, &self->state, - (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr), rsize, self->errors, 0); + (const Py_UNICODE **)&inbuf, rsize, self->errors, 0); if (r == NULL) goto errorexit; + if (inbuf < inbuf_end) { + self->pendingsize = (int)(inbuf_end - inbuf); + if (self->pendingsize > MAXENCPENDING) { + self->pendingsize = 0; + PyErr_SetString(PyExc_RuntimeError, "pending buffer overflow"); + goto errorexit; + } + memcpy(self->pending, inbuf, self->pendingsize * Py_UNICODE_SIZE); + } + wr = PyObject_CallMethod(self->stream, "write", "O", r); if (wr == NULL) goto errorexit; + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); Py_DECREF(r); Py_DECREF(wr); return 0; errorexit: + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); Py_XDECREF(r); return -1; } @@ -1056,48 +1088,30 @@ static PyObject * mbstreamwriter_reset(MultibyteStreamWriterObject *self) { - if (self->codec->encreset != NULL) { - PyObject *rsbuf = NULL; - size_t rsbufsiz, rsbufnc; - int r; - unsigned char *rsbuf_top, *rsbuf_cur; - - rsbufnc = 0; - for (rsbufsiz = 0;;rsbufsiz *= 2) { - if (rsbuf == NULL) { - rsbuf = PyString_FromStringAndSize(NULL, rsbufsiz); - if (rsbuf == NULL) - return NULL; - } else { - if (_PyString_Resize(&rsbuf, rsbufsiz)) - goto errorexit; - } - rsbuf_top = (unsigned char *)PyString_AS_STRING(rsbuf); - rsbuf_cur = rsbuf_top + rsbufnc; - - r = self->codec->encreset(&self->state, - NULL, 0, &rsbuf_cur, rsbufsiz - rsbufnc); - rsbufnc = (size_t)(rsbuf_cur - rsbuf_top); - if (r == MBERR_TOOSMALL) - continue; - else { - if (r 
!= 0) - goto errorexit; - else - break; - } - } + const Py_UNICODE *pending; + PyObject *pwrt; - if (_PyString_Resize(&rsbuf, rsbufnc)) { -errorexit: Py_DECREF(rsbuf); - return NULL; - } + pending = self->pending; + pwrt = multibytecodec_encode(self->codec, &self->state, + &pending, self->pendingsize, self->errors, + MBENC_FLUSH | MBENC_RESET); + /* some pending buffer can be truncated when UnicodeEncodeError is + * raised on 'strict' mode. but, 'reset' method is designed to + * reset the pending buffer or states so failed string sequence + * ought to be missed */ + self->pendingsize = 0; + if (pwrt == NULL) + return NULL; - r = mbstreamwriter_iwrite(self, rsbuf); - Py_DECREF(rsbuf); - if (r == -1) + if (PyString_Size(pwrt) > 0) { + PyObject *wr; + wr = PyObject_CallMethod(self->stream, "write", "O", pwrt); + if (wr == NULL) { + Py_DECREF(pwrt); return NULL; + } } + Py_DECREF(pwrt); Py_INCREF(Py_None); return Py_None; @@ -1232,6 +1246,7 @@ self->codec = codec; self->stream = stream; Py_INCREF(stream); + self->pendingsize = 0; self->errors = get_errorcallback(errors); if (self->errors == NULL) goto errorexit; 1.9 +8 -3 cjkcodecs/src/multibytecodec.h Index: multibytecodec.h =================================================================== RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.h,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- multibytecodec.h 1 Jul 2003 19:33:43 -0000 1.8 +++ multibytecodec.h 5 Jul 2003 19:49:02 -0000 1.9 @@ -26,7 +26,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $Id: multibytecodec.h,v 1.8 2003/07/01 19:33:43 perky Exp $ + * $Id: multibytecodec.h,v 1.9 2003/07/05 19:49:02 perky Exp $ */ #ifndef _PYTHON_MULTIBYTECODEC_H_ @@ -35,17 +35,19 @@ extern "C" { #endif +#define MAXCHARSTATE 8 typedef union { unsigned long i; void *p; + unsigned char c[MAXCHARSTATE]; } MultibyteCodec_State; typedef int (*mbencode_func)(MultibyteCodec_State *state, const Py_UNICODE **inbuf, size_t inleft, - unsigned char **outbuf, size_t outleft); + unsigned char **outbuf, size_t outleft, + int flags); typedef int (*mbencodeinit_func)(MultibyteCodec_State *state); typedef int (*mbencodereset_func)(MultibyteCodec_State *state, - const Py_UNICODE **inbuf, size_t inleft, unsigned char **outbuf, size_t outleft); typedef int (*mbdecode_func)(MultibyteCodec_State *state, const unsigned char **inbuf, size_t inleft, @@ -97,6 +99,9 @@ #define ERROR_IGNORE (PyObject *)(2) #define ERROR_REPLACE (PyObject *)(3) #define ERROR_MAX ERROR_REPLACE + +#define MBENC_FLUSH 0x0001 /* encode all characters encodable */ +#define MBENC_MAX MBENC_FLUSH #ifdef __cplusplus } |