[KoCo-CVS] [Commit] cjkcodecs/src _utf_8.c codeccommon.h multibytecodec.c multibytecodec.h

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

perky       03/07/05 12:49:03

  Modified:    src      _utf_8.c codeccommon.h multibytecodec.c
                        multibytecodec.h
  Log:
  StreamWriter is now able to buffer incomplete sequences.
  (This feature is used for surrogate pairs and for mapping from a Unicode
   character with a following modifier.)
  
  Revision  Changes    Path
  1.9       +6 -24     cjkcodecs/src/_utf_8.c
  
  Index: _utf_8.c
  ===================================================================
  RCS file: /cvsroot/koco/cjkcodecs/src/_utf_8.c,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- _utf_8.c	1 Jul 2003 20:45:27 -0000	1.8
  +++ _utf_8.c	5 Jul 2003 19:49:02 -0000	1.9
  @@ -26,32 +26,11 @@
    * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    * POSSIBILITY OF SUCH DAMAGE.
    *
  - * $Id: _utf_8.c,v 1.8 2003/07/01 20:45:27 perky Exp $
  + * $Id: _utf_8.c,v 1.9 2003/07/05 19:49:02 perky Exp $
    */
   
   #include "codeccommon.h"
   
  -#if Py_UNICODE_SIZE == 2
  -#define HAVE_ENCODER_RESET
  -ENCODER_RESET(utf_8)
  -{
  -    assert(inleft == 0 || inleft == 1);
  -
  -    if (inleft) { /* all pending characters are "high surrogate" */
  -        ucs4_t   c = **inbuf;
  -
  -        RESERVE_OUTBUF(3)
  -        (*outbuf)[2] = 0x80 | ((c) & 0x3f);
  -        (c) = (c) >> 6; (c) |= 0x800;
  -        (*outbuf)[1] = 0x80 | ((c) & 0x3f);
  -        (c) = (c) >> 6; (c) |= 0xc0;
  -        (*outbuf)[0] = (c);
  -        NEXT(1, 3)
  -    }
  -    return 0;
  -}
  -#endif /* Py_UNICODE_SIZE == 2 */
  -
   ENCODER(utf_8)
   {
       while (inleft > 0) {
  @@ -63,8 +42,11 @@
           else {
   #if Py_UNICODE_SIZE == 2
               if (c >> 10 == 0xd800 >> 10) { /* high surrogate */
  -                RESERVE_INBUF(2)
  -                if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { /* low surrogate */
  +                if (inleft < 2) {
  +                    if (!(flags & MBENC_FLUSH))
  +                        return MBERR_TOOFEW;
  +                } else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) {
  +                    /* low surrogate */
                       c = 0x10000 + ((c - 0xd800) << 10) +
                                     ((ucs4_t)((*inbuf)[1]) - 0xdc00);
                       insize = 2;
  
  
  
  1.15      +2 -3      cjkcodecs/src/codeccommon.h
  
  Index: codeccommon.h
  ===================================================================
  RCS file: /cvsroot/koco/cjkcodecs/src/codeccommon.h,v
  retrieving revision 1.14
  retrieving revision 1.15
  diff -u -r1.14 -r1.15
  --- codeccommon.h	1 Jul 2003 19:33:43 -0000	1.14
  +++ codeccommon.h	5 Jul 2003 19:49:02 -0000	1.15
  @@ -26,7 +26,7 @@
    * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    * POSSIBILITY OF SUCH DAMAGE.
    *
  - * $Id: codeccommon.h,v 1.14 2003/07/01 19:33:43 perky Exp $
  + * $Id: codeccommon.h,v 1.15 2003/07/05 19:49:02 perky Exp $
    */
   
   #include "Python.h"
  @@ -46,11 +46,10 @@
       static int encoding##_encode(                           \
           MultibyteCodec_State *state,                        \
           const Py_UNICODE **inbuf, size_t inleft,            \
  -        unsigned char **outbuf, size_t outleft)
  +        unsigned char **outbuf, size_t outleft, int flags)
   #define ENCODER_RESET(encoding)                             \
       static int encoding##_encode_reset(                     \
           MultibyteCodec_State *state,                        \
  -        const Py_UNICODE **inbuf, size_t inleft,            \
           unsigned char **outbuf, size_t outleft)
   
   #define DECODER_INIT(encoding)                              \
  
  
  
  1.20      +81 -66    cjkcodecs/src/multibytecodec.c
  
  Index: multibytecodec.c
  ===================================================================
  RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.c,v
  retrieving revision 1.19
  retrieving revision 1.20
  diff -u -r1.19 -r1.20
  --- multibytecodec.c	1 Jul 2003 20:45:27 -0000	1.19
  +++ multibytecodec.c	5 Jul 2003 19:49:02 -0000	1.20
  @@ -26,7 +26,7 @@
    * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    * POSSIBILITY OF SUCH DAMAGE.
    *
  - * $Id: multibytecodec.c,v 1.19 2003/07/01 20:45:27 perky Exp $
  + * $Id: multibytecodec.c,v 1.20 2003/07/05 19:49:02 perky Exp $
    */
   
   #include "Python.h"
  @@ -73,13 +73,15 @@
   static char *streamkwarglist[] = {"stream", "errors", NULL};
   
   static PyObject *multibytecodec_encode(MultibyteCodec *,
  -            MultibyteCodec_State *, const Py_UNICODE *, int, PyObject *,
  -            int flush);
  +            MultibyteCodec_State *, const Py_UNICODE **, size_t,
  +            PyObject *, int);
   static PyObject *mbstreamreader_create(MultibyteCodec *,
               PyObject *, const char *);
   static PyObject *mbstreamwriter_create(MultibyteCodec *,
               PyObject *, const char *);
   
  +#define MBENC_RESET     MBENC_MAX<<1 /* reset after an encoding session */
  +
   static PyObject *
   make_tuple(PyObject *unicode, int len)
   {
  @@ -214,7 +216,7 @@
               size_t   outleft;
   
               outleft = (size_t)(buf->outbuf_end - buf->outbuf);
  -            r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft);
  +            r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft, 0);
               if (r == MBERR_TOOSMALL) {
                   RESERVE_ENCODEBUFFER(buf, -1);
                   continue;
  @@ -291,10 +293,14 @@
           goto errorexit;
       }
   
  -    retstr = multibytecodec_encode(codec, state, PyUnicode_AS_UNICODE(tobj),
  -                    PyUnicode_GET_SIZE(tobj), ERROR_STRICT, 0);
  -    if (retstr == NULL)
  -        goto errorexit;
  +    {
  +        const Py_UNICODE    *uraw = PyUnicode_AS_UNICODE(tobj);
  +
  +        retstr = multibytecodec_encode(codec, state, &uraw,
  +                    PyUnicode_GET_SIZE(tobj), ERROR_STRICT, MBENC_FLUSH);
  +        if (retstr == NULL)
  +            goto errorexit;
  +    }
   
       retstrsize = PyString_GET_SIZE(retstr);
       RESERVE_ENCODEBUFFER(buf, retstrsize);
  @@ -448,8 +454,8 @@
   static PyObject *
   multibytecodec_encode(MultibyteCodec *codec,
                         MultibyteCodec_State *state,
  -                      const Py_UNICODE *data, int datalen,
  -                      PyObject *errors, int flush)
  +                      const Py_UNICODE **data, size_t datalen,
  +                      PyObject *errors, int flags)
   {
       MultibyteEncodeBuffer   buf;
       int  finalsize, r = 0;
  @@ -458,7 +464,7 @@
           return PyString_FromString("");
   
       buf.excobj = NULL;
  -    buf.inbuf = buf.inbuf_top = data;
  +    buf.inbuf = buf.inbuf_top = *data;
       buf.inbuf_end = buf.inbuf_top + datalen;
       buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16);
       if (buf.outobj == NULL)
  @@ -473,33 +479,28 @@
            * error callbacks can relocate the cursor anywhere on buffer */
           inleft = (size_t)(buf.inbuf_end - buf.inbuf);
           outleft = (size_t)(buf.outbuf_end - buf.outbuf);
  -        r = codec->encode(state, &buf.inbuf, inleft, &buf.outbuf, outleft);
  -        if (r == 0 || r == MBERR_TOOFEW)
  +        r = codec->encode(state, &buf.inbuf, inleft,
  +                          &buf.outbuf, outleft, flags);
  +        *data = buf.inbuf;
  +        if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH)))
               break;
           else if (multibytecodec_encerror(codec, state, &buf, errors, r))
               goto errorexit;
  +        else if (r == MBERR_TOOFEW)
  +            break;
       }
   
  -    if (flush) {
  -        if (codec->encreset == NULL) {
  -            if (r == MBERR_TOOFEW) {
  -                if (multibytecodec_encerror(codec, state, &buf, errors, r))
  -                    goto errorexit;
  -            }
  -        } else for (;;) {
  -            size_t   inleft, outleft;
  +    if (codec->encreset != NULL)
  +        for (;;) {
  +            size_t   outleft;
   
  -            /* inleft can be non-zero value when r == MBERR_TOOFEW */
  -            inleft = (size_t)(buf.inbuf_end - buf.inbuf);
               outleft = (size_t)(buf.outbuf_end - buf.outbuf);
  -            r = codec->encreset(state, &buf.inbuf, inleft,
  -                                &buf.outbuf, outleft);
  +            r = codec->encreset(state, &buf.outbuf, outleft);
               if (r == 0)
                   break;
               else if (multibytecodec_encerror(codec, state, &buf, errors, r))
                   goto errorexit;
           }
  -    }
   
       finalsize = (int)((char*)buf.outbuf - PyString_AS_STRING(buf.outobj));
   
  @@ -536,7 +537,8 @@
   
       if (self->codec->encinit != NULL && self->codec->encinit(&state) != 0)
           goto errorexit;
  -    r = multibytecodec_encode(self->codec, &state, data, datalen, errorcb, 1);
  +    r = multibytecodec_encode(self->codec, &state, (const Py_UNICODE **)&data,
  +                        datalen, errorcb, MBENC_FLUSH | MBENC_RESET);
       if (r == NULL)
           goto errorexit;
   
  @@ -977,6 +979,7 @@
                         PyObject *unistr)
   {
       PyObject    *wr, *r = NULL;
  +    Py_UNICODE  *inbuf, *inbuf_end, *inbuf_tmp = NULL;
       int          rsize;
   
       if (!PyUnicode_Check(unistr)) {
  @@ -989,20 +992,49 @@
       if (rsize == 0)
           return 0;
   
  +    if (self->pendingsize > 0) {
  +        inbuf_tmp = PyMem_New(Py_UNICODE, rsize + self->pendingsize);
  +        if (inbuf_tmp == NULL)
  +            goto errorexit;
  +        memcpy(inbuf_tmp, self->pending, Py_UNICODE_SIZE * self->pendingsize);
  +        memcpy(inbuf_tmp + self->pendingsize, PyUnicode_AS_UNICODE(unistr),
  +               Py_UNICODE_SIZE * rsize);
  +        rsize += self->pendingsize;
  +        self->pendingsize = 0;
  +        inbuf = inbuf_tmp;
  +    } else
  +        inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
  +
  +    inbuf_end = inbuf + rsize;
  +
       r = multibytecodec_encode(self->codec, &self->state,
  -            (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr), rsize, self->errors, 0);
  +                        (const Py_UNICODE **)&inbuf, rsize, self->errors, 0);
       if (r == NULL)
           goto errorexit;
   
  +    if (inbuf < inbuf_end) {
  +        self->pendingsize = (int)(inbuf_end - inbuf);
  +        if (self->pendingsize > MAXENCPENDING) {
  +            self->pendingsize = 0;
  +            PyErr_SetString(PyExc_RuntimeError, "pending buffer overflow");
  +            goto errorexit;
  +        }
  +        memcpy(self->pending, inbuf, self->pendingsize * Py_UNICODE_SIZE);
  +    }
  +
       wr = PyObject_CallMethod(self->stream, "write", "O", r);
       if (wr == NULL)
           goto errorexit;
   
  +    if (inbuf_tmp != NULL)
  +        PyMem_Del(inbuf_tmp);
       Py_DECREF(r);
       Py_DECREF(wr);
       return 0;
   
   errorexit:
  +    if (inbuf_tmp != NULL)
  +        PyMem_Del(inbuf_tmp);
       Py_XDECREF(r);
       return -1;
   }
  @@ -1056,48 +1088,30 @@
   static PyObject *
   mbstreamwriter_reset(MultibyteStreamWriterObject *self)
   {
  -    if (self->codec->encreset != NULL) {
  -        PyObject    *rsbuf = NULL;
  -        size_t       rsbufsiz, rsbufnc;
  -        int          r;
  -        unsigned char *rsbuf_top, *rsbuf_cur;
  -
  -        rsbufnc = 0;
  -        for (rsbufsiz = 0;;rsbufsiz *= 2) {
  -            if (rsbuf == NULL) {
  -                rsbuf = PyString_FromStringAndSize(NULL, rsbufsiz);
  -                if (rsbuf == NULL)
  -                    return NULL;
  -            } else {
  -                if (_PyString_Resize(&rsbuf, rsbufsiz))
  -                    goto errorexit;
  -            }
  -            rsbuf_top = (unsigned char *)PyString_AS_STRING(rsbuf);
  -            rsbuf_cur = rsbuf_top + rsbufnc;
  -            
  -            r = self->codec->encreset(&self->state,
  -                    NULL, 0, &rsbuf_cur, rsbufsiz - rsbufnc);
  -            rsbufnc = (size_t)(rsbuf_cur - rsbuf_top);
  -            if (r == MBERR_TOOSMALL)
  -                continue;
  -            else {
  -                if (r != 0)
  -                    goto errorexit;
  -                else
  -                    break;
  -            }
  -        }
  +    const Py_UNICODE    *pending;
  +    PyObject    *pwrt;
   
  -        if (_PyString_Resize(&rsbuf, rsbufnc)) {
  -errorexit:  Py_DECREF(rsbuf);
  -            return NULL;
  -        }
  +    pending = self->pending;
  +    pwrt = multibytecodec_encode(self->codec, &self->state,
  +                &pending, self->pendingsize, self->errors,
  +                MBENC_FLUSH | MBENC_RESET);
  +    /* some pending buffer can be truncated when UnicodeEncodeError is
  +     * raised on 'strict' mode. but, 'reset' method is designed to
  +     * reset the pending buffer or states so failed string sequence
  +     * ought to be missed */
  +    self->pendingsize = 0;
  +    if (pwrt == NULL)
  +        return NULL;
   
  -        r = mbstreamwriter_iwrite(self, rsbuf);
  -        Py_DECREF(rsbuf);
  -        if (r == -1)
  +    if (PyString_Size(pwrt) > 0) {
  +        PyObject    *wr;
  +        wr = PyObject_CallMethod(self->stream, "write", "O", pwrt);
  +        if (wr == NULL) {
  +            Py_DECREF(pwrt);
               return NULL;
  +        }
       }
  +    Py_DECREF(pwrt);
   
       Py_INCREF(Py_None);
       return Py_None;
  @@ -1232,6 +1246,7 @@
       self->codec = codec;
       self->stream = stream;
       Py_INCREF(stream);
  +    self->pendingsize = 0;
       self->errors = get_errorcallback(errors);
       if (self->errors == NULL)
           goto errorexit;
  
  
  
  1.9       +8 -3      cjkcodecs/src/multibytecodec.h
  
  Index: multibytecodec.h
  ===================================================================
  RCS file: /cvsroot/koco/cjkcodecs/src/multibytecodec.h,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- multibytecodec.h	1 Jul 2003 19:33:43 -0000	1.8
  +++ multibytecodec.h	5 Jul 2003 19:49:02 -0000	1.9
  @@ -26,7 +26,7 @@
    * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    * POSSIBILITY OF SUCH DAMAGE.
    *
  - * $Id: multibytecodec.h,v 1.8 2003/07/01 19:33:43 perky Exp $
  + * $Id: multibytecodec.h,v 1.9 2003/07/05 19:49:02 perky Exp $
    */
   
   #ifndef _PYTHON_MULTIBYTECODEC_H_
  @@ -35,17 +35,19 @@
   extern "C" {
   #endif
   
  +#define MAXCHARSTATE    8
   typedef union {
       unsigned long    i;
       void            *p;
  +    unsigned char    c[MAXCHARSTATE];
   } MultibyteCodec_State;
   
   typedef int (*mbencode_func)(MultibyteCodec_State *state,
                                const Py_UNICODE **inbuf, size_t inleft,
  -                             unsigned char **outbuf, size_t outleft);
  +                             unsigned char **outbuf, size_t outleft,
  +                             int flags);
   typedef int (*mbencodeinit_func)(MultibyteCodec_State *state);
   typedef int (*mbencodereset_func)(MultibyteCodec_State *state,
  -                             const Py_UNICODE **inbuf, size_t inleft,
                                unsigned char **outbuf, size_t outleft);
   typedef int (*mbdecode_func)(MultibyteCodec_State *state,
                                const unsigned char **inbuf, size_t inleft,
  @@ -97,6 +99,9 @@
   #define ERROR_IGNORE        (PyObject *)(2)
   #define ERROR_REPLACE       (PyObject *)(3)
   #define ERROR_MAX           ERROR_REPLACE
  +
  +#define MBENC_FLUSH         0x0001 /* encode all characters encodable */
  +#define MBENC_MAX           MBENC_FLUSH
   
   #ifdef __cplusplus
   }