[KoCo-CVS] [Commit] cjkcodecs/src _ja_codecs.c _ko_codecs.c _zh_CN_codecs.c _zh_TW_codecs.c multibyt
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-04-20 17:35:33
|
perky 03/04/20 10:35:32 Added: src _ja_codecs.c _ko_codecs.c _zh_CN_codecs.c _zh_TW_codecs.c multibytecodec.c multibytecodec.h Log: Import codec implementations from Multibyte Codecs patch. Revision Changes Path 1.1 cjkcodecs/src/_ja_codecs.c Index: _ja_codecs.c =================================================================== /* * _ja_codecs.c: Japanese Codecs Implementation * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _ja_codecs.c,v 1.1 2003/04/20 17:35:31 perky Exp $ */ #include "Python.h" #include "multibytecodec.h" typedef unsigned short DBCHAR; #define UNIINV Py_UNICODE_REPLACEMENT_CHARACTER #define NOCHAR 0xFFFF #include "_ja_codecs.h" static DBCHAR *jisx0208_encode_map[256]; static DBCHAR *jisx0212_encode_map[256]; static DBCHAR *cp932_encode_map[256]; #define JISX0201_DECODE(c, assi) \ if ((c) < 0x5c) (assi) = (c); \ else if ((c) == 0x5c) (assi) = 0x00a5; \ else if ((c) < 0x7e) (assi) = (c); \ else if ((c) == 0x7e) (assi) = 0x203e; \ else if ((c) >= 0xa1 && (c) <= 0xdf) \ (assi) = 0xfec0 + (c); #define JISX0201_ENCODE(c, assi) \ if ((c) < 0x5c) (assi) = (c); \ else if ((c) > 0x5c && (c) < 0x7e) \ (assi) = (c); \ else if ((c) == 0x00a5) (assi) = 0x5c; \ else if ((c) == 0x203e) (assi) = 0x7e; \ else if ((c) >= 0xff61 && (c) <= 0xff9f) \ (assi) = (c) - 0xfec0; #define IN_RANGE(val, pfx) (pfx##_BOTTOM <= (val) && (val) <= pfx##_TOP) #define IN_RANGE2(c1, c2, pfx) \ (IN_RANGE(c1, pfx##_C1) && IN_RANGE(c2, pfx##_C2)) struct euc_jp_decode_state { unsigned char pending[2]; size_t pendingsize; }; /* * SHIFTJIS */ static int shiftjis_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx, PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err) { DBCHAR *map, code; while (buf->inbuf < buf->inbuf_end) { Py_UNICODE nc = *buf->inbuf; unsigned char c1, c2; JISX0201_ENCODE(nc, code) #if Py_UNICODE_SIZE == 4 else if (nc >= 0x10000) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } #endif else code = NOCHAR; if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = (unsigned char)code; buf->inbuf++; continue; } if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; if (code == NOCHAR) { map = jisx0208_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { if (nc >= 0xe000 && nc < 0xe758) { /* user-defined area 
*/ c1 = (Py_UNICODE)(nc - 0xe000) / 188; c2 = (Py_UNICODE)(nc - 0xe000) % 188; *buf->outbuf++ = c1 + 0xf0; *buf->outbuf++ = (c2 < 0x3f ? c2 + 0x40 : c2 + 0x41); buf->inbuf++; continue; } else { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } } } c1 = code >> 8; c2 = code & 0xff; if (IN_RANGE2(c1, c2, JISX0208)) { c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); c1 = (c1 - 0x21) >> 1; *buf->outbuf++ = c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1; *buf->outbuf++ = c2 < 0x3f ? c2 + 0x40 : c2 + 0x41; buf->inbuf++; continue; } else { PyErr_SetString(PyExc_RuntimeError, "internal logic error"); return MBERR_INTERNAL; } } return 0; } static int shiftjis_decode_open(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { *ctx = NULL; return 0; } static int shiftjis_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { Py_UNICODE code; unsigned char pending; pending = (unsigned char)(long)*ctx; *ctx = NULL; while (buf->inbuf < buf->inbuf_end) { unsigned char nc = *buf->inbuf; if (!pending) { JISX0201_DECODE(nc, code) else { pending = nc; buf->inbuf++; continue; } if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = code; buf->inbuf++; } else { if ((pending >= 0x81 && pending <= 0x9f) || (pending >= 0xe0 && pending <= 0xea)) { unsigned char c1, c2; if (nc < 0x40 || (nc > 0x7e && nc < 0x80) || nc > 0xfc) goto illegalseq; c1 = (pending < 0xe0 ? pending - 0x81 : pending - 0xc1); c2 = (nc < 0x80 ? nc - 0x40 : nc - 0x41); c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21); c2 = (c2 < 0x5e ? 
c2 : c2 - 0x5e) + 0x21; if (c1 < JISX0208_C1_BOTTOM || c1 > JISX0208_C1_TOP || c2 < JISX0208_C2_BOTTOM || c2 > JISX0208_C2_TOP || jisx0208_decode_map[c1] == NULL || (code = jisx0208_decode_map[c1][ c2 - JISX0208_C2_BOTTOM]) == UNIINV) goto illegalseq; } else if (pending >= 0xf0 && pending <= 0xf9) { if ((nc >= 0x40 && nc <= 0x7e) || (nc >= 0x80 && nc <= 0xfc)) code = 0xe000 + 188 * (pending - 0xf0) + (nc < 0x80 ? nc - 0x40 : nc - 0x41); else goto illegalseq; } else goto illegalseq; if (HAS_NOT_ENOUGH_SPACE(buf, 1)) { *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_TOOSMALL; } *buf->outbuf++ = code; buf->inbuf++; pending = 0; continue; illegalseq: if (INBUFPOS(buf) < 1) { /* the pending character is from previous buffer */ err->object = PyMem_Malloc(2); err->objlength = 2; err->object[0] = pending; err->object[1] = nc; err->start = 0; err->end = 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 2; } *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_ILLSEQ; } } if (pending) *ctx = (PyMultibyteDecoder_Context)(long)pending; return 0; } static int shiftjis_decode_flush(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { if (*ctx != NULL) { if (INBUFPOS(buf) < 1) { /* the pending character is from the previous buffer */ err->object = PyMem_Malloc(1); err->objlength = 1; *err->object = (unsigned char)(long)*ctx; err->start = err->end = 0; /* no character on current buffer */ } else { /* we can sure that the last character on inbuf is the pending * one, here. all error situation clears dstate and it's the * only way to move the buffer cursor discontinuously. 
*/ SETERR_INBUF(err, buf); err->end = INBUFPOS(buf); err->start = err->end - 1; } *ctx = NULL; return MBERR_TOOFEW; } return 0; } static int shiftjis_decode_reset(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { *ctx = NULL; return 0; } static PyMultibyteEncoder_Codec shiftjis_codec_encoder = { "shiftjis", /* name */ 0, /* init */ 0, /* shutdown */ 0, /* open */ 0, /* close */ shiftjis_encode, /* encode */ 0, /* flush */ 0, /* reset */ 0, /* putrepl */ }; static PyMultibyteDecoder_Codec shiftjis_codec_decoder = { "shiftjis", /* name */ 0, /* init */ 0, /* shutdown */ shiftjis_decode_open, /* open */ 0, /* close */ shiftjis_decode, /* decode */ shiftjis_decode_flush, /* flush */ shiftjis_decode_reset, /* reset */ }; /* * CP932: Microsoft extension of Shift-JIS */ static int cp932_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx, PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err) { DBCHAR *map, code; while (buf->inbuf < buf->inbuf_end) { Py_UNICODE nc = *buf->inbuf; unsigned char c1, c2; if (nc < 0x80 || (nc >= 0xff61 && nc <= 0xff9f)) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = (unsigned char)(nc < 0x80 ? nc : nc - 0xfec0); buf->inbuf++; continue; } #if Py_UNICODE_SIZE == 4 else if (nc >= 0x10000) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } #endif else code = NOCHAR; if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; map = cp932_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { map = jisx0208_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { if (nc >= 0xe000 && nc < 0xe758) { /* user-defined area */ c1 = (Py_UNICODE)(nc - 0xe000) / 188; c2 = (Py_UNICODE)(nc - 0xe000) % 188; *buf->outbuf++ = c1 + 0xf0; *buf->outbuf++ = (c2 < 0x3f ? 
c2 + 0x40 : c2 + 0x41); buf->inbuf++; continue; } else { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } } c1 = code >> 8; c2 = code & 0xff; if (IN_RANGE2(c1, c2, JISX0208)) { c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); c1 = (c1 - 0x21) >> 1; *buf->outbuf++ = c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1; *buf->outbuf++ = c2 < 0x3f ? c2 + 0x40 : c2 + 0x41; buf->inbuf++; } else { PyErr_SetString(PyExc_RuntimeError, "internal logic error"); return MBERR_INTERNAL; } } else { *buf->outbuf++ = code >> 8; *buf->outbuf++ = code & 0xff; buf->inbuf++; } } return 0; } static int cp932_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { Py_UNICODE code; unsigned char pending; pending = (unsigned char)(long)*ctx; *ctx = NULL; while (buf->inbuf < buf->inbuf_end) { unsigned char nc = *buf->inbuf; if (!pending) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; if (nc < 0x80) { *buf->outbuf++ = nc; buf->inbuf++; continue; } else if (nc >= 0xa1 && nc <= 0xdf) { *buf->outbuf++ = 0xfec0 + nc; buf->inbuf++; continue; } else { pending = nc; buf->inbuf++; continue; } } if (IN_RANGE2(pending, nc, CP932P0) && (code = cp932_decode_map[pending & 0x7f][ nc - CP932P0_C2_BOTTOM]) != UNIINV) /* yeah */; else if (IN_RANGE2(pending, nc, CP932P1) && (code = cp932_decode_map[pending & 0x7f][ nc - CP932P1_C2_BOTTOM]) != UNIINV) /* go! */; else if (IN_RANGE2(pending, nc, CP932P2) && (code = cp932_decode_map[pending & 0x7f][ nc - CP932P2_C2_BOTTOM]) != UNIINV) /* okay */; else if ((pending >= 0x81 && pending <= 0x9f) || (pending >= 0xe0 && pending <= 0xea)) { unsigned char c1, c2; if (nc < 0x40 || (nc > 0x7e && nc < 0x80) || nc > 0xfc) goto illegalseq; c1 = (pending < 0xe0 ? pending - 0x81 : pending - 0xc1); c2 = (nc < 0x80 ? nc - 0x40 : nc - 0x41); c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21); c2 = (c2 < 0x5e ? 
c2 : c2 - 0x5e) + 0x21; if ((!IN_RANGE2(c1, c2, JISX0208)) || jisx0208_decode_map[c1] == NULL || (code = jisx0208_decode_map[c1][ c2 - JISX0208_C2_BOTTOM]) == UNIINV) goto illegalseq; } else if (pending >= 0xf0 && pending <= 0xf9) { if ((nc >= 0x40 && nc <= 0x7e) || (nc >= 0x80 && nc <= 0xfc)) code = 0xe000 + 188 * (pending - 0xf0) + (nc < 0x80 ? nc - 0x40 : nc - 0x41); else goto illegalseq; } else goto illegalseq; if (HAS_NOT_ENOUGH_SPACE(buf, 1)) { *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_TOOSMALL; } *buf->outbuf++ = code; buf->inbuf++; pending = 0; continue; illegalseq: if (INBUFPOS(buf) < 1) { /* the pending character is from previous buffer */ err->object = PyMem_Malloc(2); err->objlength = 2; err->object[0] = pending; err->object[1] = nc; err->start = 0; err->end = 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 2; } *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_ILLSEQ; } if (pending) *ctx = (PyMultibyteDecoder_Context)(long)pending; return 0; } static PyMultibyteEncoder_Codec cp932_codec_encoder = { "cp932", /* name */ 0, /* init */ 0, /* shutdown */ 0, /* open */ 0, /* close */ cp932_encode, /* encode */ 0, /* flush */ 0, /* reset */ 0, /* putrepl */ }; static PyMultibyteDecoder_Codec cp932_codec_decoder = { "cp932", /* name */ 0, /* init */ 0, /* shutdown */ shiftjis_decode_open, /* open */ 0, /* close */ cp932_decode, /* decode */ shiftjis_decode_flush, /* flush */ shiftjis_decode_reset, /* reset */ }; /* * EUC-JP */ static int euc_jp_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx, PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err) { DBCHAR *map, code; while (buf->inbuf < buf->inbuf_end) { Py_UNICODE nc = *buf->inbuf; if (nc < 0x80) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = nc; buf->inbuf++; continue; } #if Py_UNICODE_SIZE == 4 else if (nc >= 0x10000) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); 
err->end = err->start + 1; return MBERR_UNDEFINED; } #endif /* JIS X 0208 */ map = jisx0208_encode_map[nc >> 8]; if (map != NULL && (code = map[nc & 0xff]) != NOCHAR) { if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; *buf->outbuf++ = (code >> 8) + 0x80; *buf->outbuf++ = (code & 0xff) + 0x80; buf->inbuf++; continue; } /* JIS X 0201 half-width katakana */ if (nc >= 0xff61 && nc <= 0xff9f) { if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; *buf->outbuf++ = 0x8e; *buf->outbuf++ = (unsigned char)(nc - 0xfec0); buf->inbuf++; continue; } /* JIS X 0212 */ map = jisx0212_encode_map[nc >> 8]; if (map != NULL && (code = map[nc & 0xff]) != NOCHAR) { if (HAS_NOT_ENOUGH_SPACE(buf, 3)) return MBERR_TOOSMALL; *buf->outbuf++ = 0x8f; *buf->outbuf++ = (code >> 8) + 0x80; *buf->outbuf++ = (code & 0xff) + 0x80; buf->inbuf++; continue; } /* user-defined area */ if (nc >= 0xe000 && nc < 0xe758) { if (nc < 0xe3ac) { if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; *buf->outbuf++ = (Py_UNICODE)(nc - 0xe000) / 94 + 0xf5; *buf->outbuf++ = (Py_UNICODE)(nc - 0xe000) % 94 + 0xa1; } else { if (HAS_NOT_ENOUGH_SPACE(buf, 3)) return MBERR_TOOSMALL; *buf->outbuf++ = 0x8f; *buf->outbuf++ = (Py_UNICODE)(nc - 0xe3ac) / 94 + 0xf5; *buf->outbuf++ = (Py_UNICODE)(nc - 0xe3ac) % 94 + 0xa1; } buf->inbuf++; continue; } SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } return 0; } static int euc_jp_decode_open(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { struct euc_jp_decode_state *state; state = PyMem_New(struct euc_jp_decode_state, 1); if (state == NULL) return -1; state->pendingsize = 0; *ctx = state; return 0; } static void euc_jp_decode_close(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { PyMem_Del(*ctx); } static int euc_jp_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { struct euc_jp_decode_state 
*state = *ctx; Py_UNICODE code; while (buf->inbuf < buf->inbuf_end) { unsigned char nc = *buf->inbuf; if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; switch (state->pendingsize) { case 0: if (nc < 0x80) { *buf->outbuf++ = nc; buf->inbuf++; } else { state->pending[0] = nc; state->pendingsize = 1; buf->inbuf++; } break; case 1: if (0xa1 <= state->pending[0] && state->pending[0] < 0xff) { if (state->pending[0] < 0xf5) { /* JIS X 0208 */ unsigned char c1, c2; c1 = state->pending[0] - 0x80; c2 = nc - 0x80; if (IN_RANGE2(c1, c2, JISX0208) && jisx0208_decode_map[c1] != NULL && (code = jisx0208_decode_map[c1][ c2 - JISX0208_C2_BOTTOM]) != UNIINV) { *buf->outbuf++ = code; buf->inbuf++; state->pendingsize = 0; } else goto illegalseq; } else { /* 2bytes user-defined area */ if (0xa1 <= nc && nc < 0xff) { *buf->outbuf++ = 0xe000 + 94 * ( state->pending[0] - 0xf5) + (nc - 0xa1); buf->inbuf++; state->pendingsize = 0; } else goto illegalseq; } } else if (state->pending[0] == 0x8e) { /* half-width katakana */ if (nc >= 0xa1 && nc <= 0xdf) { *buf->outbuf++ = 0xfec0 + nc; buf->inbuf++; state->pendingsize = 0; } else goto illegalseq; } else if (state->pending[0] == 0x8f) { /* 3-bytes seq */ buf->inbuf++; state->pending[1] = nc; state->pendingsize = 2; } else goto illegalseq; break; case 2: assert(state->pending[0] == 0x8f); if (0xa1 <= state->pending[1] && state->pending[1] < 0xff) { if (state->pending[1] < 0xf5) { /* JIS X 0212 */ unsigned char c1, c2; c1 = state->pending[1] - 0x80; c2 = nc - 0x80; if (IN_RANGE2(c1, c2, JISX0212) && jisx0212_decode_map[c1] != NULL && (code = jisx0212_decode_map[c1][ c2 - JISX0212_C2_BOTTOM]) != UNIINV) { *buf->outbuf++ = code; buf->inbuf++; state->pendingsize = 0; } else goto illegalseq; } else { /* 3bytes user-defined area */ if (0xa1 <= nc && nc < 0xff) { *buf->outbuf++ = 0xe3ac + 94 * ( state->pending[1] - 0xf5) + (nc - 0xa1); buf->inbuf++; state->pendingsize = 0; } else goto illegalseq; } } else goto illegalseq; break; default: 
PyErr_SetString(PyExc_RuntimeError, "internal logic error"); return MBERR_INTERNAL; } continue; illegalseq: if (INBUFPOS(buf) < state->pendingsize) { err->objlength = state->pendingsize + 1; err->object = PyMem_Malloc(err->objlength); if (err->object == NULL) return MBERR_INTERNAL; memcpy(err->object, state->pending, state->pendingsize); err->object[state->pendingsize] = nc; err->start = 0; err->end = INBUFPOS(buf) + 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 1 - state->pendingsize; } return MBERR_ILLSEQ; } return 0; } static int euc_jp_decode_flush(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { struct euc_jp_decode_state *state = *ctx; if (state->pendingsize > 0) { if (INBUFPOS(buf) < state->pendingsize) { err->objlength = state->pendingsize; err->object = PyMem_Malloc(err->objlength); if (err->object == NULL) return MBERR_INTERNAL; memcpy(err->object, state->pending, state->pendingsize); err->start = 0; err->end = INBUFPOS(buf); } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf); err->start = err->end - state->pendingsize; } return MBERR_TOOFEW; } return 0; } static int euc_jp_decode_reset(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { struct euc_jp_decode_state *state = *ctx; state->pendingsize = 0; return 0; } static PyMultibyteEncoder_Codec euc_jp_codec_encoder = { "euc_jp", /* name */ 0, /* init */ 0, /* shutdown */ 0, /* open */ 0, /* close */ euc_jp_encode, /* encode */ 0, /* flush */ 0, /* reset */ 0, /* putrepl */ }; static PyMultibyteDecoder_Codec euc_jp_codec_decoder = { "euc_jp", /* name */ 0, /* init */ 0, /* shutdown */ euc_jp_decode_open, /* open */ euc_jp_decode_close, /* close */ euc_jp_decode, /* decode */ euc_jp_decode_flush, /* flush */ euc_jp_decode_reset, /* reset */ }; static int build_encode_map(DBCHAR **encmap, const Py_UNICODE **decmap, unsigned char c1bottom, unsigned char c1top, 
unsigned char c2bottom, unsigned char c2top) { unsigned char c1, c2; const Py_UNICODE *umap; for (c1 = c1bottom; c1 <= c1top; c1++) { umap = decmap[c1 & 0x7f]; if (umap == NULL) continue; for (c2 = c2bottom; c2 <= c2top; c2++) { Py_UNICODE uni; int upage, i; uni = umap[c2 - c2bottom]; if (uni == UNIINV) continue; upage = uni >> 8; if (encmap[upage] == NULL) { encmap[upage] = PyMem_New(DBCHAR, 256); if (encmap[upage] == NULL) return -1; for (i = 0; i <= 255; i++) encmap[upage][i] = NOCHAR; } if (encmap[upage][uni & 0xff] == NOCHAR) encmap[upage][uni & 0xff] = c1 << 8 | c2; } } return 0; } static int init_maps(void) { int i; for (i = 0; i < 256; i++) jisx0208_encode_map[i] = jisx0212_encode_map[i] = cp932_encode_map[i] = NULL; #define BUILD_MAP(emap, dmap, pfx) \ build_encode_map(emap##_encode_map, dmap##_decode_map, \ pfx##_C1_BOTTOM, pfx##_C1_TOP, \ pfx##_C2_BOTTOM, pfx##_C2_TOP) if (BUILD_MAP(jisx0208, jisx0208, JISX0208) || BUILD_MAP(jisx0212, jisx0212, JISX0212) || BUILD_MAP(cp932, cp932, CP932P0) || BUILD_MAP(cp932, cp932, CP932P1) || BUILD_MAP(cp932, cp932, CP932P2)) { for (i = 0; i < 256; i++) { if (jisx0208_encode_map[i] != NULL) PyMem_Del(jisx0208_encode_map[i]); if (jisx0212_encode_map[i] != NULL) PyMem_Del(jisx0212_encode_map[i]); if (cp932_encode_map[i] != NULL) PyMem_Del(cp932_encode_map[i]); } return -1; } #undef BUILD_MAP /* resolve duplicated mappings between jisx0208 and cp932 */ CP932_TWEAKUNIMAP(cp932_encode_map) return 0; } static struct PyMethodDef _ja_codecs_methods[] = { {NULL, NULL}, }; void init_ja_codecs(void) { PyObject *m; m = Py_InitModule("_ja_codecs", _ja_codecs_methods); PyModule_AddObject(m, "shiftjis_encode", _PyMultibyteEncoder_Create(&shiftjis_codec_encoder, "shiftjis")); PyModule_AddObject(m, "shiftjis_decode", _PyMultibyteDecoder_Create(&shiftjis_codec_decoder, "shiftjis")); PyModule_AddObject(m, "cp932_encode", _PyMultibyteEncoder_Create(&cp932_codec_encoder, "cp932")); PyModule_AddObject(m, "cp932_decode", 
_PyMultibyteDecoder_Create(&cp932_codec_decoder, "cp932")); PyModule_AddObject(m, "euc_jp_encode", _PyMultibyteEncoder_Create(&euc_jp_codec_encoder, "euc_jp")); PyModule_AddObject(m, "euc_jp_decode", _PyMultibyteDecoder_Create(&euc_jp_codec_decoder, "euc_jp")); if (PyErr_Occurred() || init_maps()) Py_FatalError("can't initialize the _ja_codecs module"); } /* * ex: ts=8 sts=4 et */ 1.1 cjkcodecs/src/_ko_codecs.c Index: _ko_codecs.c =================================================================== /* * _ko_codecs.c: Korean Codecs Implementation * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $Id: _ko_codecs.c,v 1.1 2003/04/20 17:35:31 perky Exp $ */ #include "Python.h" #include "multibytecodec.h" typedef unsigned short DBCHAR; #define UNIINV Py_UNICODE_REPLACEMENT_CHARACTER #define NOCHAR 0x0000 #include "_ko_codecs.h" static DBCHAR *ksx1001_encode_map[256]; static DBCHAR *uhc_encode_map[256]; /* * EUC-KR: KS X 1001:1998 */ static int euc_kr_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx, PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err) { DBCHAR *map, code; while (buf->inbuf < buf->inbuf_end) { Py_UNICODE nc = *buf->inbuf; if (nc < 0x80) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = (unsigned char)nc; buf->inbuf++; continue; } #if Py_UNICODE_SIZE == 4 else if (nc >= 0x10000) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; /* all characters of ks x 1001 are included in BMP. */ } #endif map = ksx1001_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; *buf->outbuf++ = code >> 8; *buf->outbuf++ = code & 0xFF; buf->inbuf++; } return 0; } static int euc_kr_decode_open(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { *ctx = NULL; return 0; } static int euc_kr_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { Py_UNICODE code; unsigned char pending; pending = (unsigned char)(long)*ctx; *ctx = NULL; while (buf->inbuf < buf->inbuf_end) { unsigned char nc = *buf->inbuf; if (!pending) { if (nc & 0x80) { pending = nc; buf->inbuf++; } else { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = nc; buf->inbuf++; } } else { if (nc < KSX1001_C2_BOTTOM || nc > KSX1001_C2_TOP || ksx1001_decode_map[pending & 0x7F] == NULL || (code = 
ksx1001_decode_map[pending & 0x7F][ nc - KSX1001_C2_BOTTOM]) == UNIINV) { if (INBUFPOS(buf) < 1) { /* the pending character is from previous buffer */ err->object = PyMem_Malloc(2); err->objlength = 2; err->object[0] = pending; err->object[1] = nc; /* huh? characters from current buffer only! */ err->start = 0; err->end = 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 2; } *ctx = (PyMultibyteDecoder_Context)(long)pending; if (pending < KSX1001_C1_BOTTOM || nc < KSX1001_C2_BOTTOM) return MBERR_ILLSEQ; else return MBERR_UNDEFINED; } if (HAS_NOT_ENOUGH_SPACE(buf, 1)) { *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_TOOSMALL; } *buf->outbuf++ = code; buf->inbuf++; pending = 0; } } if (pending) *ctx = (PyMultibyteDecoder_Context)(long)pending; return 0; } static int euc_kr_decode_flush(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { if (*ctx != NULL) { if (INBUFPOS(buf) < 1) { /* the pending character is from the previous buffer */ err->object = PyMem_Malloc(1); err->objlength = 1; *err->object = (unsigned char)(long)*ctx; err->start = err->end = 0; /* no character on current buffer */ } else { /* we can sure that the last character on inbuf is the pending * one, here. all error situation clears dstate and it's the * only way to move the buffer cursor discontinuously. 
*/ SETERR_INBUF(err, buf); err->end = INBUFPOS(buf); err->start = err->end - 1; } *ctx = NULL; return MBERR_TOOFEW; } return 0; } static int euc_kr_decode_reset(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx) { *ctx = NULL; return 0; } static PyMultibyteEncoder_Codec euc_kr_codec_encoder = { "euc_kr", /* name */ 0, /* init */ 0, /* shutdown */ 0, /* open */ 0, /* close */ euc_kr_encode, /* encode */ 0, /* flush */ 0, /* reset */ 0, /* putrepl */ }; static PyMultibyteDecoder_Codec euc_kr_codec_decoder = { "euc_kr", /* name */ 0, /* init */ 0, /* shutdown */ euc_kr_decode_open, /* open */ 0, /* close */ euc_kr_decode, /* decode */ euc_kr_decode_flush, /* flush */ euc_kr_decode_reset, /* reset */ }; /* * CP949: Microsoft CodePage 949, a.k.a. Unified Hangul Code */ static int cp949_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx, PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err) { DBCHAR *map, code; while (buf->inbuf < buf->inbuf_end) { Py_UNICODE nc = *buf->inbuf; if (nc < 0x80) { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = (unsigned char)nc; buf->inbuf++; continue; } #if Py_UNICODE_SIZE == 4 else if (nc >= 0x10000) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; /* all characters of ks x 1001 are included in BMP. 
*/ } #endif map = ksx1001_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { map = uhc_encode_map[nc >> 8]; if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) { SETERR_INBUF(err, buf); err->start = INBUFPOS(buf); err->end = err->start + 1; return MBERR_UNDEFINED; } } if (HAS_NOT_ENOUGH_SPACE(buf, 2)) return MBERR_TOOSMALL; *buf->outbuf++ = code >> 8; *buf->outbuf++ = code & 0xFF; buf->inbuf++; } return 0; } static int cp949_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx, PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err) { Py_UNICODE code; unsigned char pending; pending = (unsigned char)(long)*ctx; *ctx = NULL; while (buf->inbuf < buf->inbuf_end) { unsigned char nc = *buf->inbuf; if (!pending) { if (nc & 0x80) { pending = nc; buf->inbuf++; } else { if (HAS_NOT_ENOUGH_SPACE(buf, 1)) return MBERR_TOOSMALL; *buf->outbuf++ = nc; buf->inbuf++; } } else { code = UNIINV; if (pending >= KSX1001_C1_BOTTOM && pending <= KSX1001_C1_TOP && ksx1001_decode_map[pending & 0x7F] != NULL && nc >= KSX1001_C2_BOTTOM && nc <= KSX1001_C2_TOP) /* ks x 1001 */ code = ksx1001_decode_map[pending & 0x7F][ nc - KSX1001_C2_BOTTOM]; else if (pending >= UHCL1_C1_BOTTOM && pending <= UHCL1_C1_TOP && uhc_decode_map[pending & 0x7F] != NULL && nc >= UHCL1_C2_BOTTOM && nc <= UHCL1_C2_TOP) /* uhc level 1 */ code = uhc_decode_map[pending & 0x7F][ nc - UHCL1_C2_BOTTOM]; else if (pending >= UHCL2_C1_BOTTOM && pending <= UHCL2_C1_TOP && uhc_decode_map[pending & 0x7F] != NULL && nc >= UHCL2_C2_BOTTOM && nc <= UHCL2_C2_TOP) /* uhc level 2 */ code = uhc_decode_map[pending & 0x7F][ nc - UHCL2_C2_BOTTOM]; if (code == UNIINV) { if (INBUFPOS(buf) < 1) { /* the pending character is from previous buffer */ err->object = PyMem_Malloc(2); err->objlength = 2; err->object[0] = pending; err->object[1] = nc; /* huh? characters from current buffer only! 
*/ err->start = 0; err->end = 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 2; } *ctx = (PyMultibyteDecoder_Context)(long)pending; /* unlike euc-kr, * cp949 has complete map region when high bit is set */ return MBERR_UNDEFINED; } if (HAS_NOT_ENOUGH_SPACE(buf, 1)) { *ctx = (PyMultibyteDecoder_Context)(long)pending; return MBERR_TOOSMALL; } *buf->outbuf++ = code; buf->inbuf++; pending = 0; } } if (pending) *ctx = (PyMultibyteDecoder_Context)(long)pending; return 0; } static PyMultibyteEncoder_Codec cp949_codec_encoder = { "cp949", /* name */ 0, /* init */ 0, /* shutdown */ 0, /* open */ 0, /* close */ cp949_encode, /* encode */ 0, /* flush */ 0, /* reset */ 0, /* putrepl */ }; static PyMultibyteDecoder_Codec cp949_codec_decoder = { "cp949", /* name */ 0, /* init */ 0, /* shutdown */ euc_kr_decode_open, /* open */ 0, /* close */ cp949_decode, /* decode */ euc_kr_decode_flush, /* flush */ euc_kr_decode_reset, /* reset */ }; static int build_encode_map(DBCHAR **encmap, const Py_UNICODE **decmap, unsigned char c1bottom, unsigned char c1top, unsigned char c2bottom, unsigned char c2top) { unsigned char c1, c2; const Py_UNICODE *umap; for (c1 = c1bottom; c1 <= c1top; c1++) { umap = decmap[c1 & 0x7f]; if (umap == NULL) continue; for (c2 = c2bottom; c2 <= c2top; c2++) { Py_UNICODE uni; int upage, i; uni = umap[c2 - c2bottom]; if (uni == UNIINV) continue; upage = uni >> 8; if (encmap[upage] == NULL) { encmap[upage] = PyMem_New(DBCHAR, 256); if (encmap[upage] == NULL) return -1; for (i = 0; i <= 255; i++) encmap[upage][i] = NOCHAR; } encmap[upage][uni & 0xff] = c1 << 8 | c2; } } return 0; } static int init_maps(void) { int i; for (i = 0; i < 256; i++) ksx1001_encode_map[i] = uhc_encode_map[i] = NULL; if (build_encode_map(ksx1001_encode_map, ksx1001_decode_map, KSX1001_C1_BOTTOM, KSX1001_C1_TOP, KSX1001_C2_BOTTOM, KSX1001_C2_TOP) || build_encode_map(uhc_encode_map, uhc_decode_map, UHCL1_C1_BOTTOM, UHCL1_C1_TOP, UHCL1_C2_BOTTOM, 
UHCL1_C2_TOP) || build_encode_map(uhc_encode_map, uhc_decode_map, UHCL2_C1_BOTTOM, UHCL2_C1_TOP, UHCL2_C2_BOTTOM, UHCL2_C2_TOP)) { /* memory error */ for (i = 0; i < 256; i++) { if (ksx1001_encode_map[i] != NULL) PyMem_Del(ksx1001_encode_map[i]); if (uhc_encode_map[i] != NULL) PyMem_Del(uhc_encode_map[i]); } return -1; } return 0; } static struct PyMethodDef _ko_codecs_methods[] = { {NULL, NULL}, }; void init_ko_codecs(void) { PyObject *m; m = Py_InitModule("_ko_codecs", _ko_codecs_methods); PyModule_AddObject(m, "euc_kr_encode", _PyMultibyteEncoder_Create(&euc_kr_codec_encoder, "euc_kr")); PyModule_AddObject(m, "euc_kr_decode", _PyMultibyteDecoder_Create(&euc_kr_codec_decoder, "euc_kr")); PyModule_AddObject(m, "cp949_encode", _PyMultibyteEncoder_Create(&cp949_codec_encoder, "cp949")); PyModule_AddObject(m, "cp949_decode", _PyMultibyteDecoder_Create(&cp949_codec_decoder, "cp949")); if (PyErr_Occurred() || init_maps()) Py_FatalError("can't initialize the _ko_codecs module"); } /* * ex: ts=8 sts=4 et */ 1.1 cjkcodecs/src/_zh_CN_codecs.c Index: _zh_CN_codecs.c =================================================================== /* * _zh_CN_codecs.c: Simplified Chinese Codecs Implementation * * Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id: _zh_CN_codecs.c,v 1.1 2003/04/20 17:35:31 perky Exp $
 */

#include "Python.h"
#include "multibytecodec.h"

/* One double-byte code, lead byte in the high 8 bits. */
typedef unsigned short DBCHAR;
/* Value in a decode table that marks "no Unicode mapping". */
#define UNIINV Py_UNICODE_REPLACEMENT_CHARACTER
/* Value in an encode page that marks "no double-byte mapping". */
#define NOCHAR 0x0000
#include "_zh_CN_codecs.h"

/* Encode tables: one page pointer per high byte of the code point. */
static DBCHAR *gb2312_encode_map[256];
static DBCHAR *gbk_encode_map[256];
static DBCHAR *gb18030_encode_map[256];

/* NOTE(review): presumably holds the bytes of an incomplete GB18030
 * sequence between decode calls; confirm against the gb18030 decoder. */
struct gb18030dec_state {
    unsigned char pending[4];
    size_t pendingsize;
};

/*
 * DBCS_DECODE expands to an `if (...) ;` statement with an empty body, so
 * invocations can be chained with `else`.  The condition holds only when
 * c1 and c2 lie inside the pfx##_C1 / pfx##_C2 ranges, the row
 * map[c1 & 0x7f] exists, and the entry found there (stored into `ass` as a
 * side effect) is not UNIINV.
 */
#define DBCS_DECODE(c1, c2, pfx, map, ass)                      \
    if ((c1) >= pfx##_C1_BOTTOM &&                              \
        (c1) <= pfx##_C1_TOP &&                                 \
        (map)[(c1) & 0x7f] != NULL &&                           \
        (c2) >= pfx##_C2_BOTTOM &&                              \
        (c2) <= pfx##_C2_TOP &&                                 \
        ((ass) = (map)[(c1) & 0x7f][(c2) - pfx##_C2_BOTTOM])    \
            != UNIINV) ;

/*
 * EUC (the most popular) instance of GB2312
 */

/*
 * gb2312_encode: convert the Py_UNICODE input in buf to GB2312 bytes.
 *
 * Code points below 0x80 are copied through as a single byte.  Any other
 * code point is looked up in gb2312_encode_map (page = high byte, slot =
 * low byte) and written as two bytes, high byte first.
 *
 * Returns 0 once the input is exhausted; MBERR_TOOSMALL when
 * HAS_NOT_ENOUGH_SPACE reports that the next character does not fit (the
 * input cursor is not advanced); MBERR_UNDEFINED when the code point has
 * no mapping, with err->start / err->end spanning that one input position.
 * hdl and ctx are not used.
 */
static int
gb2312_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx,
              PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err)
{
    DBCHAR *map, code;

    while (buf->inbuf < buf->inbuf_end) {
        Py_UNICODE nc = *buf->inbuf;

        if (nc < 0x80) {
            /* ASCII range: passed through unchanged */
            if (HAS_NOT_ENOUGH_SPACE(buf, 1))
                return MBERR_TOOSMALL;
            *buf->outbuf++ = (unsigned char)nc;
            buf->inbuf++;
            continue;
        }
#if Py_UNICODE_SIZE == 4
        else if (nc >= 0x10000) {
            SETERR_INBUF(err, buf);
            err->start = INBUFPOS(buf);
            err->end = err->start + 1;
            return MBERR_UNDEFINED;
            /* all characters of gb2312 are included in BMP. */
        }
#endif
        map = gb2312_encode_map[nc >> 8];
        /* a missing page or a NOCHAR slot both mean "not encodable" */
        if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) {
            SETERR_INBUF(err, buf);
            err->start = INBUFPOS(buf);
            err->end = err->start + 1;
            return MBERR_UNDEFINED;
        }

        if (HAS_NOT_ENOUGH_SPACE(buf, 2))
            return MBERR_TOOSMALL;
        *buf->outbuf++ = code >> 8;
        *buf->outbuf++ = code & 0xFF;
        buf->inbuf++;
    }
    return 0;
}

/* gb2312_decode_open: start with no pending lead byte.  Always returns 0. */
static int
gb2312_decode_open(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx)
{
    *ctx = NULL;
    return 0;
}

/*
 * gb2312_decode: convert GB2312 bytes in buf to Py_UNICODE.
 *
 * *ctx carries an unpaired lead byte between calls, stored as an integer
 * cast to the context pointer (NULL means none).  A byte without the high
 * bit is copied through; a byte with the high bit set is held in `pending`
 * and combined with the next byte through gb2312_decode_map.
 *
 * Returns 0 at end of input (a still-pending lead byte is saved in *ctx);
 * MBERR_TOOSMALL when HAS_NOT_ENOUGH_SPACE reports no room; MBERR_ILLSEQ
 * or MBERR_UNDEFINED for a pair that cannot be decoded, with err filled in
 * and the lead byte left in *ctx.  hdl is not used.
 */
static int
gb2312_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx,
              PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err)
{
    Py_UNICODE code;
    unsigned char pending;

    /* pick up the lead byte left over from the previous call, if any */
    pending = (unsigned char)(long)*ctx;
    *ctx = NULL;

    while (buf->inbuf < buf->inbuf_end) {
        unsigned char nc = *buf->inbuf;

        if (!pending) {
            if (nc & 0x80) {
                /* lead byte: remember it and wait for the trail byte */
                pending = nc;
                buf->inbuf++;
            } else {
                if (HAS_NOT_ENOUGH_SPACE(buf, 1))
                    return MBERR_TOOSMALL;
                *buf->outbuf++ = nc;
                buf->inbuf++;
            }
        } else {
            if (nc < GB2312_C2_BOTTOM || nc > GB2312_C2_TOP ||
                gb2312_decode_map[pending & 0x7F] == NULL ||
                (code = gb2312_decode_map[pending & 0x7F][
                            nc - GB2312_C2_BOTTOM]) == UNIINV) {
                if (INBUFPOS(buf) < 1) {
                    /* the pending character is from previous buffer */
                    /* NOTE(review): the PyMem_Malloc result is written
                     * through without a NULL check. */
                    err->object = PyMem_Malloc(2);
                    err->objlength = 2;
                    err->object[0] = pending;
                    err->object[1] = nc;
                    /* huh? characters from current buffer only! */
                    err->start = 0;
                    err->end = 1;
                } else {
                    SETERR_INBUF(err, buf);
                    err->end = INBUFPOS(buf) + 1;
                    err->start = err->end - 2;
                }
                /* keep the lead byte in the context on the error path */
                *ctx = (PyMultibyteDecoder_Context)(long)pending;
                if (pending < GB2312_C1_BOTTOM || nc < GB2312_C2_BOTTOM)
                    return MBERR_ILLSEQ;
                else
                    return MBERR_UNDEFINED;
            }
            if (HAS_NOT_ENOUGH_SPACE(buf, 1)) {
                /* retry later with the same lead byte */
                *ctx = (PyMultibyteDecoder_Context)(long)pending;
                return MBERR_TOOSMALL;
            }
            *buf->outbuf++ = code;
            buf->inbuf++;
            pending = 0;
        }
    }
    if (pending)
        *ctx = (PyMultibyteDecoder_Context)(long)pending;
    return 0;
}

/*
 * gb2312_decode_flush: report a lead byte that never received its trail
 * byte.
 *
 * Returns 0 when *ctx is NULL.  Otherwise fills in err, clears *ctx and
 * returns MBERR_TOOFEW.
 */
static int
gb2312_decode_flush(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx,
                    PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err)
{
    if (*ctx != NULL) {
        if (INBUFPOS(buf) < 1) {
            /* the pending character is from the previous buffer */
            /* NOTE(review): the PyMem_Malloc result is written through
             * without a NULL check. */
            err->object = PyMem_Malloc(1);
            err->objlength = 1;
            *err->object = (unsigned char)(long)*ctx;
            err->start = err->end = 0;
            /* no character on current buffer */
        } else {
            /* we can be sure that the last character on inbuf is the
             * pending one, here. every error situation clears dstate and
             * that is the only way to move the buffer cursor
             * discontinuously. */
            SETERR_INBUF(err, buf);
            err->end = INBUFPOS(buf);
            err->start = err->end - 1;
        }
        *ctx = NULL;
        return MBERR_TOOFEW;
    }
    return 0;
}

/* gb2312_decode_reset: drop any pending lead byte.  Always returns 0. */
static int
gb2312_decode_reset(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx)
{
    *ctx = NULL;
    return 0;
}

/* Encoder descriptor for "gb2312": only the encode slot is populated. */
static PyMultibyteEncoder_Codec gb2312_codec_encoder = {
    "gb2312",               /* name */
    0,                      /* init */
    0,                      /* shutdown */
    0,                      /* open */
    0,                      /* close */
    gb2312_encode,          /* encode */
    0,                      /* flush */
    0,                      /* reset */
    0,                      /* putrepl */
};

/* Decoder descriptor for "gb2312": open, decode, flush and reset are set. */
static PyMultibyteDecoder_Codec gb2312_codec_decoder = {
    "gb2312",               /* name */
    0,                      /* init */
    0,                      /* shutdown */
    gb2312_decode_open,     /* open */
    0,                      /* close */
    gb2312_decode,          /* decode */
    gb2312_decode_flush,    /* flush */
    gb2312_decode_reset,    /* reset */
};

/*
 * CP936: Microsoft CodePage 936, a.k.a. GBK
 *
 * - GBK is backward compatible to gb2312 and incorporated Big5,
 *   GB12345 and GB13000 characters.
*/

/*
 * cp936_encode: convert the Py_UNICODE input in buf to CP936 bytes.
 *
 * Code points below 0x80 are copied through as a single byte.  Any other
 * code point is looked up in gbk_encode_map (page = high byte, slot = low
 * byte) and written as two bytes, high byte first.
 *
 * Returns 0 once the input is exhausted; MBERR_TOOSMALL when
 * HAS_NOT_ENOUGH_SPACE reports that the next character does not fit (the
 * input cursor is not advanced); MBERR_UNDEFINED when the code point has
 * no mapping, with err->start / err->end spanning that one input position.
 * hdl and ctx are not used.
 */
static int
cp936_encode(PyMultibyteEncoder_Handle *hdl, PyMultibyteEncoder_Context *ctx,
             PyMultibyteEncoder_Buffer *buf, PyMultibyteEncoder_Error *err)
{
    DBCHAR *map, code;

    while (buf->inbuf < buf->inbuf_end) {
        Py_UNICODE nc = *buf->inbuf;

        if (nc < 0x80) {
            /* ASCII range: passed through unchanged */
            if (HAS_NOT_ENOUGH_SPACE(buf, 1))
                return MBERR_TOOSMALL;
            *buf->outbuf++ = (unsigned char)nc;
            buf->inbuf++;
            continue;
        }
#if Py_UNICODE_SIZE == 4
        else if (nc >= 0x10000) {
            SETERR_INBUF(err, buf);
            err->start = INBUFPOS(buf);
            err->end = err->start + 1;
            return MBERR_UNDEFINED;
            /* all characters of cp936 are included in BMP. */
        }
#endif
        map = gbk_encode_map[nc >> 8];
        /* a missing page or a NOCHAR slot both mean "not encodable" */
        if (map == NULL || (code = map[nc & 0xff]) == NOCHAR) {
            SETERR_INBUF(err, buf);
            err->start = INBUFPOS(buf);
            err->end = err->start + 1;
            return MBERR_UNDEFINED;
        }

        if (HAS_NOT_ENOUGH_SPACE(buf, 2))
            return MBERR_TOOSMALL;
        *buf->outbuf++ = code >> 8;
        *buf->outbuf++ = code & 0xFF;
        buf->inbuf++;
    }
    return 0;
}

/*
 * cp936_decode: convert CP936 bytes in buf to Py_UNICODE.  An unpaired
 * lead byte is carried between calls in *ctx, stored as an integer cast to
 * the context pointer.
 */
static int
cp936_decode(PyMultibyteDecoder_Handle *hdl, PyMultibyteDecoder_Context *ctx,
             PyMultibyteDecoder_Buffer *buf, PyMultibyteDecoder_Error *err)
{
    Py_UNICODE code;
    unsigned char pending;

    pending = (unsigned char)(long)*ctx;
    *ctx = NULL;

    while (buf->inbuf < buf->inbuf_end) {
        unsigned char nc = *buf->inbuf;

        if (!pending) {
            if (nc & 0x80) {
                pending = nc;
                buf->inbuf++;
            } else {
                if (HAS_NOT_ENOUGH_SPACE(buf, 1))
                    return MBERR_TOOSMALL;
                *buf->outbuf++ = nc;
                buf->inbuf++;
            }
        } else {
            code = UNIINV;
            /* try each table in turn; the macros chain through `else` */
            GBK_PREDECODE(pending, nc, code)
            else DBCS_DECODE(pending, nc, GB2312, gb2312_decode_map, code)
            else DBCS_DECODE(pending, nc, GBKL1, gbk_decode_map, code)
            else DBCS_DECODE(pending, nc, GBKL2, gbk_decode_map, code)

            if (code == UNIINV) {
                if (INBUFPOS(buf) < 1) {
                    /* the pending character is from previous buffer */
                    err->object = PyMem_Malloc(2);
                    err->objlength = 2;
                    err->object[0] = pending;
                    err->object[1] = nc;
                    /* huh? characters from current buffer only!
*/ err->start = 0; err->end = 1; } else { SETERR_INBUF(err, buf); err->end = INBUFPOS(buf) + 1; err->start = err->end - 2; } *ctx = (PyMultibyteDecoder_Context)(long)pending; /* unlike gb2312, * cp936 has c... [truncated message content] |