[KoCo-CVS] [Commit] KoreanCodecs/src hangul.c
Brought to you by:
perky
From: Chang <pe...@us...> - 2002-04-25 04:49:04
|
perky 02/04/24 21:49:01 Modified: src hangul.c Log: - Implement join, split, conjoin, disjoint methods on korean.c.hangul Revision Changes Path 1.2 +323 -82 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/hangul.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- hangul.c 24 Apr 2002 14:16:56 -0000 1.1 +++ hangul.c 25 Apr 2002 04:49:01 -0000 1.2 @@ -4,14 +4,14 @@ * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> - * Date : $Date: 2002/04/24 14:16:56 $ + * Date : $Date: 2002/04/25 04:49:01 $ * Created : 25 April 2002 * - * $Revision: 1.1 $ + * $Revision: 1.2 $ */ static char *version = -"$Id: hangul.c,v 1.1 2002/04/24 14:16:56 perky Exp $"; +"$Id: hangul.c,v 1.2 2002/04/25 04:49:01 perky Exp $"; #include "Python.h" @@ -46,53 +46,61 @@ #define CHOSUNG_FILLER 0x115f #define JUNGSUNG_FILLER 0x1160 -#define F_JAEUM 0x01 -#define F_MOEUM 0x02 -#define F_CHOSUNG 0x04 -#define F_JUNGSUNG 0x08 -#define F_JONGSUNG 0x10 +static PyObject *UniNull, *UniSpace; +static PyObject *ErrorObject; #define MAX_MULTIJAMO 3 typedef struct _jamotype { char *name; Py_UNICODE code; - int multi[MAX_MULTIJAMO]; - int flags; + int multi[MAX_MULTIJAMO]; + char orders[3]; /* cho, jung, jong */ } jamotype; #define CODE(c) #c,c #define NOMULTI {0,0,0} -#define JC (F_JAEUM | F_CHOSUNG) -#define JJ (F_JAEUM | F_JONGSUNG) -#define JCJ (F_JAEUM | F_CHOSUNG | F_JONGSUNG) -#define MJ (F_MOEUM | F_JUNGSUNG) -jamotype jamos[] = { +#define J_C {0,-1,-1} +#define J_J {-1,-1,0} +#define J_CJ {0,-1,0} +#define M_J {-1,0,-1} +static jamotype jamos[] = { /* JAEUM */ - { CODE(G), NOMULTI, JCJ }, { CODE(GG), {G, G,}, JCJ }, { CODE(GS), {G, S,}, JJ }, - { CODE(N), NOMULTI, JCJ }, { CODE(NJ), {N, J,}, JJ }, { CODE(NH), {N, H,}, JJ }, - { CODE(D), NOMULTI, JCJ }, { CODE(DD), {D, D,}, JC }, { CODE(L), NOMULTI, JCJ }, - { CODE(LG), {L, G,}, JJ }, { CODE(LM), {L, M,}, JJ }, { CODE(LB), {L, B,}, JJ }, - { CODE(LS), {L, S,}, JJ }, { CODE(LT), {L, T,}, JJ }, { CODE(LP), {L, P,}, JJ }, - { CODE(LH), {L, H,}, JJ }, { CODE(M), NOMULTI, JCJ }, { CODE(B), NOMULTI, JCJ }, - { CODE(BB), {B, B,}, JC }, { CODE(BS), {B, S,}, JJ }, { CODE(S), NOMULTI, JCJ }, - { CODE(SS), {S, S,}, JCJ }, { CODE(NG), NOMULTI, JCJ }, { CODE(J), NOMULTI, JCJ }, - { CODE(JJ), {J, J,}, JC }, { CODE(C), NOMULTI, JCJ }, { CODE(K), NOMULTI, JCJ }, - { CODE(T), NOMULTI, JCJ }, { CODE(P), NOMULTI, JCJ }, { CODE(H), NOMULTI, JCJ }, + { CODE(G), NOMULTI, J_CJ }, { CODE(GG), {G, G,}, J_CJ }, { CODE(GS), {G, S,}, J_J }, + { CODE(N), NOMULTI, J_CJ }, { CODE(NJ), {N, J,}, J_J }, { CODE(NH), {N, H,}, J_J }, + { CODE(D), NOMULTI, J_CJ }, { CODE(DD), {D, D,}, J_C }, { CODE(L), NOMULTI, J_CJ }, + { CODE(LG), {L, G,}, J_J }, { CODE(LM), {L, M,}, J_J }, { CODE(LB), {L, B,}, J_J }, + { CODE(LS), {L, S,}, J_J }, { CODE(LT), {L, T,}, J_J }, { CODE(LP), {L, P,}, J_J }, + { CODE(LH), {L, H,}, J_J }, { CODE(M), NOMULTI, J_CJ }, { CODE(B), NOMULTI, J_CJ }, + { CODE(BB), {B, B,}, J_C }, { CODE(BS), {B, S,}, J_J }, { CODE(S), NOMULTI, J_CJ }, + { CODE(SS), {S, S,}, J_CJ }, { CODE(NG), NOMULTI, J_CJ }, { CODE(J), NOMULTI, J_CJ }, + { CODE(JJ), {J, J,}, J_C }, { CODE(C), NOMULTI, J_CJ }, { CODE(K), NOMULTI, J_CJ }, + { CODE(T), NOMULTI, J_CJ }, { CODE(P), NOMULTI, J_CJ }, { CODE(H), NOMULTI, J_CJ }, /* MOEUM */ - { CODE(A), NOMULTI, MJ }, { CODE(AE), {A, I,}, MJ }, { CODE(YA), NOMULTI, MJ }, - { CODE(YAE), {YA,I}, MJ }, { CODE(EO), NOMULTI, MJ }, { CODE(E), NOMULTI, MJ }, - { CODE(YEO), NOMULTI, MJ }, { CODE(YE), {YEO,I}, MJ }, { CODE(O), NOMULTI, MJ }, - { CODE(WA), {O, A}, MJ }, { CODE(WAE), {O,A,I}, MJ }, { CODE(OE), {O, I}, MJ }, - { CODE(YO), NOMULTI, MJ }, { CODE(U), NOMULTI, MJ }, { CODE(WEO), {U, EO}, MJ }, - { CODE(WE), {U, E}, MJ }, { CODE(WI), {U, I}, MJ }, { CODE(YU), NOMULTI, MJ }, - { CODE(EU), NOMULTI, MJ }, { CODE(YI), {EU, I}, MJ }, { CODE(I), NOMULTI, MJ }, + { CODE(A), NOMULTI, M_J }, { CODE(AE), {A, I,}, M_J }, { CODE(YA), NOMULTI, M_J }, + { CODE(YAE), {YA,I}, M_J }, { CODE(EO), NOMULTI, M_J }, { CODE(E), NOMULTI, M_J }, + { CODE(YEO), NOMULTI, M_J }, { CODE(YE), {YEO,I}, M_J }, { CODE(O), NOMULTI, M_J }, + { CODE(WA), {O, A}, M_J }, { CODE(WAE), {O,A,I}, M_J }, { CODE(OE), {O, I}, M_J }, + { CODE(YO), NOMULTI, M_J }, { CODE(U), NOMULTI, M_J }, { CODE(WEO), {U, EO}, M_J }, + { CODE(WE), {U, E}, M_J }, { CODE(WI), {U, I}, M_J }, { CODE(YU), NOMULTI, M_J }, + { CODE(EU), NOMULTI, M_J }, { CODE(YI), {EU, I}, M_J }, { CODE(I), NOMULTI, M_J }, /* END MARKER */ - { 0, 0, NOMULTI, 0 }, + { 0, 0, NOMULTI, {0,} }, }; -#undef JC, JJ, JCJ, MJ, NOMULTI, CODE +#undef J_C, J_J, J_CJ, M_J, NOMULTI, CODE +static jamotype *jamo_chosung[NCHOSUNG], *jamo_jungsung[NJUNGSUNG], *jamo_jongsung[NJONGSUNG]; + +#define getJamotype(c) jamos[(c)-JAEUM_BOTTOM] #define isJaeum(c) (JAEUM_BOTTOM <= (c) && (c) <= JAEUM_TOP) #define isMoeum(c) (MOEUM_BOTTOM <= (c) && (c) <= MOEUM_TOP) +#define isHangulSyllable(c) (HANGUL_BOTTOM <= (c) && (c) <= HANGUL_TOP) +#define isChosung(c) (getJamotype(c).orders[0] >= 0) +#define isJungsung(c) (getJamotype(c).orders[1] >= 0) +#define isJongsung(c) (getJamotype(c).orders[2] >= 0) +#define getChosungOrder(c) (getJamotype(c).orders[0]) +#define getJungsungOrder(c) (getJamotype(c).orders[1]) +#define getJongsungOrder(c) (getJamotype(c).orders[2]) + static char Py_isJaeum__doc__[] = "isJaeum(code): Verify whether the code is Jaeum."; @@ -113,7 +121,8 @@ if (isJaeum(*code)) { Py_INCREF(Py_True); return Py_True; - } else { + } + else { Py_INCREF(Py_False); return Py_False; } @@ -138,76 +147,290 @@ if (isMoeum(*code)) { Py_INCREF(Py_True); return Py_True; - } else { + } + else { + Py_INCREF(Py_False); + return Py_False; + } +} + +static char Py_ishangul__doc__[] = "ishangul(code): Verify whether the code is hangul."; + +static PyObject * +Py_ishangul(PyObject *self, PyObject *args) +{ + Py_UNICODE *code; + int codelen; + + if (!PyArg_ParseTuple(args, "u#:ishangul", &code, &codelen)) + return NULL; + + if (codelen < 1) { + PyErr_Format(PyExc_ValueError, "need not null unicode string"); + return NULL; + } + + if (isHangulSyllable(*code) || isJaeum(*code) || isMoeum(*code)) { + Py_INCREF(Py_True); + return Py_True; + } + else { Py_INCREF(Py_False); return Py_False; } } -#if 0 -static char cp949_encode__doc__[] = "CP949 encoder"; +static char Py_join__doc__[] = "join([chosung, jungsung, jongsung]): Assemble hangul syllable from jamos."; static PyObject * -cp949_encode(PyObject *self, PyObject *args) +Py_join(PyObject *self, PyObject *args) { - Py_UNICODE *argptr, *srccur, *srcend; - int arglen, errtype = error_strict; - char *errors = NULL; - unsigned char *destptr, *destcur, *decbuf; + PyObject *argchar, *argelems[3]; + Py_UNICODE elems[3], *uobj; + int i; + + if (!PyArg_ParseTuple(args, "O:join", &argchar)) + return NULL; + + if (PyList_Check(argchar)) { + if (PyList_GET_SIZE(argchar) != 3) + goto argerr; + for (i = 0; i < 3; i ++) + argelems[i] = PyList_GET_ITEM(argchar, i); + } + else if (PyTuple_Check(argchar)) { + if (PyTuple_GET_SIZE(argchar) != 3) + goto argerr; + for (i = 0; i < 3; i ++) + argelems[i] = PyTuple_GET_ITEM(argchar, i); + } + else { +argerr: PyErr_Format(PyExc_ValueError, "need list or tuple with 3 unicode elements"); + return NULL; + } + + for (i = 0; i < 3; i ++) { + if ((uobj = PyUnicode_AsUnicode(argelems[i])) == NULL) + goto argerr; + if (PyUnicode_GET_SIZE(argelems[i])) + elems[i] = *uobj; + else + elems[i] = NULL; + } + + if ( (elems[0] && (!isJaeum(elems[0]) || !isChosung(elems[0]))) /* Chosung validity */ + || (elems[1] && (!isMoeum(elems[1]))) /* Jungsung validity */ + || (elems[2] && (!isJaeum(elems[2]) || !isJongsung(elems[2]))) ) { + PyErr_Format(ErrorObject, "not valid jamo combination"); + return NULL; + } + + if ((!elems[0] || !elems[1]) && elems[2]) { + PyErr_Format(ErrorObject, "trying to assemble character which " + "is not in unicode map"); + return NULL; + } + else if (elems[0] && !elems[1]) { + Py_INCREF(argelems[0]); + return argelems[0]; + } + else if (elems[1] && !elems[0]) { + Py_INCREF(argelems[1]); + return argelems[1]; + } + else if (!elems[0]) { /* [Null, Null, Null] */ + Py_INCREF(UniSpace); + return UniSpace; + } + else { + Py_UNICODE code; + + code = ((getChosungOrder(elems[0]) * NJUNGSUNG) + getJungsungOrder(elems[1])) * + NJONGSUNG + getJongsungOrder(elems[2]) + HANGUL_BOTTOM; + return PyUnicode_FromUnicode(&code, 1); + } +} + +static char Py_split__doc__[] = "split(code): Disassemble hangul syllable into jamos."; + +static PyObject * +Py_split(PyObject *self, PyObject *args) +{ + Py_UNICODE *code; PyObject *r; + int codelen; + + if (!PyArg_ParseTuple(args, "u#:split", &code, &codelen)) + return NULL; + + if (codelen < 1) { + PyErr_Format(PyExc_ValueError, "need not null unicode string"); + return NULL; + } + + if (isHangulSyllable(*code)) { + Py_UNICODE cho, jung, jong; + PyObject *jongobj; + Py_UNICODE hseq, t; + + hseq = *code - HANGUL_BOTTOM; + + cho = jamo_chosung[hseq / (NJUNGSUNG*NJONGSUNG)]->code; + jung = jamo_jungsung[(hseq / NJONGSUNG) % NJUNGSUNG]->code; + + if ((t = hseq % NJONGSUNG) != NULL) { + jong = jamo_jongsung[t]->code; + jongobj = PyUnicode_FromUnicode(&jong, 1); + } else { + jongobj = UniNull; + Py_INCREF(UniNull); + } + + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, PyUnicode_FromUnicode(&cho, 1)); + PyTuple_SET_ITEM(r, 1, PyUnicode_FromUnicode(&jung, 1)); + PyTuple_SET_ITEM(r, 2, jongobj); - if (!PyArg_ParseTuple(args, "u#|z:cp949_encode", &argptr, &arglen, &errors)) + return r; + } + else if (isJaeum(*code)) { + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, PyUnicode_FromUnicode(code, 1)); + PyTuple_SET_ITEM(r, 1, UniNull); Py_INCREF(UniNull); + PyTuple_SET_ITEM(r, 2, UniNull); Py_INCREF(UniNull); + return r; + } + else if (isMoeum(*code)) { + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, UniNull); Py_INCREF(UniNull); + PyTuple_SET_ITEM(r, 1, PyUnicode_FromUnicode(code, 1)); + PyTuple_SET_ITEM(r, 2, UniNull); Py_INCREF(UniNull); + return r; + } + else { + PyErr_Format(ErrorObject, "not a hangul code"); return NULL; + } +} + +static char Py_conjoin__doc__[] = "conjoin(unicodestring): conjoin unicode johab string into unicode syllable string"; + +static PyObject * +Py_conjoin(PyObject *self, PyObject *args) +{ + PyObject *r; + Py_UNICODE *code, *dst, *dstorg, c; + int cho, jung, jong; + int codelen, i; - errtype = error_type(errors); - if (errtype == error_undef) + if (!PyArg_ParseTuple(args, "u#:conjoin", &code, &codelen)) return NULL; - destcur = destptr = PyMem_New(unsigned char, arglen*2+1); - for (srccur = argptr, srcend = argptr + arglen; srccur < srcend; srccur++) { - if (*srccur <= 0x7F) - *(destcur++) = *srccur; - else { - decbuf = _ksc5601_encode(*srccur); - if (!decbuf) - decbuf = _uhc_encode(*srccur); - if(decbuf == 0) { - switch (errtype) { - case error_strict: - PyMem_Del(destptr); - PyErr_Format(PyExc_UnicodeError, - "CP949 encoding error: invalid character \\u%04x", - *srccur); - return NULL; - break; - case error_replace: - *(destcur++) = 0xa1; - *(destcur++) = 0xa1; - break; - /* case error_ignore: break; */ + dstorg = dst = PyMem_New(Py_UNICODE, codelen); + + for (i = 0; i < codelen; i++) { + c = code[i]; + if ((JBASE_CHOSUNG <= c && c <= 0x1112) || c == CHOSUNG_FILLER) { + if (codelen > i+1 && JUNGSUNG_FILLER <= code[i+1] && code[i+1] <= 0x1175) { + if (c == CHOSUNG_FILLER) cho = -1; + else cho = c - JBASE_CHOSUNG; + if (code[i+1] == JUNGSUNG_FILLER) jung = -1; + else jung = code[i+1] - JBASE_JUNGSUNG; + + if (codelen > i+2 && JBASE_JONGSUNG <= code[i+2] && code[i+2] <= 0x11c2) { + jong = code[i+2] - JBASE_JONGSUNG + 1; + i += 2; + } + else { + jong = 0; i++; + } + + if (jong && (cho == -1 || jung == -1)) { /* can't trans to syllable */ + if (cho >= 0) *(dst++) = jamo_chosung[cho]->code; + if (jung >= 0) *(dst++) = jamo_jungsung[jung]->code; + *(dst++) = jamo_jongsung[jong]->code; } - } else { - *(destcur++) = decbuf[0]; - *(destcur++) = decbuf[1]; + else if (cho == -1) /* jungsung only */ + *(dst++) = jamo_jungsung[jung]->code; + else if (jung == -1) /* chosung only */ + *(dst++) = jamo_chosung[cho]->code; + else /* full set */ + *(dst++) = HANGUL_BOTTOM + (cho * NJUNGSUNG + jung) * NJONGSUNG + jong; } + else if (c != CHOSUNG_FILLER) /* chosung only */ + *(dst++) = jamo_chosung[c-JBASE_CHOSUNG]->code; } + else if (JBASE_JUNGSUNG <= c && c <= 0x1175) /* jungsung only */ + *(dst++) = jamo_jungsung[c-JBASE_JUNGSUNG]->code; + else + *(dst++) = c; } - r = codec_tuple(PyString_FromStringAndSize((char*)destptr, destcur - destptr), arglen); - PyMem_Del(destptr); + r = PyUnicode_FromUnicode(dstorg, dst-dstorg); + PyMem_Del(dstorg); + return r; } -#endif +static char Py_disjoint__doc__[] = "disjoint(unicodestring): disjoint unicode syllable string into unicode johab string"; + +static PyObject * +Py_disjoint(PyObject *self, PyObject *args) +{ + Py_UNICODE *code, *dst, *dstorg, c; + PyObject *r; + int codelen, i; + + if (!PyArg_ParseTuple(args, "u#:split", &code, &codelen)) + return NULL; + + dstorg = dst = PyMem_New(Py_UNICODE, codelen*3); + + for (i = 0; i < codelen; i++) { + c = code[i]; + if (isHangulSyllable(c)) { + int hseq; + Py_UNICODE jong; + + hseq = c - HANGUL_BOTTOM; + jong = hseq % NJONGSUNG; + + *(dst++) = hseq / (NJUNGSUNG * NJONGSUNG) + JBASE_CHOSUNG; + *(dst++) = (hseq / NJONGSUNG) % NJUNGSUNG + JBASE_JUNGSUNG; + if (jong) + *(dst++) = jong + JBASE_JONGSUNG - 1; + } + else if (isJaeum(c) && isChosung(c)) { + *(dst++) = getChosungOrder(c) + JBASE_CHOSUNG; + *(dst++) = JUNGSUNG_FILLER; + } + else if (isMoeum(c)) { + *(dst++) = CHOSUNG_FILLER; + *(dst++) = getJungsungOrder(c) + JBASE_JUNGSUNG; + } else + *(dst++) = c; + } + + r = PyUnicode_FromUnicode(dstorg, dst-dstorg); + PyMem_Del(dstorg); + + return r; +} + /* List of methods defined in the module */ #define meth(name, func, doc) {name, (PyCFunction)func, METH_VARARGS, doc} static struct PyMethodDef hangul_methods[] = { - meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), - meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), + meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), + meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), + meth("ishangul", Py_ishangul, Py_ishangul__doc__), + meth("join", Py_join, Py_join__doc__), + meth("split", Py_split, Py_split__doc__), + meth("conjoin", Py_conjoin, Py_conjoin__doc__), + meth("disjoint", Py_disjoint, Py_disjoint__doc__), {NULL, NULL}, }; @@ -230,6 +453,11 @@ /* Create the module and add the functions */ m = Py_InitModule("hangul", hangul_methods); + UniNull = PyUnicode_FromUnicode(NULL, 0); + tuni[0] = 0x3000; /* Unicode Double-wide Space */ + UniSpace = PyUnicode_FromUnicode(tuni, 1); + Py_INCREF(UniSpace); + /* Add some symbolic constants to the module */ d = PyModule_GetDict(m); SET_INTCONSTANT(d, NCHOSUNG); @@ -253,7 +481,9 @@ PyDict_SetItemString(d, "Chosung", Chosung); PyDict_SetItemString(d, "Jungsung", Jungsung); PyDict_SetItemString(d, "Jongsung", Jongsung); - PyList_SET_ITEM(Jongsung, cur_jong++, PyUnicode_FromUnicode(NULL, 0)); + jamo_jongsung[cur_jong] = NULL; + Py_INCREF(UniNull); + PyList_SET_ITEM(Jongsung, cur_jong++, UniNull); /* Create Jaeum and Moeum meta class */ JaeumDict = PyDict_New(); @@ -294,20 +524,27 @@ PyDict_SetItemString(d, jamo->name, unijamo); Py_INCREF(unijamo); /* PuTyple_SET_ITEM steals reference */ - if (jamo->flags & F_JAEUM) { + if (isJaeum(jamo->code)) { PyTuple_SET_ITEM(JaeumCodes, cur_jaeum++, unijamo); - if (jamo->flags & F_CHOSUNG) { + if (isChosung(jamo->code)) { + jamo->orders[0] = cur_cho; + jamo_chosung[cur_cho] = jamo; PyList_SET_ITEM(Chosung, cur_cho++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } - if (jamo->flags & F_JONGSUNG) { + if (isJongsung(jamo->code)) { + jamo->orders[2] = cur_jong; + jamo_jongsung[cur_jong] = jamo; PyList_SET_ITEM(Jongsung, cur_jong++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } multicls = JaeumMulti; - } else { /* Moeum */ + } + else { /* Moeum */ PyTuple_SET_ITEM(MoeumCodes, cur_moeum++, unijamo); - if (jamo->flags & F_JUNGSUNG) { + if (isJungsung(jamo->code)) { + jamo->orders[1] = cur_jung; + jamo_jungsung[cur_jung] = jamo; PyList_SET_ITEM(Jungsung, cur_jung++, unijamo); PyDict_SetItemString(MoeumDict, jamo->name, unijamo); } @@ -347,8 +584,12 @@ PyDict_SetItemString(d, "CHOSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = JUNGSUNG_FILLER; PyDict_SetItemString(d, "JUNGSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); + PyDict_SetItemString(d, "Null", UniNull); PyDict_SetItemString(d, "version", PyString_FromString(version)); + + ErrorObject = PyErr_NewException("hangul.UnicodeHangulError", NULL, NULL); + PyDict_SetItemString(d, "UnicodeHangulError", ErrorObject); /* Check for errors */ if (PyErr_Occurred()) |