[KoCo-CVS] [Commit] KoreanCodecs/korean cp949.py euc_kr.py iso_2022_kr.py johab.py mackorean.py qwerty2bul.py unijohab.py
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:13
|
perky 03/01/12 14:54:12 Modified: korean cp949.py euc_kr.py iso_2022_kr.py johab.py mackorean.py qwerty2bul.py unijohab.py Removed: korean hangul.py Log: Remove selective framework for two implementations, 'C' and 'Python'. We'll maintain only 1 implementation from now. Accordingly, --with[out]-extension options is removed, too. Revision Changes Path 1.5 +18 -5 KoreanCodecs/korean/cp949.py Index: cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/cp949.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- cp949.py 9 Jan 2003 21:35:48 -0000 1.4 +++ cp949.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,23 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: cp949.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: cp949.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.cp949 import * -except ImportError: - from korean.python.cp949 import * +import codecs +import _koco + +class Codec(codecs.Codec): + encode = _koco.cp949_encode + decode = _koco.cp949_decode + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, _koco.StreamReader, codecs.StreamReader): + encoding = 'cp949' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) 1.5 +18 -5 KoreanCodecs/korean/euc_kr.py Index: euc_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/euc_kr.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- euc_kr.py 9 Jan 2003 21:35:48 -0000 1.4 +++ euc_kr.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,23 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: euc_kr.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: 
euc_kr.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.euc_kr import * -except ImportError: - from korean.python.euc_kr import * +import codecs +import _koco + +class Codec(codecs.Codec): + encode = _koco.euc_kr_encode + decode = _koco.euc_kr_decode + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, _koco.StreamReader, codecs.StreamReader): + encoding = 'euc-kr' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) 1.5 +139 -5 KoreanCodecs/korean/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/iso_2022_kr.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- iso_2022_kr.py 9 Jan 2003 21:35:48 -0000 1.4 +++ iso_2022_kr.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,144 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: iso_2022_kr.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: iso_2022_kr.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.iso_2022_kr import * -except ImportError: - from korean.python.iso_2022_kr import * +import codecs + +KSC5601_CODEC = 'korean.euc-kr' + +US_ASCII = 1 +KSC5601_1987 = 2 + +G0, G1 = 0, 1 # iso-2022-kr doesn't handle G2 and G3 area. 
+ +CHARSETS = { + "\033(B": (G0, US_ASCII), + "\033)B": (G1, US_ASCII), + "\033$(C": (G0, KSC5601_1987), + "\033$)C": (G1, KSC5601_1987), +} +SI = '\x0f' +SO = '\x0e' +ESC = '\033' + +DESIGNATION_MARK = {} +for k, v in CHARSETS.items(): + DESIGNATION_MARK[v] = k + +class Codec(codecs.Codec): + # Unicode to character buffer + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + designation = [US_ASCII, US_ASCII] + new_designation = designation[:] + new_shiftstate = shiftstate = 0 + for c in data: + if c in ('\n', '\r'): + new_shiftstate = 0 + + if c < u'\u0080': + new_shiftstate = 0 + new_designation[0] = US_ASCII + s = c.encode("ascii", errors) + else: + new_shiftstate = 1 + new_designation[1] = KSC5601_1987 + s = c.encode('korean.euc_kr', errors) + + if designation[0] != new_designation[0]: + buffer.append(DESIGNATION_MARK[(G0, new_designation[0])]) + designation[0] = new_designation[0] + if designation[1] != new_designation[1]: + buffer.append(DESIGNATION_MARK[(G1, new_designation[1])]) + designation[1] = new_designation[1] + if shiftstate != new_shiftstate: + buffer.append([SI, SO][new_shiftstate]) + shiftstate = new_shiftstate + + if shiftstate: + s = chr(ord(s[0])&0x7F) + chr(ord(s[1])&0x7F) + buffer.append(s) + + if shiftstate: + buffer.append(SI) + + return (''.join(buffer), len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap_ideo, decmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + + designation = [US_ASCII, KSC5601_1987] + shiftstate = 0 + escstart = -1 + p = 0 + + while p < size: + if data[p] in ('\n', '\r'): + shiftstate = 0 + + if escstart >= 0: + if data[p].isalpha(): + escstr = data[escstart:p+1] + if CHARSETS.has_key(escstr): + charset = 
CHARSETS[escstr] + designation[charset[0]] = charset[1] + elif errors == 'strict': + raise UnicodeError, "unsupported charset found: " \ + + repr(data[escstart:p+1]) + escstart = -1 + p += 1 + elif data[p] == SO: + shiftstate = 1 + p += 1 + elif data[p] == SI: + shiftstate = 0 + p += 1 + elif data[p] == ESC: + escstart = p + p += 1 + else: + if (ord(data[p]) | (shiftstate and 0x80 or 0x00)) >= 0x80: + codearea = G1 + else: + codearea = G0 + + if designation[codearea] == US_ASCII: + buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + elif ord(data[p]) & 0x7F >= 0x20: # KSC5601_1987 + c = data[p:p+2] + p += 2 + if len(c) == 2: + c = chr(ord(c[0])|0x80) + chr(ord(c[1])|0x80) + buffer.append(unicode(c, KSC5601_CODEC, errors)) + else: # control characters + buffer.append(unichr(ord(data[p]) & 0x7F)) + p += 1 + + return (u''.join(buffer), len(data)) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, codecs.StreamReader): + pass + # not implemented. + # (JapaneseCodecs's implementation is so different to adopt.) 
+ +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et 1.5 +183 -5 KoreanCodecs/korean/johab.py Index: johab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/johab.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- johab.py 9 Jan 2003 21:35:48 -0000 1.4 +++ johab.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,188 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: johab.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: johab.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.johab import * -except ImportError: - from korean.python.johab import * +import codecs + +from korean.hangul import Jaeum, Moeum, ishangul, split, join +encmap, decmap = {}, {} + +johab2uni_chosung = { + 1: u'', 2: Jaeum.G, 3: Jaeum.GG, 4: Jaeum.N, + 5: Jaeum.D, 6: Jaeum.DD, 7: Jaeum.L, 8: Jaeum.M, + 9: Jaeum.B, 10: Jaeum.BB, 11: Jaeum.S, 12: Jaeum.SS, + 13: Jaeum.NG, 14: Jaeum.J, 15: Jaeum.JJ, 16: Jaeum.C, + 17: Jaeum.K, 18: Jaeum.T, 19: Jaeum.P, 20: Jaeum.H, +} +johab2uni_jungsung = { + 2: u'', 3: Moeum.A, 4: Moeum.AE, 5: Moeum.YA, + 6: Moeum.YAE, 7: Moeum.EO, 10: Moeum.E, 11: Moeum.YEO, + 12: Moeum.YE, 13: Moeum.O, 14: Moeum.WA, 15: Moeum.WAE, + 18: Moeum.OE, 19: Moeum.YO, 20: Moeum.U, 21: Moeum.WEO, + 22: Moeum.WE, 23: Moeum.WI, 26: Moeum.YU, 27: Moeum.EU, + 28: Moeum.YI, 29: Moeum.I +} +johab2uni_jongsung = { + 1: u'', 2: Jaeum.G, 3: Jaeum.GG, 4: Jaeum.GS, + 5: Jaeum.N, 6: Jaeum.NJ, 7: Jaeum.NH, 8: Jaeum.D, + 9: Jaeum.L, 10: Jaeum.LG, 11: Jaeum.LM, 12: Jaeum.LB, + 13: Jaeum.LS, 14: Jaeum.LT, 15: Jaeum.LP, 16: Jaeum.LH, + 17: Jaeum.M, 19: Jaeum.B, 20: Jaeum.BS, 21: Jaeum.S, + 22: Jaeum.SS, 23: Jaeum.NG, 24: Jaeum.J, 25: Jaeum.C, + 26: Jaeum.K, 27: Jaeum.T, 28: Jaeum.P, 29: Jaeum.H +} + +uni2johab_chosung = 
{} +uni2johab_jungsung = {} +uni2johab_jongsung = {} +for k, v in johab2uni_chosung.items(): + uni2johab_chosung[v] = k +for k, v in johab2uni_jungsung.items(): + uni2johab_jungsung[v] = k +for k, v in johab2uni_jongsung.items(): + uni2johab_jongsung[v] = k + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + global encmap + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + + for c in data: + if c < u'\u0080': + buffer.append(c.encode("ascii", errors)) + elif ishangul(c): + cho, jung, jong = split(c) # all hangul can success + cho, jung, jong = ( + uni2johab_chosung[cho], + uni2johab_jungsung[jung], + uni2johab_jongsung[jong] + ) + code = 0x8000 | (cho<<10) | (jung<<5) | jong + buffer.append(chr(code>>8) + chr(code&0xFF)) + else: + if not encmap: + from korean.mappings import johab_ideograph + encmap = johab_ideograph.encoding_map + + if encmap.has_key(c): + buffer.append(encmap[c]) + elif errors == 'replace': + buffer.append('\x84\x41') + elif errors == 'strict': + raise UnicodeError, "cannot map \\u%04x to JOHAB" % ord(c) + + return (''.join(buffer), len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + p = 0 + while p < size: + if data[p] < '\x80': + buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + else: + c = data[p:p+2] + p += 2 + if len(c) == 2: + code = (ord(c[0])<<8) | ord(c[1]) + cho = (code >> 10) & 0x1f + jung = (code >> 5) & 0x1f + jong = (code) & 0x1f + if ( johab2uni_chosung.has_key(cho) and + johab2uni_jungsung.has_key(jung) and + johab2uni_jongsung.has_key(jong) ): + buffer.append( join([ + johab2uni_chosung[cho], + johab2uni_jungsung[jung], + johab2uni_jongsung[jong] + ]) ) + 
continue + + if not decmap: + from korean.mappings import johab_ideograph + decmap = johab_ideograph.decoding_map + + if decmap.has_key(c): + buffer.append(decmap[c]) + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte 0x%02x%02x found" % tuple(map(ord, c)) + + return (u''.join(buffer), size) + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + + def __init__(self, stream, errors='strict'): + codecs.StreamReader.__init__(self, stream, errors) + self.data = '' + + def _read(self, func, size): + if size == 0: + return u'' + if size is None or size < 0: + data = self.data + func() + self.data = '' + else: + data = self.data + func(max(size, 2) - len(self.data)) + size = len(data) + p = 0 + while p < size: + if data[p] < "\x80": + p = p + 1 + elif p + 2 <= size: + p = p + 2 + else: + break + data, self.data = data[:p], data[p:] + return self.decode(data)[0] + + def read(self, size=-1): + return self._read(self.stream.read, size) + + def readline(self, size=-1): + return self._read(self.stream.readline, size) + + def readlines(self, size=-1): + data = self._read(self.stream.read, size) + buffer = [] + end = 0 + while 1: + pos = data.find(u'\n', end) + if pos < 0: + if end < len(data): + buffer.append(data[end:]) + break + buffer.append(data[end:pos+1]) + end = pos+1 + return buffer + def reset(self): + self.data = '' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + 1.2 +181 -5 KoreanCodecs/korean/mackorean.py Index: mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/mackorean.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- mackorean.py 9 Jan 2003 22:40:39 -0000 1.1 +++ mackorean.py 12 Jan 2003 22:54:12 -0000 1.2 @@ -17,10 +17,186 @@ # 
along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: mackorean.py,v 1.1 2003/01/09 22:40:39 perky Exp $ +# $Id: mackorean.py,v 1.2 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.mackorean import * -except ImportError: - from korean.python.mackorean import * +import codecs +from korean.mappings import ksc5601_hangul, appleextension +encmap_hangul, decmap_hangul = ksc5601_hangul.encoding_map, ksc5601_hangul.decoding_map +encmap_apple, decmap_apple = appleextension.encoding_map, appleextension.decoding_map +encmap_ideo, decmap_ideo = {}, {} +encmap_misc, decmap_misc = {}, {} + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + global encmap_ideo, encmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + p = 0 + size = len(data) + + while p < size: + aemap = appleextension.multilevel_encmap + relp = 0 + while p + relp < size and aemap.has_key(data[p + relp]): + aemap = aemap[data[p + relp]] + relp += 1 + if aemap.has_key(None): + buffer.append(aemap[None]) + p += relp + continue + + c = data[p] + p += 1 + + if c < u'\u0080': + buffer.append(c.encode("ascii", errors)) + elif encmap_hangul.has_key(c): + buffer.append(encmap_hangul[c]) + elif encmap_apple.has_key(c): + buffer.append(encmap_apple[c]) + else: + if not encmap_misc: + from korean.mappings import ksc5601_misc + encmap_misc = ksc5601_misc.encoding_map + if encmap_misc.has_key(c): + buffer.append(encmap_misc[c]) + continue + + if not encmap_ideo: + from korean.mappings import ksc5601_ideograph + encmap_ideo = ksc5601_ideograph.encoding_map + if encmap_ideo.has_key(c): + buffer.append(encmap_ideo[c]) + continue + + if errors == 'replace': + buffer.append('\xa1\xa1') + elif errors == 'strict': + raise UnicodeError, ("cannot map " + "\\u%04x to MacKorean") % ord(c) + + return (''.join(buffer), 
len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap_ideo, decmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + p = 0 + while p < size: + if data[p] < '\x80': + buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + elif data[p] <= '\xa0' or data[p] == '\xff': + if decmap_apple.has_key(data[p]): + buffer.append(decmap_apple[data[p]]) + p += 1 + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte %s found" % ( + hex(ord(data[p]))) + p += 1 + else: + c = data[p:p+2] + p += 2 + if len(c) == 2: + if decmap_hangul.has_key(c): + buffer.append(decmap_hangul[c]) + continue + elif decmap_apple.has_key(c): + buffer.append(decmap_apple[c]) + continue + + if not decmap_misc: + from korean.mappings import ksc5601_misc + decmap_misc = ksc5601_misc.decoding_map + if decmap_misc.has_key(c): + buffer.append(decmap_misc[c]) + continue + + if not decmap_ideo: + from korean.mappings import ksc5601_ideograph + decmap_ideo = ksc5601_ideograph.decoding_map + if decmap_ideo.has_key(c): + buffer.append(decmap_ideo[c]) + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte 0x%s found" % ( + ''.join(["%02x"%ord(x) for x in c]) ) + + return (u''.join(buffer), size) + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + + def __init__(self, stream, errors='strict'): + codecs.StreamReader.__init__(self, stream, errors) + self.data = '' + + def _read(self, func, size): + if size == 0: + return u'' + if size is None or size < 0: + data = self.data + func() + self.data = '' + else: + data = self.data + func(max(size, 2) - 
len(self.data)) + size = len(data) + p = 0 + while p < size: + if data[p] < "\xa1" or data[p] == "\xff": + p = p + 1 + elif p + 2 <= size: + p = p + 2 + else: + break + data, self.data = data[:p], data[p:] + return self.decode(data)[0] + + def read(self, size=-1): + return self._read(self.stream.read, size) + + def readline(self, size=-1): + return self._read(self.stream.readline, size) + + def readlines(self, size=-1): + data = self._read(self.stream.read, size) + buffer = [] + end = 0 + while 1: + pos = data.find(u'\n', end) + if pos < 0: + if end < len(data): + buffer.append(data[end:]) + break + buffer.append(data[end:pos+1]) + end = pos+1 + return buffer + def reset(self): + self.data = '' + + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et 1.5 +177 -5 KoreanCodecs/korean/qwerty2bul.py Index: qwerty2bul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/qwerty2bul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- qwerty2bul.py 9 Jan 2003 21:35:48 -0000 1.4 +++ qwerty2bul.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,182 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: qwerty2bul.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: qwerty2bul.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.qwerty2bul import * -except ImportError: - from korean.python.qwerty2bul import * +import codecs +from korean.hangul import Moeum, Jaeum, Chosung, Jungsung, Jongsung +from korean.hangul import ishangul, join, split, isJaeum, isMoeum + +codekeymap = { + Jaeum.G: 'r', Jaeum.GG: 'R', Jaeum.GS: 'rt', + Jaeum.N: 's', Jaeum.NJ:'sw', Jaeum.NH: 'sg', Jaeum.D: 'e', + Jaeum.DD:'E', Jaeum.L: 'f', Jaeum.LG: 'fr', Jaeum.LM: 'fa', + Jaeum.LB:'fq', Jaeum.LS:'ft', Jaeum.LT: 'fx', Jaeum.LP: 'fv', + Jaeum.LH:'fg', Jaeum.M: 'a', 
Jaeum.B: 'q', Jaeum.BB: 'Q', + Jaeum.BS:'qt', Jaeum.S: 't', Jaeum.SS: 'T', Jaeum.NG: 'd', + Jaeum.J: 'w', Jaeum.JJ:'W', Jaeum.C: 'c', Jaeum.K: 'z', + Jaeum.T: 'x', Jaeum.P: 'v', Jaeum.H: 'g', + + Moeum.A: 'k', Moeum.AE:'o', Moeum.YA: 'i', Moeum.YAE:'O', + Moeum.EO:'j', Moeum.E: 'p', Moeum.YEO:'u', Moeum.YE: 'P', + Moeum.O: 'h', Moeum.WA:'hk', Moeum.WAE:'ho', Moeum.OE: 'hl', + Moeum.YO:'y', Moeum.U: 'n', Moeum.WEO:'nj', Moeum.WE: 'np', + Moeum.WI:'nl', Moeum.YU:'b', Moeum.EU: 'm', Moeum.YI: 'ml', + Moeum.I: 'l', + + u'': '', +} + +keycodemap = {} +for k, v in codekeymap.items(): + keycodemap[v] = k + keycodemap.setdefault(v.upper(), k) +keycodes = ''.join(keycodemap.keys()) +del k, v + + +class Automata_Hangul2: + + # must Unicode in / Unicode out + + def __init__(self): + self.clear() + + def pushcomp(self): + if self.chosung and not self.jungsung: + self.word_valid = 0 + self.word_comp.append(join([self.chosung, self.jungsung, self.jongsung])) + self.clearcomp() + + def clearcomp(self): + self.chosung = u'' + self.jungsung = u'' + self.jongsung = u'' + + def clear(self): + self.buff = [''] + self.word_raw = [] + self.word_comp = [] + self.word_valid = 1 + self.clearcomp() + + def convert(self, s): + self.clear() + + map(self.feed, s) + self.finalize() + + return u''.join(self.buff) + + def finalize(self): + if self.chosung or self.jungsung or self.jongsung: + self.pushcomp() + if self.word_raw or self.word_comp: + if self.word_valid: + self.buff.append(u''.join(self.word_comp)) + else: + self.word_valid = 1 + self.buff.append(u''.join(self.word_raw)) + + self.word_raw, self.word_comp = [], [] + + def feed(self, c): + self.word_raw.append(c) + if c in keycodes: + code = keycodemap[c] + if isJaeum(code): + if not self.chosung: # chosung O + if self.jungsung or self.jongsung: + self.word_valid = 0 + else: + self.chosung = code + elif not self.jungsung: # chosung O jungsung X + if self.jongsung: + self.word_valid = 0 + else: + self.pushcomp() + self.chosung = code + 
elif not self.jongsung: # chosung O jungsung O jongsung X + if code not in Jongsung: + self.pushcomp() + self.chosung = code + else: + self.jongsung = code + else: # full + trymul = codekeymap[self.jongsung] + c + if keycodemap.has_key(trymul): # can be multi jongsung + self.jongsung = keycodemap[trymul] + else: + self.pushcomp() + self.chosung = code + else: # MOEUM... + if not self.jongsung: + if not self.jungsung: # jungsung X jongsung X + self.jungsung = code + else: # jungsung O jongsung X + trymul = codekeymap[self.jungsung] + c + if keycodemap.has_key(trymul): # can be multi jungsung + self.jungsung = keycodemap[trymul] + else: + self.pushcomp() + self.jungsung = code + else: # jongsung O + if len(codekeymap[self.jongsung]) > 1: + ojong = keycodemap[codekeymap[self.jongsung][:-1]] + ncho = keycodemap[codekeymap[self.jongsung][-1]] + self.jongsung = ojong + self.pushcomp() + self.chosung = ncho + self.jungsung = code + else: + njong = self.jongsung + self.jongsung = u'' + self.pushcomp() + self.chosung = njong + self.jungsung = code + else: # non key code + self.finalize() + self.buff.append(c) + + +class Codec(codecs.Codec): + + BASECODEC = 'korean.cp949' # fallback codec of decoder + + # Unicode to key stroke + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + r = [] + for c in data: + if c <= u'\u0080': + r.append(c.encode('ascii')) + elif not ishangul(c): + r.append(c.encode(self.BASECODEC, errors=errors)) + else: + for k in split(c): + r.append(codekeymap[k]) + + r = ''.join(r) + return (r, len(r)) + + # key stroke to Unicode + def decode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + s = unicode(data, self.BASECODEC, errors) + am = Automata_Hangul2() + r = am.convert(s) + return (r, len(r)) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, 
codecs.StreamReader): + pass + +def getregentry(): + return (Codec().encode, Codec().decode, StreamReader, StreamWriter) 1.5 +31 -5 KoreanCodecs/korean/unijohab.py Index: unijohab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/unijohab.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- unijohab.py 9 Jan 2003 21:35:48 -0000 1.4 +++ unijohab.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,36 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: unijohab.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: unijohab.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.unijohab import * -except ImportError: - from korean.python.unijohab import * +import codecs +from korean.hangul import ishangul, disjoint, conjoin + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + return disjoint(data).encode('utf-8', errors), len(data) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + return conjoin(unicode(data, 'utf-8', errors)), len(data) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, codecs.StreamReader): + pass + # XXX: Temporarily None. + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) |