Thread: [KoCo-CVS] [Commit] KoreanCodecs/korean/python unijohab.py
Brought to you by: perky
From: Chang <pe...@us...> - 2002-04-24 07:38:13
|
perky 02/04/24 00:38:11 Modified: korean/python unijohab.py Log: - Change unijohab implementation to use newly introduced hangul.conjoin and hangul.disjoint Reviewed by: unittest ;) Revision Changes Path 1.4 +4 -92 KoreanCodecs/korean/python/unijohab.py Index: unijohab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/unijohab.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- unijohab.py 8 Apr 2002 12:49:57 -0000 1.3 +++ unijohab.py 24 Apr 2002 07:38:11 -0000 1.4 @@ -1,116 +1,28 @@ # Hye-Shik Chang <16 Feb 2002> -# $Id: unijohab.py,v 1.3 2002/04/08 12:49:57 perky Exp $ +# $Id: unijohab.py,v 1.4 2002/04/24 07:38:11 perky Exp $ import codecs - -from korean.hangul import Jaeum, Moeum, ishangul, split, join -encmap, decmap = {}, {} - -johab2uni_chosung = { - u'\u115f': u'', u'\u1100': Jaeum.G, u'\u1101': Jaeum.GG, - u'\u1102': Jaeum.N, u'\u1103': Jaeum.D, u'\u1104': Jaeum.DD, - u'\u1105': Jaeum.L, u'\u1106': Jaeum.M, u'\u1107': Jaeum.B, - u'\u1108': Jaeum.BB, u'\u1109': Jaeum.S, u'\u110a': Jaeum.SS, - u'\u110b': Jaeum.NG, u'\u110c': Jaeum.J, u'\u110d': Jaeum.JJ, - u'\u110e': Jaeum.C, u'\u110f': Jaeum.K, u'\u1110': Jaeum.T, - u'\u1111': Jaeum.P, u'\u1112': Jaeum.H -} -johab2uni_jungsung = { - u'\u1160': u'', u'\u1161': Moeum.A, u'\u1162': Moeum.AE, - u'\u1163': Moeum.YA, u'\u1164': Moeum.YAE, u'\u1165': Moeum.EO, - u'\u1166': Moeum.E, u'\u1167': Moeum.YEO, u'\u1168': Moeum.YE, - u'\u1169': Moeum.O, u'\u116a': Moeum.WA, u'\u116b': Moeum.WAE, - u'\u116c': Moeum.OE, u'\u116d': Moeum.YO, u'\u116e': Moeum.U, - u'\u116f': Moeum.WEO, u'\u1170': Moeum.WE, u'\u1171': Moeum.WI, - u'\u1172': Moeum.YU, u'\u1173': Moeum.EU, u'\u1174': Moeum.YI, - u'\u1175': Moeum.I -} -johab2uni_jongsung = { - u'': u'', u'\u11a8': Jaeum.G, u'\u11a9': Jaeum.GG, - u'\u11aa': Jaeum.GS, u'\u11ab': Jaeum.N, u'\u11ac': Jaeum.NJ, - u'\u11ad': Jaeum.NH, u'\u11ae': Jaeum.D, u'\u11af': Jaeum.L, - u'\u11b0': 
Jaeum.LG, u'\u11b1': Jaeum.LM, u'\u11b2': Jaeum.LB, - u'\u11b3': Jaeum.LS, u'\u11b4': Jaeum.LT, u'\u11b5': Jaeum.LP, - u'\u11b6': Jaeum.LH, u'\u11b7': Jaeum.M, u'\u11b8': Jaeum.B, - u'\u11b9': Jaeum.BS, u'\u11ba': Jaeum.S, u'\u11bb': Jaeum.SS, - u'\u11bc': Jaeum.NG, u'\u11bd': Jaeum.J, u'\u11be': Jaeum.C, - u'\u11bf': Jaeum.K, u'\u11c0': Jaeum.T, u'\u11c1': Jaeum.P, - u'\u11c2': Jaeum.H -} - -uni2johab_chosung = {} -uni2johab_jungsung = {} -uni2johab_jongsung = {} -for k, v in johab2uni_chosung.items(): - uni2johab_chosung[v] = k -for k, v in johab2uni_jungsung.items(): - uni2johab_jungsung[v] = k -for k, v in johab2uni_jongsung.items(): - uni2johab_jongsung[v] = k - +from korean.hangul import ishangul, disjoint, conjoin class Codec(codecs.Codec): # Unicode to character buffer def encode(self, data, errors='strict', supported_errors=('strict', 'ignore', 'replace')): - global encmap if errors not in supported_errors: raise UnicodeError, "unknown error handling" - buffer = [] - for c in data: - if ishangul(c): - cho, jung, jong = split(c) # all hangul can success - buffer.append( - uni2johab_chosung[cho] + - uni2johab_jungsung[jung] + - uni2johab_jongsung[jong] - ) - else: - buffer.append(c) - - return (u''.join(buffer).encode('utf-8', errors), len(data)) + return disjoint(data).encode('utf-8', errors), len(data) # character buffer to Unicode def decode(self, data, errors='strict', supported_errors=('strict', 'ignore', 'replace')): - global decmap if errors not in supported_errors: raise UnicodeError, "unknown error handling" - buffer = [] - data = unicode(data, 'utf-8', errors) - size = len(data) - p = 0 - while p < size: - if not u'\u1100' <= data[p] <= u'\u11FF': - buffer.append(data[p]) - p += 1 - else: - c = data[p:p+3] - try: - cho = johab2uni_chosung[c[0]] - jung = johab2uni_jungsung[c[1]] - if len(c)>2 and johab2uni_jongsung.has_key(c[2]): - jong = johab2uni_jongsung[c[2]] - p += 3 # this must locate end of this block - else: - jong = u'' - p += 2 # too. 
- except: - if errors == 'replace': - buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER - elif errors == 'strict': - raise UnicodeError, "unexpected byte \\u%04x found" % ord(c[0]) - p += 1 - else: - buffer.append(join([cho, jung, jong])) - - return (u''.join(buffer), size) - + return conjoin(unicode(data, 'utf-8', errors)), len(data) class StreamWriter(Codec, codecs.StreamWriter): pass |