[KoCo-CVS] [Commit] KoreanCodecs/korean/python hangul.py
Brought to you by:
perky
From: Chang <pe...@us...> - 2002-04-24 03:36:03
|
# perky 02/04/23 20:36:01
#
# Added:    korean/python hangul.py
# Log:
#   - Move hangul python implementation into python/
#   - Added hangul.format, the hangul adaptive formatter
#
# Revision  Changes    Path
# 1.1                  KoreanCodecs/korean/python/hangul.py
#
# Index: hangul.py
# ===================================================================
#!/usr/local/bin/python
# ex:ts=4
#
# Unicode hangul abstractive controller
#
# written by Hye-Shik Chang <pe...@fa...>
#
#
# Unicode Hangul Code-Area Specifications:
#   http://www.unicode.org/charts/PDF/UAC00.pdf
#
# Jamo Short Name Conventions:
#   http://www.unicode.org/unicode/uni2book/ch04.pdf (section 4.4)
#
# Conjoining Jamo Behavior:
#   http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11)
#
# $Id: hangul.py,v 1.1 2002/04/24 03:36:01 perky Exp $
#

try:
    _unichr = unichr            # Python 2
except NameError:
    _unichr = chr               # Python 3: chr() already returns text


class UnicodeHangulError(Exception):
    """Error raised by this module for invalid hangul input.

    Both ``str()`` and ``repr()`` of the exception yield the bare message.
    """

    def __init__(self, msg):
        # Populate Exception.args as well, so generic handlers see the text.
        Exception.__init__(self, msg)
        self.msg = msg

    def __repr__(self):
        return self.msg

    __str__ = __repr__


# Placeholder for "no jamo in this position".
Null = u''


class Jaeum:
    """Consonant (jaeum) compatibility jamo, U+3131 .. U+314E."""

    # XXX: 1100-1159 Old Jaeum need?
    Codes = (
        u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136',
        # G        GG         GS         N          NJ         NH
        u'\u3137', u'\u3138', u'\u3139', u'\u313a', u'\u313b', u'\u313c',
        # D        DD         L          LG         LM         LB
        u'\u313d', u'\u313e', u'\u313f', u'\u3140', u'\u3141', u'\u3142',
        # LS       LT         LP         LH         M          B
        u'\u3143', u'\u3144', u'\u3145', u'\u3146', u'\u3147', u'\u3148',
        # BB       BS         S          SS         NG         J
        u'\u3149', u'\u314a', u'\u314b', u'\u314c', u'\u314d', u'\u314e',
        # JJ       C          K          T          P          H
    )
    Width = len(Codes)

    # One class attribute per jamo, named by its Unicode short name.
    G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B, \
        BB, BS, S, SS, NG, J, JJ, C, K, T, P, H = Codes

    # Jamo usable as initial consonant, in syllable-composition order.
    Chosung = [G, GG, N, D, DD, L, M, B, BB, S, SS, NG, J, JJ, C, K, T, P, H]

    # Jamo usable as final consonant; index 0 (Null) means "no final".
    Jongsung = [Null, G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT,
                LP, LH, M, B, BS, S, SS, NG, J, C, K, T, P, H]

    # Compound consonants and the simple jamo they are made of.
    MultiElement = {
        GG: (G, G), GS: (G, S), NJ: (N, J), NH: (N, H), DD: (D, D),
        LG: (L, G), LM: (L, M), LB: (L, B), LS: (L, S), LT: (L, T),
        LP: (L, P), LH: (L, H), BB: (B, B), BS: (B, S), SS: (S, S),
        JJ: (J, J),
    }


class Moeum:
    """Vowel (moeum) compatibility jamo, U+314F .. U+3163."""

    # XXX: 1161-117f Old Moeum need?
    Codes = (
        u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154',
        # A        AE         YA         YAE        EO         E
        u'\u3155', u'\u3156', u'\u3157', u'\u3158', u'\u3159', u'\u315a',
        # YEO      YE         O          WA         WAE        OE
        u'\u315b', u'\u315c', u'\u315d', u'\u315e', u'\u315f', u'\u3160',
        # YO       U          WEO        WE         WI         YU
        u'\u3161', u'\u3162', u'\u3163',
        # EU       YI         I
    )
    Width = len(Codes)

    # One class attribute per jamo, named by its Unicode short name.
    A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO, \
        U, WEO, WE, WI, YU, EU, YI, I = Codes

    # Every vowel can be a medial; order matches syllable composition.
    Jungsung = list(Codes)

    # Compound vowels and the simple jamo they are made of.
    MultiElement = {
        AE: (A, I), YAE: (YA, I), YE: (YEO, I), WA: (O, A), WAE: (O, A, I),
        OE: (O, I), WEO: (U, EO), WE: (U, E), WI: (U, I), YI: (EU, I),
    }


# Aliases for your convenience
Chosung = Jaeum.Chosung
Jungsung = Moeum.Jungsung
Jongsung = Jaeum.Jongsung

# Re-export every jamo short name (G, GG, ..., A, AE, ...) at module level.
# Short names are upper-case and at most three characters long, which keeps
# Codes/Width/Chosung/... out of the loop.
for _jamo_class in (Jaeum, Moeum):
    for name, code in list(_jamo_class.__dict__.items()):
        if name.isupper() and len(name) <= 3:
            globals()[name] = code


def isJaeum(c):
    """Return true if *c* is one of the consonant jamo in Jaeum.Codes."""
    return c in Jaeum.Codes


def isMoeum(c):
    """Return true if *c* is one of the vowel jamo in Moeum.Codes."""
    return c in Moeum.Codes


# Unicode Hangul Syllables Characteristics

# First and last code point of the precomposed hangul syllable area.
zone = (u'\uAC00', u'\uD7A3')

# (weight, table) per syllable position: a syllable's offset from zone[0]
# is the sum of weight * table.index(jamo) over the three positions.
splitters = [
    (len(Jongsung) * len(Jungsung), Chosung),
    (len(Jongsung), Jungsung),
    (1, Jongsung),
]


def ishangul(code):
    """Return true if *code* is a hangul syllable or a compatibility jamo."""
    return (zone[0] <= code <= zone[1]
            or code in Jaeum.Codes
            or code in Moeum.Codes)


# Alternative Suffixes
# Maps either form of a particle to the pair
# (form used after a syllable without final, form used after one with final).
ALT_SUFFIXES = {
    u'\uc744': (u'\ub97c', u'\uc744'),  # reul, eul
    u'\ub97c': (u'\ub97c', u'\uc744'),  # reul, eul
    u'\uc740': (u'\ub294', u'\uc740'),  # neun, eun
    u'\ub294': (u'\ub294', u'\uc740'),  # neun, eun
    u'\uc774': (u'\uac00', u'\uc774'),  # ga, yi
    u'\uac00': (u'\uac00', u'\uc774'),  # ga, yi
    u'\uc640': (u'\uc640', u'\uacfc'),  # wa, gwa
    u'\uacfc': (u'\uc640', u'\uacfc'),  # wa, gwa
}

# Ida-Variation Suffixes
# Maps a three-character marker to (no-final action, with-final text).
# A non-empty first element is the Jongsung index (17 = B, 4 = N) that
# format() adds to the preceding final-less syllable's code point.
IDA_SUFFIXES = {
    u'(\uc774)': (u'', u'\uc774'),  # (yi)da
    u'(\uc785)': (17, u'\uc785'),   # (ip)nida
    u'(\uc778)': (4, u'\uc778'),    # (in)-
}


def join(codes):
    """Join function which makes hangul syllable from jamos

    *codes* is a 3-element sequence (list or tuple) holding the initial
    consonant, the vowel and the final consonant; use ``Null`` for an absent
    position.  When the initial or the vowel is absent, whichever of the two
    is present is returned as a lone jamo (the final is ignored).

    Raises UnicodeHangulError if *codes* does not have exactly 3 elements,
    and ValueError if a jamo is not valid for its position.
    """
    if len(codes) != 3:
        raise UnicodeHangulError("needs 3-element tuple")
    if not codes[0] or not codes[1]:  # single jamo
        return codes[0] or codes[1]

    value = ord(zone[0])
    # Pair each position's (weight, table) with its jamo; no copy/pop needed,
    # so tuples work as well as lists.
    for (multiplier, codeset), jamo in zip(splitters, codes):
        value += multiplier * codeset.index(jamo)
    return _unichr(value)


def split(code):
    """Split function which splits hangul syllable into jamos

    Returns a 3-element list [initial, vowel, final] with ``Null`` in the
    positions that are absent.  A lone consonant jamo is returned in the
    first slot and a lone vowel jamo in the second.

    Raises UnicodeHangulError unless *code* is exactly one hangul character.
    """
    if len(code) != 1 or not ishangul(code):
        raise UnicodeHangulError("needs 1 hangul letter")
    if code in Jaeum.Codes:
        return [code, Null, Null]
    if code in Moeum.Codes:
        return [Null, code, Null]

    offset = ord(code) - ord(zone[0])
    parts = []
    for divider, codeset in splitters:
        # divmod keeps the index an integer on every Python version.
        index, offset = divmod(offset, divider)
        parts.append(codeset[index])
    return parts


def dividestring(str, intoelements=0):
    """Return *str* with every hangul character replaced by its jamo.

    Non-hangul characters are copied through unchanged.  When *intoelements*
    is true, compound jamo are further broken up into the simple jamo listed
    in Jaeum.MultiElement / Moeum.MultiElement.

    Raises UnicodeHangulError if *str* is not a unicode string.
    """
    if not isinstance(str, type(u'')):
        raise UnicodeHangulError("needs unicode string")

    pieces = []
    for char in str:
        if not ishangul(char):
            pieces.append(char)
            continue
        for elem in split(char):
            if intoelements and elem in Jaeum.MultiElement:
                pieces.extend(Jaeum.MultiElement[elem])
            elif intoelements and elem in Moeum.MultiElement:
                pieces.extend(Moeum.MultiElement[elem])
            else:
                pieces.append(elem)
    return u''.join(pieces)


def _has_final(c):
    """Return (is_hangul_syllable, has_final) for character *c*.

    For internal use only.  For a precomposed syllable the final is read
    from the code point; any other character counts as having a final when
    it is one of u'013678.bklmnptMN' (presumably characters whose Korean
    reading ends in a consonant).
    """
    if u'\uac00' <= c <= u'\ud7a3':  # hangul
        return 1, (ord(c) - 0xac00) % 28 > 0
    else:
        return 0, c in u'013678.bklmnptMN'


def format(fmtstr, args):
    """Format *fmtstr* with *args*, adapting Korean particles to the values.

    *fmtstr* uses ordinary ``%`` format specs.  If *args* is a dict every
    spec is applied to the whole dict (``%(key)s`` style); otherwise *args*
    is iterated and each spec consumes the next item.

    Beyond plain substitution:

    - a backslash copies the following character literally;
    - ``%%`` emits a single ``%``;
    - a particle from ALT_SUFFIXES written directly after a spec is replaced
      by the variant matching whether the substituted text ends in a final
      consonant;
    - a three-character marker from IDA_SUFFIXES written directly after a
      spec is emitted, dropped, or merged into the last syllable as a final
      consonant, again depending on the substituted text.

    Returns the formatted unicode string.
    """
    if isinstance(args, dict):
        argget = lambda: args
    else:
        arg_iter = iter(args)
        argget = lambda: next(arg_iter)

    obuff = []
    ncur = escape = fmtinpth = 0
    # fmt:  format spec currently being collected ('' when outside a spec)
    # ofmt: text produced by the spec that ended just before this character
    ofmt = fmt = u''

    while ncur < len(fmtstr):
        c = fmtstr[ncur]

        if escape:
            obuff.append(c)
            escape = 0
            ofmt = u''
        elif c == u'\\':
            escape = 1
        elif fmt:
            fmt += c
            if not fmtinpth and c.isalpha():
                # Conversion character reached: the spec is complete.
                ofmt = fmt % argget()
                obuff.append(ofmt)
                fmt = u''
            elif fmtinpth and c == u')':
                fmtinpth = 0
            elif c == u'(':
                fmtinpth = 1
            elif c == u'%':
                obuff.append(u'%')
                if not fmtinpth:
                    # '%%' is a finished literal; without this reset the
                    # following text was swallowed into a bogus spec.
                    fmt = u''
        elif c == u'%':
            fmt += c
            ofmt = u''
        else:
            marker = fmtstr[ncur:ncur + 3]
            if ofmt and c in ALT_SUFFIXES:
                if _has_final(ofmt[-1])[1]:
                    obuff.append(ALT_SUFFIXES[c][1])
                else:
                    obuff.append(ALT_SUFFIXES[c][0])
            elif ofmt and marker in IDA_SUFFIXES:
                sel = IDA_SUFFIXES[marker]
                ishan, hasfinal = _has_final(ofmt[-1])

                if hasfinal:
                    obuff.append(sel[1])
                elif ishan:
                    if sel[0]:
                        # Fold the suffix's consonant into the preceding
                        # final-less syllable.
                        obuff[-1] = (obuff[-1][:-1]
                                     + _unichr(ord(ofmt[-1]) + sel[0]))
                else:
                    obuff.append(sel[0] and sel[1])
                ncur += 2  # skip the rest of the three-character marker
            else:
                obuff.append(c)

            # Particles only adapt when they directly follow a spec.
            ofmt = u''

        ncur += 1

    return u''.join(obuff)


if __name__ == '__main__':
    import sys

    _PY2 = sys.version_info[0] == 2

    try:
        _read_line = raw_input      # Python 2
    except NameError:
        _read_line = input          # Python 3

    def _show(text):
        # Python 2 needs explicit UTF-8 bytes; Python 3 prints text directly.
        if _PY2:
            text = text.encode("utf-8")
        print(text)

    _show(
        join([Jaeum.P, Moeum.EO, Null])
        + join([Jaeum.K, Moeum.I, Null])
        + join([Jaeum.JJ, Moeum.A, Jaeum.NG])
    )

    while 1:
        try:
            code = _read_line(">>> ")
        except (EOFError, KeyboardInterrupt):
            break
        if not isinstance(code, type(u'')):
            code = code.decode("utf-8")
        _show(dividestring(code, 1))