[KoCo-CVS] [Commit] KoreanCodecs/korean/python hangul.py
Brought to you by:
perky
From: Chang <pe...@us...> - 2002-04-24 07:33:13
|
perky 02/04/24 00:20:27 Modified: korean/python hangul.py Log: - Add hangul.conjoin and hangul.disjoint functions (this function set provides a converter between the U+AC00 and U+1100 pages) Revision Changes Path 1.3 +58 -9 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- hangul.py 24 Apr 2002 05:00:03 -0000 1.2 +++ hangul.py 24 Apr 2002 07:20:27 -0000 1.3 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.2 2002/04/24 05:00:03 perky Exp $ +# $Id: hangul.py,v 1.3 2002/04/24 07:20:27 perky Exp $ # class UnicodeHangulError(Exception): @@ -30,7 +30,7 @@ Null = u'' -class Jaeum: # XXX: 1100-1159 Old Jaeum need? +class Jaeum: Codes = (u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136', # G GG GS N NJ NH @@ -56,7 +56,7 @@ } -class Moeum: # XXX: 1161-117f Old Moeum need? 
+class Moeum: Codes = (u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154', # A AE YA YAE EO E @@ -75,7 +75,6 @@ OE: (O, I), WEO: (U, EO), WE: (U, E), WI: (U, I), YI: (EU, I) } - # Aliases for your convinience Chosung = Jaeum.Chosung Jungsung = Moeum.Jungsung @@ -89,14 +88,19 @@ isMoeum = lambda c: c in Moeum.Codes # Unicode Hangul Syllables Characteristics -zone = (u'\uAC00', u'\uD7A3') +ZONE = (u'\uAC00', u'\uD7A3') NCHOSUNG = len(Chosung) NJUNGSUNG = len(Jungsung) NJONGSUNG = len(Jongsung) +JBASE_CHOSUNG = u'\u1100' +JBASE_JUNGSUNG = u'\u1161' +JBASE_JONGSUNG = u'\u11A8' +CHOSUNG_FILLER = u'\u115F' +JUNGSUNG_FILLER = u'\u1160' ishangul = ( lambda code: - zone[0] <= code <= zone[1] or + ZONE[0] <= code <= ZONE[1] or code in Jaeum.Codes or code in Moeum.Codes ) @@ -150,10 +154,55 @@ Jongsung[code % NJONGSUNG] ] -def dividestring(str, intoelements=0): - if type(str) is not type(u''): - raise UnicodeHangulError("needs unicode string") +def conjoin(s): + obuff = [] + ncur = 0 + + while ncur < len(s): + c = s[ncur] + if JBASE_CHOSUNG <= c <= u'\u1112' or c == CHOSUNG_FILLER: # starts with chosung + if len(s) > ncur+1 and JUNGSUNG_FILLER <= s[ncur+1] <= u'\u1175': + cho = Chosung[ord(c) - ord(JBASE_CHOSUNG)] + jung = Jungsung[ord(s[ncur+1]) - ord(JBASE_JUNGSUNG)] + if len(s) > ncur+2 and JBASE_JONGSUNG <= s[ncur+2] <= u'\u11C2': + jong = Jongsung[ord(s[ncur+2]) - ord(JBASE_JONGSUNG) + 1] + ncur += 2 + else: + jong = Null + ncur += 1 + obuff.append(join([cho, jung, jong])) + else: + obuff.append(join([Chosung[ord(c) - ord(JBASE_CHOSUNG)], Null, Null])) + elif JBASE_JUNGSUNG <= c <= u'\u1175': + obuff.append(join([Null, Jungsung[ord(c) - ord(JBASE_JUNGSUNG)], Null])) + else: + obuff.append(c) + ncur += 1 + + return u''.join(obuff) +def disjoint(s): + obuff = [] + for c in s: + if ishangul(c): + cho, jung, jong = split(c) + if cho: + obuff.append( unichr(ord(JBASE_CHOSUNG) + Chosung.index(cho)) ) + else: + obuff.append( CHOSUNG_FILLER ) + + if jung: + 
obuff.append( unichr(ord(JBASE_JUNGSUNG) + Jungsung.index(jung)) ) + else: + obuff.append( JUNGSUNG_FILLER ) + + if jong: + obuff.append( unichr(ord(JBASE_JONGSUNG) + Jongsung.index(jong) - 1) ) + else: + obuff.append(c) + return u''.join(obuff) + +def dividestring(str, intoelements=0): r = u'' for char in str: if ishangul(char): |