koco-cvs Mailing List for Python Korean Codecs (Page 20)
Brought to you by:
perky
You can subscribe to this list here.
2002 |
Jan
|
Feb
|
Mar
|
Apr
(88) |
May
(5) |
Jun
|
Jul
(27) |
Aug
|
Sep
|
Oct
(5) |
Nov
|
Dec
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
2003 |
Jan
(77) |
Feb
(3) |
Mar
|
Apr
(22) |
May
(123) |
Jun
(80) |
Jul
(83) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: Chang <pe...@us...> - 2002-04-26 07:29:45
|
perky 02/04/26 00:29:43 Modified: korean/python hangul.py qwerty2bul.py Log: - Emulate Nested scope and iter() for old versions under 2.2 - Import korean.aliases before unit test Tested on: Python 2.0.1, 2.1.3 Revision Changes Path 1.7 +30 -2 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- hangul.py 25 Apr 2002 21:13:44 -0000 1.6 +++ hangul.py 26 Apr 2002 07:29:43 -0000 1.7 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.6 2002/04/25 21:13:44 perky Exp $ +# $Id: hangul.py,v 1.7 2002/04/26 07:29:43 perky Exp $ # class UnicodeHangulError(Exception): @@ -210,9 +210,37 @@ else: return 0, c in u'013678.bklmnptMN' +# Iterator Emulator for ancient versions before 2.1 +try: + iter +except: + class iter: + def __init__(self, obj): + self.obj = obj + self.ptr = 0 + def next(self): + try: + return self.obj[self.ptr] + finally: + self.ptr += 1 + +# Nested scope lambda emulation for versions before 2.2 +import sys +if sys.hexversion < '0x2020000': + class plambda: + def __init__(self, obj): + self.obj = obj + def __call__(self): + return self.obj +else: + plambda = None +del sys + def format(fmtstr, *args, **kwargs): if kwargs: - argget = lambda:kwargs + argget = lambda:kwargs + if plambda: + argget = plambda(kwargs) else: argget = iter(args).next 1.5 +2 -2 KoreanCodecs/korean/python/qwerty2bul.py Index: qwerty2bul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/qwerty2bul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- qwerty2bul.py 11 Apr 2002 22:04:23 -0000 1.4 +++ qwerty2bul.py 26 Apr 2002 07:29:43 -0000 1.5 @@ -1,5 +1,5 @@ # Hye-Shik Chang <16 Feb 2002> -# $Id: qwerty2bul.py,v 1.4 2002/04/11 22:04:23 perky Exp $ +# $Id: qwerty2bul.py,v 1.5 2002/04/26 07:29:43 perky Exp $ import codecs from korean.hangul import Moeum, Jaeum, Chosung, Jungsung, Jongsung @@ -165,7 +165,7 @@ if errors not in supported_errors: raise UnicodeError, "unknown error handling" - s = unicode(data, self.BASECODEC, errors=errors) + s = unicode(data, self.BASECODEC, errors) am = Automata_Hangul2() r = am.convert(s) return (r, len(r)) |
From: Chang <pe...@us...> - 2002-04-25 22:35:24
|
perky 02/04/25 15:35:17 Modified: . ChangeLog Log: - Add tag descriptions Revision Changes Path 1.5 +16 -4 KoreanCodecs/ChangeLog Index: ChangeLog =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/ChangeLog,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- ChangeLog 25 Apr 2002 21:31:26 -0000 1.4 +++ ChangeLog 25 Apr 2002 22:35:17 -0000 1.5 @@ -1,6 +1,18 @@ +2002-04-26 06:31 Hye-Shik Chang <pe...@fa...> + + * ChangeLog (1.4): + + - Add version milestone + ----------------------------------------------------------------------------- -Version 2.0.3a1 (2002-04-26) +Version 2.0.3a1 (2002-04-26) Tag: RELENG_2_0_3_ALPHA1 +2002-04-26 06:29 Hye-Shik Chang <pe...@fa...> + + * ChangeLog (1.3), MANIFEST.in (1.6): + + - Final CVS ChangeLog for 2.0.3a1 + 2002-04-26 06:28 Hye-Shik Chang <pe...@fa...> * README.en (1.12), README.ko (1.11), setup.py (1.13): @@ -157,7 +169,7 @@ ----------------------------------------------------------------------------- -Version 2.0.2 (2002-03-16) +Version 2.0.2 (2002-03-16) Tag: RELENG_2_0_2_RELEASE 2002-03-16 11:35 Hye-Shik Chang <pe...@fa...> @@ -333,7 +345,7 @@ ----------------------------------------------------------------------------- -Version 2.0.1 (2002-03-09) +Version 2.0.1 (2002-03-09) Tag: RELENG_2_0_1_RELEASE 2002-03-09 09:06 Hye-Shik Chang <pe...@fa...> @@ -382,7 +394,7 @@ ----------------------------------------------------------------------------- -Version 2.0 (2002-03-01) +Version 2.0 (2002-03-01) Tag: RELENG_2_0_RELEASE (tigers smoked together...) |
From: Chang <pe...@us...> - 2002-04-25 21:31:26
|
perky 02/04/25 14:31:26 Modified: . ChangeLog Log: - Add version milestone Revision Changes Path 1.4 +3 -0 KoreanCodecs/ChangeLog Index: ChangeLog =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/ChangeLog,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- ChangeLog 25 Apr 2002 21:29:07 -0000 1.3 +++ ChangeLog 25 Apr 2002 21:31:26 -0000 1.4 @@ -1,3 +1,6 @@ +----------------------------------------------------------------------------- +Version 2.0.3a1 (2002-04-26) + 2002-04-26 06:28 Hye-Shik Chang <pe...@fa...> * README.en (1.12), README.ko (1.11), setup.py (1.13): |
From: Chang <pe...@us...> - 2002-04-25 21:29:08
|
perky 02/04/25 14:29:08 Modified: . ChangeLog MANIFEST.in Log: - Final CVS ChangeLog for 2.0.3a1 Revision Changes Path 1.3 +59 -0 KoreanCodecs/ChangeLog Index: ChangeLog =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/ChangeLog,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- ChangeLog 24 Apr 2002 07:46:38 -0000 1.2 +++ ChangeLog 25 Apr 2002 21:29:07 -0000 1.3 @@ -1,3 +1,62 @@ +2002-04-26 06:28 Hye-Shik Chang <pe...@fa...> + + * README.en (1.12), README.ko (1.11), setup.py (1.13): + + - Version 2.0.3a1 comments + +2002-04-26 06:13 Hye-Shik Chang <pe...@fa...> + + * korean/python/hangul.py (1.6), test/test_all.py (1.5), + test/test_hangul.py (1.7), src/hangul.c (1.5): + + - Change format argument passing to *args, **kwargs form + - Split unittests into CExtension and PurePython + +2002-04-26 05:55 Hye-Shik Chang <pe...@fa...> + + * src/hangul.c (1.4): + + - Add hangul.format C implementation. + +2002-04-25 14:12 Hye-Shik Chang <pe...@fa...> + + * src/hangul.c (1.3): + + - Fix the problem around syllable without jongsung. + +2002-04-25 14:01 Hye-Shik Chang <pe...@fa...> + + * setup.py (1.12), korean/python/hangul.py (1.5), + test/test_hangul.py (1.6): + + - Remove hangul.dividestring method (it was just fancy feature..) + - Add c.hangul to distutil build chain. Yeah~ + +2002-04-25 13:49 Hye-Shik Chang <pe...@fa...> + + * src/hangul.c (1.2): + + - Implement join, split, conjoin, disjoint methods on korean.c.hangul + +2002-04-25 12:46 Hye-Shik Chang <pe...@fa...> + + * korean/python/hangul.py (1.4), test/test_hangul.py (1.5): + + - Clean up namespace + - Change hangul.split's return type to Tuple (make compatible with c.hangul) + +2002-04-24 23:16 Hye-Shik Chang <pe...@fa...> + + * src/: Setup.in (1.2), hangul.c (1.1): + + - Add ROUGH implementation of korean.c.hangul module + +2002-04-24 16:46 Hye-Shik Chang <pe...@fa...> + + * ChangeLog (1.2), Makefile (1.2): + + - Remove -t option from generating changelog + 2002-04-24 16:38 Hye-Shik Chang <pe...@fa...> * korean/python/unijohab.py (1.4): 1.6 +2 -2 KoreanCodecs/MANIFEST.in Index: MANIFEST.in =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/MANIFEST.in,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- MANIFEST.in 16 Mar 2002 02:26:41 -0000 1.5 +++ MANIFEST.in 25 Apr 2002 21:29:07 -0000 1.6 @@ -1,8 +1,8 @@ -# $Id: MANIFEST.in,v 1.5 2002/03/16 02:26:41 perky Exp $ +# $Id: MANIFEST.in,v 1.6 2002/04/25 21:29:07 perky Exp $ # Hye-Shik Chang <19 Feb 2002> include README README.en README.ko -include LICENSE MANIFEST.in +include LICENSE MANIFEST.in ChangeLog #recursive-include debian * recursive-include misc * |
From: Chang <pe...@us...> - 2002-04-25 21:28:10
|
perky 02/04/25 14:28:07 Modified: . README.en README.ko setup.py Log: - Version 2.0.3a1 comments Revision Changes Path 1.12 +10 -5 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- README.en 8 Apr 2002 12:45:25 -0000 1.11 +++ README.en 25 Apr 2002 21:28:07 -0000 1.12 @@ -1,8 +1,8 @@ -KoreanCodecs version 2.0.3 -========================== +KoreanCodecs version 2.0.3a1 +============================ Copyright(C) Hye-Shik Chang, 2002. -$Id: README.en,v 1.11 2002/04/08 12:45:25 perky Exp $ +$Id: README.en,v 1.12 2002/04/25 21:28:07 perky Exp $ @@ -40,7 +40,7 @@ python setup.py install -If you want to use a Japanese encoding as the default one, add a +If you want to use a Korean encoding as the default one, add a line sys.set_string_encoding(ENCODING) @@ -104,7 +104,7 @@ - korean.qwerty3bul (proposed on 2.1) - korean.qwerty3bul-390 (proposed on 2.1) -You can omit 'korean.' after importing 'korean' module. +You can omit 'korean.' after importing 'korean.aliases' module. Addition Modules @@ -121,6 +121,11 @@ o Version 2.0.3 - April 2002 - change jamo short names to confirm to Unicode 3.2 on hangul module + - added hangul module C implementation + (which means, johab, unijohab and qwerty2bul have gotten faster) + - added conjoin, disjoint, format in hangul module + (format function is a unicode formatter that fixes korean suffixes + after each arguments) o Version 2.0.2 - 16 March 2002 - added euc-kr and cp949 codec C implementations 1.11 +10 -4 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- README.ko 8 Apr 2002 12:45:25 -0000 1.10 +++ README.ko 25 Apr 2002 21:28:07 -0000 1.11 @@ -1,8 +1,8 @@ -ÇѱÛÄÚµ¦ ¹öÁ¯ 2.0.3 -=================== +ÇѱÛÄÚµ¦ ¹öÁ¯ 2.0.3a1 +===================== Copyright(C) Hye-Shik Chang, 2002. -$Id: README.ko,v 1.10 2002/04/08 12:45:25 perky Exp $ +$Id: README.ko,v 1.11 2002/04/25 21:28:07 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ @@ -105,7 +105,8 @@ - korean.qwerty3bul : 3¹ú½Ä - ÄõƼÀÚÆÇ ¸ÅÇÎ - korean.qwerty3bul-390 : 3¹ú½Ä 390 - ÄõƼÀÚÆÇ ¸ÅÇÎ -ÄÚµ¦À̸§¿¡¼ korean. ºÎºÐÀº korean¸ðµâÀ» ÀÓÆ÷Æ®ÇÏ¸é »ý·«ÇÒ ¼ö ÀÖ½À´Ï´Ù. +ÄÚµ¦À̸§¿¡¼ korean. ºÎºÐÀº korean.aliases¸ðµâÀ» ÀÓÆ÷Æ®ÇÏ¸é »ý·«ÇÒ ¼ö +ÀÖ½À´Ï´Ù. Ãß°¡ ÆÐŰÁö @@ -123,6 +124,11 @@ o ¹öÁ¯ 2.0.3 2002³â 4¿ù - hangul ¸ðµâ À¯´ÏÄÚµå 3.2 Ç¥ÁØÀ¸·Î ÀÚ¸ð ¾à¾î º¯°æ + - hangul ¸ðµâ C ±¸Çö Ãß°¡ + (ÀÌ È®ÀåÀ¸·Î johab, unijohab, qwerty2bul ÄÚµ¦ÀÌ »¡¶óÁý´Ï´Ù.) + - hangul ¸ðµâ¿¡ conjoin, disjoint, format ÇÔ¼ö Ãß°¡ + (formatÀº Æ÷¸ËµÈ ´Ü¾îÀÇ Á¾¼º¿©ºÎ¿¡ µû¶ó µÚÀÇ Á¶»ç¸¦ ¼öÁ¤ÇØÁÖ´Â + ÇѱۿëÀÇ À¯´ÏÄÚµå Æ÷¸ÅÆÃ ÇÔ¼öÀÔ´Ï´Ù.) o ¹öÁ¯ 2.0.2 2002³â 3¿ù 16ÀÏ - EUC-KR, CP949 ÄÚµ¦ C ±¸Çö Ãß°¡ 1.13 +2 -2 KoreanCodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/setup.py,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- setup.py 25 Apr 2002 05:01:06 -0000 1.12 +++ setup.py 25 Apr 2002 21:28:07 -0000 1.13 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# $Id: setup.py,v 1.12 2002/04/25 05:01:06 perky Exp $ +# $Id: setup.py,v 1.13 2002/04/25 21:28:07 perky Exp $ import sys from distutils.core import setup, Extension @@ -18,7 +18,7 @@ (self.install_purelib, ["misc/korean.pth"])] setup (name = "KoreanCodecs", - version = "2.0.3", + version = "2.0.3a1", description = "Korean Codecs for Python Unicode Support", long_description = "This package provides Unicode codecs that make " "Python aware of Korean character encodings such as EUC-KR, CP949 " |
From: Chang <pe...@us...> - 2002-04-25 21:13:49
|
perky 02/04/25 14:13:44 Modified: korean/python hangul.py Log: - Change format argument passing to *args, **kwargs form - Split unittests into CExtension and PurePython Revision Changes Path 1.6 +5 -5 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- hangul.py 25 Apr 2002 05:01:06 -0000 1.5 +++ hangul.py 25 Apr 2002 21:13:44 -0000 1.6 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.5 2002/04/25 05:01:06 perky Exp $ +# $Id: hangul.py,v 1.6 2002/04/25 21:13:44 perky Exp $ # class UnicodeHangulError(Exception): @@ -210,11 +210,11 @@ else: return 0, c in u'013678.bklmnptMN' -def format(fmtstr, args): - if not isinstance(args, dict): - argget = iter(args).next +def format(fmtstr, *args, **kwargs): + if kwargs: + argget = lambda:kwargs else: - argget = lambda:args + argget = iter(args).next obuff = [] ncur = escape = fmtinpth = 0 |
From: Chang <pe...@us...> - 2002-04-25 21:13:49
|
perky 02/04/25 14:13:44 Modified: test test_all.py test_hangul.py Log: - Change format argument passing to *args, **kwargs form - Split unittests into CExtension and PurePython Revision Changes Path 1.5 +1 -1 KoreanCodecs/test/test_all.py Index: test_all.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_all.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_all.py 16 Mar 2002 02:18:37 -0000 1.4 +++ test_all.py 25 Apr 2002 21:13:44 -0000 1.5 @@ -7,7 +7,7 @@ from test_qwerty2bul import TestQWERTY2BUL from test_unijohab import TestUNIJOHAB -from test_hangul import TestHangul +from test_hangul import TestHangul_CExtension, TestHangul_PurePython if __name__ == '__main__': CodecTestBase.main() 1.7 +36 -27 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- test_hangul.py 25 Apr 2002 05:01:06 -0000 1.6 +++ test_hangul.py 25 Apr 2002 21:13:44 -0000 1.7 @@ -1,72 +1,81 @@ # Hye-Shik Chang <1 March 2002> import unittest -from korean.hangul import * - -class TestHangul(unittest.TestCase): +from korean.c import hangul as c +from korean.python import hangul as python +class Shield: + class TestHangul(unittest.TestCase): def test_joinsplit(self): - self.assertEqual(join([Jaeum.J, Moeum.WA, Jaeum.L]), u'\uc894') - self.assertEqual(join([Jaeum.JJ, Null, Null]), u'\u3149') - self.assertEqual(join((Null, Moeum.YI, Null)), u'\u3162') - - self.assertEqual(split(u'\uc894'), (Jaeum.J, Moeum.WA, Jaeum.L)) - self.assertEqual(split(u'\u3149'), (Jaeum.JJ, Null, Null)) - self.assertEqual(split(u'\u3162'), (Null, Moeum.YI, Null)) + self.assertEqual(self.h.join([self.h.J, self.h.WA, self.h.L]), u'\uc894') + self.assertEqual(self.h.join([self.h.JJ, self.h.Null, self.h.Null]), u'\u3149') + self.assertEqual(self.h.join((self.h.Null, self.h.YI, self.h.Null)), u'\u3162') + + self.assertEqual(self.h.split(u'\uc894'), (self.h.J, self.h.WA, self.h.L)) + self.assertEqual(self.h.split(u'\u3149'), (self.h.JJ, self.h.Null, self.h.Null)) + self.assertEqual(self.h.split(u'\u3162'), (self.h.Null, self.h.YI, self.h.Null)) def test_basicspec(self): - self.assertEqual(isJaeum(Jaeum.J), 1) - self.assertEqual(isJaeum(Moeum.E), 0) - self.assertEqual(isMoeum(Jaeum.L), 0) - self.assertEqual(isMoeum(Moeum.O), 1) - self.assertEqual(ishangul(u'\uc870'), 1) - self.assertEqual(ishangul(u'\u382c'), 0) + self.assertEqual(self.h.isJaeum(self.h.J), 1) + self.assertEqual(self.h.isJaeum(self.h.E), 0) + self.assertEqual(self.h.isMoeum(self.h.L), 0) + self.assertEqual(self.h.isMoeum(self.h.O), 1) + self.assertEqual(self.h.ishangul(u'\uc870'), 1) + self.assertEqual(self.h.ishangul(u'\u382c'), 0) def test_format_altsuffix(self): fmt = u'%s\ub294 %s\ub97c %s\ud55c\ub2e4.' obj1, obj2 = u'\ud61c\uc2dd', u'\uc544\ub77c' - self.assertEqual(format(fmt, (obj1, obj2, u'\u2661')), + self.assertEqual(self.h.format(fmt, obj1, obj2, u'\u2661'), u'\ud61c\uc2dd\uc740 \uc544\ub77c\ub97c \u2661\ud55c\ub2e4.') - self.assertEqual(format(fmt, (obj2, obj1, u'\uc2eb\uc5b4')), + self.assertEqual(self.h.format(fmt, obj2, obj1, u'\uc2eb\uc5b4'), u'\uc544\ub77c\ub294 \ud61c\uc2dd\uc744 \uc2eb\uc5b4\ud55c\ub2e4.') fmt = u'\ud0dc\ucd08\uc5d0 %s\uc640 %s\uac00 \uc788\uc5c8\ub2e4.' - self.assertEqual(format(fmt, (obj1, obj2)), + self.assertEqual(self.h.format(fmt, obj1, obj2), u'\ud0dc\ucd08\uc5d0 \ud61c\uc2dd\uacfc \uc544\ub77c\uac00' u' \uc788\uc5c8\ub2e4.') - self.assertEqual(format(fmt, (obj2, obj1)), + self.assertEqual(self.h.format(fmt, obj2, obj1), u'\ud0dc\ucd08\uc5d0 \uc544\ub77c\uc640 \ud61c\uc2dd\uc774' u' \uc788\uc5c8\ub2e4.') obj1, obj2 = u'Julian', u'Julie' - self.assertEqual(format(fmt, (obj1, obj2)), + self.assertEqual(self.h.format(fmt, obj1, obj2), u'\ud0dc\ucd08\uc5d0 Julian\uacfc Julie\uac00 \uc788\uc5c8\ub2e4.') - self.assertEqual(format(fmt, (obj2, obj1)), + self.assertEqual(self.h.format(fmt, obj2, obj1), u'\ud0dc\ucd08\uc5d0 Julie\uc640 Julian\uc774 \uc788\uc5c8\ub2e4.') def test_format_idasuffix(self): fmt = u'%s(\uc785)\ub2c8\ub2e4, %s(\uc778)\ub370, %s(\uc774)\ub2e4' - self.assertEqual(format(fmt, (u'\uc18c\uc774',)*3), + self.assertEqual(self.h.format(fmt, *(u'\uc18c\uc774',)*3), u'\uc18c\uc785\ub2c8\ub2e4, \uc18c\uc778\ub370, \uc18c\uc774\ub2e4') - self.assertEqual(format(fmt, (u'\ub2e4\ub155',)*3), + self.assertEqual(self.h.format(fmt, *(u'\ub2e4\ub155',)*3), u'\ub2e4\ub155\uc785\ub2c8\ub2e4, \ub2e4\ub155\uc778\ub370,' u' \ub2e4\ub155\uc774\ub2e4') def test_format_argtypes(self): fmt = u'%(int)d(\uc785)\ub2c8\ub2e4. %(str)s\uc740 %(str)s\uc5d0' \ u'%(float).2f\uc640' - self.assertEqual(format(fmt, { 'int': 1, 'str': u'hmm', 'float': 3.14 }), + self.assertEqual(self.h.format(fmt, int=1, str=u'hmm', float=3.14), u'1\uc785\ub2c8\ub2e4. hmm\uc740 hmm\uc5d03.14\uc640') def test_conjoin(self): - self.assertEqual(conjoin(u'\u1112\u1161\u11ab\u1100\u1173\u11af\u110b\u1175' + self.assertEqual(self.h.conjoin(u'\u1112\u1161\u11ab\u1100\u1173\u11af\u110b\u1175' u' \u110c\u1169\u11c2\u110b\u1161\u110b\u116d.'), u'\ud55c\uae00\uc774 \uc88b\uc544\uc694.') def test_disjoint(self): - self.assertEqual(disjoint(u'\ub9c8\ub140\ubc30\ub2ec\ubd80 \ud0a4\ud0a4'), + self.assertEqual(self.h.disjoint(u'\ub9c8\ub140\ubc30\ub2ec\ubd80 \ud0a4\ud0a4'), u'\u1106\u1161\u1102\u1167\u1107\u1162\u1103\u1161\u11af\u1107\u116e' u' \u110f\u1175\u110f\u1175') + +class TestHangul_CExtension(Shield.TestHangul): + def setUp(self): + self.h = c + +class TestHangul_PurePython(Shield.TestHangul): + def setUp(self): + self.h = python if __name__ == '__main__': |
From: Chang <pe...@us...> - 2002-04-25 21:13:48
|
perky 02/04/25 14:13:45 Modified: src hangul.c Log: - Change format argument passing to *args, **kwargs form - Split unittests into CExtension and PurePython Revision Changes Path 1.5 +6 -6 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/hangul.c,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- hangul.c 25 Apr 2002 20:55:25 -0000 1.4 +++ hangul.c 25 Apr 2002 21:13:45 -0000 1.5 @@ -4,14 +4,14 @@ * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> - * Date : $Date: 2002/04/25 20:55:25 $ + * Date : $Date: 2002/04/25 21:13:45 $ * Created : 25 April 2002 * - * $Revision: 1.4 $ + * $Revision: 1.5 $ */ static char *version = -"$Id: hangul.c,v 1.4 2002/04/25 20:55:25 perky Exp $"; +"$Id: hangul.c,v 1.5 2002/04/25 21:13:45 perky Exp $"; #include "Python.h" @@ -547,9 +547,9 @@ else PROCESSSUFFIX_IDA(4, 0xc778) /* (IN)- */ } else if (0xac00 <= next && next <= 0xc774) { - PROCESSSUFFIX(0xc744, 0xb97c) /* REUL, EUL */ - else PROCESSSUFFIX(0xc740, 0xb294) /* NEUN, EUN */ - else PROCESSSUFFIX(0xac00, 0xc774) /* I, GA */ + PROCESSSUFFIX(0xb97c, 0xc744) /* REUL, EUL */ + else PROCESSSUFFIX(0xb294, 0xc740) /* NEUN, EUN */ + else PROCESSSUFFIX(0xac00, 0xc774) /* GA, I */ else PROCESSSUFFIX(0xc640, 0xacfc) /* WA, GWA */ } } |
From: Chang <pe...@us...> - 2002-04-25 20:55:28
|
perky 02/04/25 13:55:25 Modified: src hangul.c Log: - Add hangul.format C implementation. Revision Changes Path 1.4 +175 -24 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/hangul.c,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- hangul.c 25 Apr 2002 05:12:17 -0000 1.3 +++ hangul.c 25 Apr 2002 20:55:25 -0000 1.4 @@ -4,14 +4,14 @@ * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> - * Date : $Date: 2002/04/25 05:12:17 $ + * Date : $Date: 2002/04/25 20:55:25 $ * Created : 25 April 2002 * - * $Revision: 1.3 $ + * $Revision: 1.4 $ */ static char *version = -"$Id: hangul.c,v 1.3 2002/04/25 05:12:17 perky Exp $"; +"$Id: hangul.c,v 1.4 2002/04/25 20:55:25 perky Exp $"; #include "Python.h" @@ -102,10 +102,10 @@ #define getJongsungOrder(c) (getJamotype(c).orders[2]) -static char Py_isJaeum__doc__[] = "isJaeum(code): Verify whether the code is Jaeum."; +static char hangul_isJaeum__doc__[] = "isJaeum(code): Verify whether the code is Jaeum."; static PyObject * -Py_isJaeum(PyObject *self, PyObject *args) +hangul_isJaeum(PyObject *self, PyObject *args) { Py_UNICODE *code; int codelen; @@ -128,10 +128,10 @@ } } -static char Py_isMoeum__doc__[] = "isMoeum(code): Verify whether the code is Moeum."; +static char hangul_isMoeum__doc__[] = "isMoeum(code): Verify whether the code is Moeum."; static PyObject * -Py_isMoeum(PyObject *self, PyObject *args) +hangul_isMoeum(PyObject *self, PyObject *args) { Py_UNICODE *code; int codelen; @@ -154,10 +154,10 @@ } } -static char Py_ishangul__doc__[] = "ishangul(code): Verify whether the code is hangul."; +static char hangul_ishangul__doc__[] = "ishangul(code): Verify whether the code is hangul."; static PyObject * -Py_ishangul(PyObject *self, PyObject *args) +hangul_ishangul(PyObject *self, PyObject *args) { Py_UNICODE *code; int codelen; @@ -180,10 +180,10 @@ } } -static char Py_join__doc__[] = "join([chosung, jungsung, jongsung]): Assemble hangul syllable from jamos."; +static char hangul_join__doc__[] = "join([chosung, jungsung, jongsung]): Assemble hangul syllable from jamos."; static PyObject * -Py_join(PyObject *self, PyObject *args) +hangul_join(PyObject *self, PyObject *args) { PyObject *argchar, *argelems[3]; Py_UNICODE elems[3], *uobj; @@ -251,10 +251,10 @@ } } -static char Py_split__doc__[] = "split(code): Disassemble hangul syllable into jamos."; +static char hangul_split__doc__[] = "split(code): Disassemble hangul syllable into jamos."; static PyObject * -Py_split(PyObject *self, PyObject *args) +hangul_split(PyObject *self, PyObject *args) { Py_UNICODE *code; PyObject *r; @@ -313,10 +313,10 @@ } } -static char Py_conjoin__doc__[] = "conjoin(unicodestring): conjoin unicode johab string into unicode syllable string"; +static char hangul_conjoin__doc__[] = "conjoin(unicodestring): conjoin unicode johab string into unicode syllable string"; static PyObject * -Py_conjoin(PyObject *self, PyObject *args) +hangul_conjoin(PyObject *self, PyObject *args) { PyObject *r; Py_UNICODE *code, *dst, *dstorg, c; @@ -373,10 +373,10 @@ } -static char Py_disjoint__doc__[] = "disjoint(unicodestring): disjoint unicode syllable string into unicode johab string"; +static char hangul_disjoint__doc__[] = "disjoint(unicodestring): disjoint unicode syllable string into unicode johab string"; static PyObject * -Py_disjoint(PyObject *self, PyObject *args) +hangul_disjoint(PyObject *self, PyObject *args) { Py_UNICODE *code, *dst, *dstorg, c; PyObject *r; @@ -419,18 +419,169 @@ } +static char pseudofinal[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, /* 2 */ + 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* 3 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, /* 4 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5 */ + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 6 */ + 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7 */ +}; + +static char hangul_format__doc__[] = "format(fmt, arg1, arg2, ...) or format(fmt, kw1=arg1, kw2=arg2" + ", ...):\nformat unicode string and fix korean suffixes after arguments"; + +static PyObject * +hangul_format(PyObject *self, PyObject *args, PyObject *kwargs) +{ +/*--- Poor Structure of this function ;) + hangul_format(fmt, *args, **kwargs) + -> insert end fmtmarkers(U+115E which is not used by Unicode) after every format position + -> PyUnicode_Format + -> Fix and update hangul suffixes in place of fmtmarkers + -> make PyObject and return. + */ +#define FMTMARKER 0x115E + Py_UNICODE *fmt, *fmtout, *fcur; + PyObject *r; + int fmtsize; + int inpth, infmt, escape; + + { + PyObject *fmtobj; + int argsize; + + argsize = PyTuple_GET_SIZE(args); + if (!argsize || !PyUnicode_Check(fmtobj = PyTuple_GET_ITEM(args, 0))) { + PyErr_Format(PyExc_TypeError, "needs unicode format string."); + return NULL; + } + fmtsize = PyUnicode_GET_SIZE(fmtobj); + fmt = PyUnicode_AS_UNICODE(fmtobj); + + if (!kwargs) + args = PyTuple_GetSlice(args, 1, argsize); + } + + fmtout = PyMem_New(Py_UNICODE, fmtsize + fmtsize/2); + inpth = infmt = escape = 0; + + for (fcur = fmtout; fmtsize--; fmt++) { + if (*fmt != FMTMARKER) /* skip bogus markers */ + *(fcur++) = *fmt; + + if (escape) + escape = 0; + else if (*fmt == '\\') + escape = 1; + else if (infmt) { + if (!inpth && (('A' <= *fmt && *fmt <= 'Z') || ('a' <= *fmt && *fmt <= 'z'))) { + *(fcur++) = FMTMARKER; + infmt = 0; + } + else if (inpth && *fmt == ')') + inpth = 0; + else if (*fmt == '(') + inpth = 1; + else if (*fmt == '%') + infmt = 0; + } + else if (*fmt == '%') + infmt = 1; + } + + r = PyUnicode_Format( + PyUnicode_FromUnicode(fmtout, fcur-fmtout), + kwargs?kwargs:args + ); + if (!kwargs) { + Py_DECREF(args); + } /* {} to avoid gcc warning */ + if (!r) + goto out; + + fmt = PyUnicode_AS_UNICODE(r); + fmtsize = PyUnicode_GET_SIZE(r); + Py_DECREF(r); + +#define HAS_FINAL() ( \ + (past = *(fmt-1)), \ + isHangulSyllable(past) ? \ + ((past-HANGUL_BOTTOM) % NJONGSUNG > 0) \ + : (past < 0x80 ? pseudofinal[past] : 0) \ +) + +#define HAS_FINAL_OR_NOTSYL() ( \ + (past = *(fmt-1)), \ + isHangulSyllable(past) ? \ + ((past-HANGUL_BOTTOM) % NJONGSUNG > 0) \ + : 1 \ +) + +#define PROCESSSUFFIX(nofinal, existfinal) \ + if (next == nofinal || next == existfinal) { \ + *(fcur++) = HAS_FINAL() ? (existfinal) : (nofinal); \ + fmtsize--; fmt++; \ + } + +#define PROCESSSUFFIX_IDA(jongsungadder, existfinal) \ + if (next == existfinal) { \ + if (HAS_FINAL_OR_NOTSYL()) \ + *(fcur++) = existfinal; \ + else \ + *(fcur-1) += jongsungadder; \ + fmtsize-=3; fmt+=3; \ + } + + for (fcur = fmtout; fmtsize--; fmt++) { + if (*fmt == FMTMARKER) { + if (fcur > fmtout && fmtsize > 0) { + Py_UNICODE past, next = *(fmt+1); + + if (next == '(' && fmtsize > 2 && *(fmt+3) == ')') { /* ida suffxes */ + next = *(fmt+2); + PROCESSSUFFIX_IDA(0, 0xc774) /* (I)DA */ + else PROCESSSUFFIX_IDA(17, 0xc785) /* (IP)NIDA */ + else PROCESSSUFFIX_IDA(4, 0xc778) /* (IN)- */ + } + else if (0xac00 <= next && next <= 0xc774) { + PROCESSSUFFIX(0xc744, 0xb97c) /* REUL, EUL */ + else PROCESSSUFFIX(0xc740, 0xb294) /* NEUN, EUN */ + else PROCESSSUFFIX(0xac00, 0xc774) /* I, GA */ + else PROCESSSUFFIX(0xc640, 0xacfc) /* WA, GWA */ + } + } + } + else + *(fcur++) = *fmt; + } + +#undef PROCESSSUFFIX, PROCESSSUFFIX_IDA +#undef HAS_FINAL, HAS_FINAL_OR_NOTSYL + + r = PyUnicode_FromUnicode(fmtout, fcur-fmtout); + +out: + PyMem_Free(fmtout); + return r; +} + /* List of methods defined in the module */ #define meth(name, func, doc) {name, (PyCFunction)func, METH_VARARGS, doc} +#define meth_kw(name, func, doc) {name, (PyCFunction)func, METH_VARARGS|METH_KEYWORDS, doc} static struct PyMethodDef hangul_methods[] = { - meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), - meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), - meth("ishangul", Py_ishangul, Py_ishangul__doc__), - meth("join", Py_join, Py_join__doc__), - meth("split", Py_split, Py_split__doc__), - meth("conjoin", Py_conjoin, Py_conjoin__doc__), - meth("disjoint", Py_disjoint, Py_disjoint__doc__), + meth("isJaeum", hangul_isJaeum, hangul_isJaeum__doc__), + meth("isMoeum", hangul_isMoeum, hangul_isMoeum__doc__), + meth("ishangul", hangul_ishangul, hangul_ishangul__doc__), + meth("join", hangul_join, hangul_join__doc__), + meth("split", hangul_split, hangul_split__doc__), + meth("conjoin", hangul_conjoin, hangul_conjoin__doc__), + meth("disjoint", hangul_disjoint, hangul_disjoint__doc__), + meth_kw("format", hangul_format, hangul_format__doc__), {NULL, NULL}, }; |
From: Chang <pe...@us...> - 2002-04-25 05:12:19
|
perky 02/04/24 22:12:18 Modified: src hangul.c Log: - Fix the problem around syllable without jongsung. Revision Changes Path 1.3 +4 -4 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/hangul.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- hangul.c 25 Apr 2002 04:49:01 -0000 1.2 +++ hangul.c 25 Apr 2002 05:12:17 -0000 1.3 @@ -4,14 +4,14 @@ * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> - * Date : $Date: 2002/04/25 04:49:01 $ + * Date : $Date: 2002/04/25 05:12:17 $ * Created : 25 April 2002 * - * $Revision: 1.2 $ + * $Revision: 1.3 $ */ static char *version = -"$Id: hangul.c,v 1.2 2002/04/25 04:49:01 perky Exp $"; +"$Id: hangul.c,v 1.3 2002/04/25 05:12:17 perky Exp $"; #include "Python.h" @@ -246,7 +246,7 @@ Py_UNICODE code; code = ((getChosungOrder(elems[0]) * NJUNGSUNG) + getJungsungOrder(elems[1])) * - NJONGSUNG + getJongsungOrder(elems[2]) + HANGUL_BOTTOM; + NJONGSUNG + (elems[2]?getJongsungOrder(elems[2]):0) + HANGUL_BOTTOM; return PyUnicode_FromUnicode(&code, 1); } } |
From: Chang <pe...@us...> - 2002-04-25 05:01:10
|
perky 02/04/24 22:01:06 Modified: korean/python hangul.py Log: - Remove hangul.dividestring method (it was just fancy feature..) - Add c.hangul to distutil build chain. Yeah~ Revision Changes Path 1.5 +1 -30 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- hangul.py 25 Apr 2002 03:46:34 -0000 1.4 +++ hangul.py 25 Apr 2002 05:01:06 -0000 1.5 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.4 2002/04/25 03:46:34 perky Exp $ +# $Id: hangul.py,v 1.5 2002/04/25 05:01:06 perky Exp $ # class UnicodeHangulError(Exception): @@ -203,24 +203,6 @@ obuff.append(c) return u''.join(obuff) -def dividestring(str, intoelements=0): - r = u'' - for char in str: - if ishangul(char): - elems = split(char) - for elem in elems: - for htype in (Jaeum, Moeum, None): - if htype == None: - r += elem - elif intoelements and \ - htype.MultiElement.has_key(elem): - r += u''.join(htype.MultiElement[elem]) - break - else: - r += char - - return r - def _has_final(c): # for internal use only if u'\uac00' <= c <= u'\ud7a3': # hangul @@ -287,15 +269,4 @@ ncur += 1 return u''.join(obuff) - - -if __name__ == '__main__': - - print ( join([Jaeum.P, Moeum.EO, Null]) + \ - join([Jaeum.K, Moeum.I, Null]) + \ - join([Jaeum.JJ, Moeum.A, Jaeum.NG]) ).encode("utf-8") - - while 1: - code = raw_input(">>> ") - print dividestring(unicode(code, "utf-8"), 1).encode("utf-8") |
From: Chang <pe...@us...> - 2002-04-25 05:01:09
|
perky 02/04/24 22:01:06 Modified: test test_hangul.py Log: - Remove hangul.dividestring method (it was just fancy feature..) - Add c.hangul to distutil build chain. Yeah~ Revision Changes Path 1.6 +0 -12 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- test_hangul.py 25 Apr 2002 03:46:35 -0000 1.5 +++ test_hangul.py 25 Apr 2002 05:01:06 -0000 1.6 @@ -14,18 +14,6 @@ self.assertEqual(split(u'\u3149'), (Jaeum.JJ, Null, Null)) self.assertEqual(split(u'\u3162'), (Null, Moeum.YI, Null)) - def test_dividestring(self): - self.assertEqual( - dividestring(u'\ub5ab\uc870\uc544\ub77c\uaf65'), - u'\u3138\u3153\u313c\u3148\u3157\u3147\u314f\u3139\u314f\u3132\u3159\u3131' - ) - self.assertEqual( - dividestring(u'\ubfb0\ub85c\ub871 \uaf2c\ub9c8\ub9c8\ub140 \uc5f4\ub450\uc0b4 \ub09c', 1), - u'\u3142\u3142\u315b\u3139\u3157\u3139\u3157\u3147 ' - u'\u3131\u3131\u3157\u3141\u314f\u3141\u314f\u3134\u3155 ' - u'\u3147\u3155\u3139\u3137\u315c\u3145\u314f\u3139 \u3134\u314f\u3134' - ) - def test_basicspec(self): self.assertEqual(isJaeum(Jaeum.J), 1) self.assertEqual(isJaeum(Moeum.E), 0) |
From: Chang <pe...@us...> - 2002-04-25 05:01:08
|
perky 02/04/24 22:01:06 Modified: . setup.py Log: - Remove hangul.dividestring method (it was just fancy feature..) - Add c.hangul to distutil build chain. Yeah~ Revision Changes Path 1.12 +4 -3 KoreanCodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/setup.py,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- setup.py 17 Apr 2002 10:43:21 -0000 1.11 +++ setup.py 25 Apr 2002 05:01:06 -0000 1.12 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# $Id: setup.py,v 1.11 2002/04/17 10:43:21 perky Exp $ +# $Id: setup.py,v 1.12 2002/04/25 05:01:06 perky Exp $ import sys from distutils.core import setup, Extension @@ -35,5 +35,6 @@ 'korean.c', 'korean.python'], ext_modules = [ - Extension("korean.c._koco", - ["src/_koco.c"])]) + Extension("korean.c._koco", ["src/_koco.c"]), + Extension("korean.c.hangul", ["src/hangul.c"]), + ]) |
From: Chang <pe...@us...> - 2002-04-25 04:49:04
|
perky 02/04/24 21:49:01 Modified: src hangul.c Log: - Implement join, split, conjoin, disjoint methods on korean.c.hangul Revision Changes Path 1.2 +323 -82 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/hangul.c,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- hangul.c 24 Apr 2002 14:16:56 -0000 1.1 +++ hangul.c 25 Apr 2002 04:49:01 -0000 1.2 @@ -4,14 +4,14 @@ * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> - * Date : $Date: 2002/04/24 14:16:56 $ + * Date : $Date: 2002/04/25 04:49:01 $ * Created : 25 April 2002 * - * $Revision: 1.1 $ + * $Revision: 1.2 $ */ static char *version = -"$Id: hangul.c,v 1.1 2002/04/24 14:16:56 perky Exp $"; +"$Id: hangul.c,v 1.2 2002/04/25 04:49:01 perky Exp $"; #include "Python.h" @@ -46,53 +46,61 @@ #define CHOSUNG_FILLER 0x115f #define JUNGSUNG_FILLER 0x1160 -#define F_JAEUM 0x01 -#define F_MOEUM 0x02 -#define F_CHOSUNG 0x04 -#define F_JUNGSUNG 0x08 -#define F_JONGSUNG 0x10 +static PyObject *UniNull, *UniSpace; +static PyObject *ErrorObject; #define MAX_MULTIJAMO 3 typedef struct _jamotype { char *name; Py_UNICODE code; - int multi[MAX_MULTIJAMO]; - int flags; + int multi[MAX_MULTIJAMO]; + char orders[3]; /* cho, jung, jong */ } jamotype; #define CODE(c) #c,c #define NOMULTI {0,0,0} -#define JC (F_JAEUM | F_CHOSUNG) -#define JJ (F_JAEUM | F_JONGSUNG) -#define JCJ (F_JAEUM | F_CHOSUNG | F_JONGSUNG) -#define MJ (F_MOEUM | F_JUNGSUNG) -jamotype jamos[] = { +#define J_C {0,-1,-1} +#define J_J {-1,-1,0} +#define J_CJ {0,-1,0} +#define M_J {-1,0,-1} +static jamotype jamos[] = { /* JAEUM */ - { CODE(G), NOMULTI, JCJ }, { CODE(GG), {G, G,}, JCJ }, { CODE(GS), {G, S,}, JJ }, - { CODE(N), NOMULTI, JCJ }, { CODE(NJ), {N, J,}, JJ }, { CODE(NH), {N, H,}, JJ }, - { CODE(D), NOMULTI, JCJ }, { CODE(DD), {D, D,}, JC }, { CODE(L), NOMULTI, JCJ }, - { CODE(LG), {L, G,}, JJ }, { CODE(LM), {L, M,}, JJ }, { CODE(LB), {L, B,}, JJ }, - { CODE(LS), {L, S,}, JJ }, { CODE(LT), {L, T,}, JJ }, { CODE(LP), {L, P,}, JJ }, - { CODE(LH), {L, H,}, JJ }, { CODE(M), NOMULTI, JCJ }, { CODE(B), NOMULTI, JCJ }, - { CODE(BB), {B, B,}, JC }, { CODE(BS), {B, S,}, JJ }, { CODE(S), NOMULTI, JCJ }, - { CODE(SS), {S, S,}, JCJ }, { CODE(NG), NOMULTI, JCJ }, { CODE(J), NOMULTI, JCJ }, - { CODE(JJ), {J, J,}, JC }, { CODE(C), NOMULTI, JCJ }, { CODE(K), NOMULTI, JCJ }, - { CODE(T), NOMULTI, JCJ }, { CODE(P), NOMULTI, JCJ }, { CODE(H), NOMULTI, JCJ }, + { CODE(G), NOMULTI, J_CJ }, { CODE(GG), {G, G,}, J_CJ }, { CODE(GS), {G, S,}, J_J }, + { CODE(N), NOMULTI, J_CJ }, { CODE(NJ), {N, J,}, J_J }, { CODE(NH), {N, H,}, J_J }, + { CODE(D), NOMULTI, J_CJ }, { CODE(DD), {D, D,}, J_C }, { CODE(L), NOMULTI, J_CJ }, + { CODE(LG), {L, G,}, J_J }, { CODE(LM), {L, M,}, J_J }, { CODE(LB), {L, B,}, J_J }, + { CODE(LS), {L, S,}, J_J }, { CODE(LT), {L, T,}, J_J }, { CODE(LP), {L, P,}, J_J }, + { CODE(LH), {L, H,}, J_J }, { CODE(M), NOMULTI, J_CJ }, { CODE(B), NOMULTI, J_CJ }, + { CODE(BB), {B, B,}, J_C }, { CODE(BS), {B, S,}, J_J }, { CODE(S), NOMULTI, J_CJ }, + { CODE(SS), {S, S,}, J_CJ }, { CODE(NG), NOMULTI, J_CJ }, { CODE(J), NOMULTI, J_CJ }, + { CODE(JJ), {J, J,}, J_C }, { CODE(C), NOMULTI, J_CJ }, { CODE(K), NOMULTI, J_CJ }, + { CODE(T), NOMULTI, J_CJ }, { CODE(P), NOMULTI, J_CJ }, { CODE(H), NOMULTI, J_CJ }, /* MOEUM */ - { CODE(A), NOMULTI, MJ }, { CODE(AE), {A, I,}, MJ }, { CODE(YA), NOMULTI, MJ }, - { CODE(YAE), {YA,I}, MJ }, { CODE(EO), NOMULTI, MJ }, { CODE(E), NOMULTI, MJ }, - { CODE(YEO), NOMULTI, MJ }, { CODE(YE), {YEO,I}, MJ }, { CODE(O), NOMULTI, MJ }, - { CODE(WA), {O, A}, MJ }, { CODE(WAE), {O,A,I}, MJ }, { CODE(OE), {O, I}, MJ }, - { CODE(YO), NOMULTI, MJ }, { CODE(U), NOMULTI, MJ }, { CODE(WEO), {U, EO}, MJ }, - { CODE(WE), {U, E}, MJ }, { CODE(WI), {U, I}, MJ }, { CODE(YU), NOMULTI, MJ }, - { CODE(EU), NOMULTI, MJ }, { CODE(YI), {EU, I}, MJ }, { CODE(I), NOMULTI, MJ }, + { CODE(A), NOMULTI, M_J }, { CODE(AE), {A, I,}, M_J }, { CODE(YA), NOMULTI, M_J }, + { CODE(YAE), {YA,I}, M_J }, { CODE(EO), NOMULTI, M_J }, { CODE(E), NOMULTI, M_J }, + { CODE(YEO), NOMULTI, M_J }, { CODE(YE), {YEO,I}, M_J }, { CODE(O), NOMULTI, M_J }, + { CODE(WA), {O, A}, M_J }, { CODE(WAE), {O,A,I}, M_J }, { CODE(OE), {O, I}, M_J }, + { CODE(YO), NOMULTI, M_J }, { CODE(U), NOMULTI, M_J }, { CODE(WEO), {U, EO}, M_J }, + { CODE(WE), {U, E}, M_J }, { CODE(WI), {U, I}, M_J }, { CODE(YU), NOMULTI, M_J }, + { CODE(EU), NOMULTI, M_J }, { CODE(YI), {EU, I}, M_J }, { CODE(I), NOMULTI, M_J }, /* END MARKER */ - { 0, 0, NOMULTI, 0 }, + { 0, 0, NOMULTI, {0,} }, }; -#undef JC, JJ, JCJ, MJ, NOMULTI, CODE +#undef J_C, J_J, J_CJ, M_J, NOMULTI, CODE +static jamotype *jamo_chosung[NCHOSUNG], *jamo_jungsung[NJUNGSUNG], *jamo_jongsung[NJONGSUNG]; + +#define getJamotype(c) jamos[(c)-JAEUM_BOTTOM] #define isJaeum(c) (JAEUM_BOTTOM <= (c) && (c) <= JAEUM_TOP) #define isMoeum(c) (MOEUM_BOTTOM <= (c) && (c) <= MOEUM_TOP) +#define isHangulSyllable(c) (HANGUL_BOTTOM <= (c) && (c) <= HANGUL_TOP) +#define isChosung(c) (getJamotype(c).orders[0] >= 0) +#define isJungsung(c) (getJamotype(c).orders[1] >= 0) +#define isJongsung(c) (getJamotype(c).orders[2] >= 0) +#define getChosungOrder(c) (getJamotype(c).orders[0]) +#define getJungsungOrder(c) (getJamotype(c).orders[1]) +#define getJongsungOrder(c) (getJamotype(c).orders[2]) + static char Py_isJaeum__doc__[] = "isJaeum(code): Verify whether the code is Jaeum."; @@ -113,7 +121,8 @@ if (isJaeum(*code)) { Py_INCREF(Py_True); return Py_True; - } else { + } + else { Py_INCREF(Py_False); return Py_False; } @@ -138,76 +147,290 @@ if (isMoeum(*code)) { Py_INCREF(Py_True); return Py_True; - } else { + } + else { + Py_INCREF(Py_False); + return Py_False; + } +} + +static char Py_ishangul__doc__[] = "ishangul(code): Verify whether the code is hangul."; + +static PyObject * +Py_ishangul(PyObject *self, PyObject *args) +{ + Py_UNICODE *code; + int codelen; + + if (!PyArg_ParseTuple(args, "u#:ishangul", &code, &codelen)) + return NULL; + + if (codelen < 1) { + PyErr_Format(PyExc_ValueError, "need not null unicode string"); + return NULL; + } + + if (isHangulSyllable(*code) || isJaeum(*code) || isMoeum(*code)) { + Py_INCREF(Py_True); + return Py_True; + } + else { Py_INCREF(Py_False); return Py_False; } } -#if 0 -static char cp949_encode__doc__[] = "CP949 encoder"; +static char Py_join__doc__[] = "join([chosung, jungsung, jongsung]): Assemble hangul syllable from jamos."; static PyObject * -cp949_encode(PyObject *self, PyObject *args) +Py_join(PyObject *self, PyObject *args) { - Py_UNICODE *argptr, *srccur, *srcend; - int arglen, errtype = error_strict; - char *errors = NULL; - unsigned char *destptr, *destcur, *decbuf; + PyObject *argchar, *argelems[3]; + Py_UNICODE elems[3], *uobj; + int i; + + if (!PyArg_ParseTuple(args, "O:join", &argchar)) + return NULL; + + if (PyList_Check(argchar)) { + if (PyList_GET_SIZE(argchar) != 3) + goto argerr; + for (i = 0; i < 3; i ++) + argelems[i] = PyList_GET_ITEM(argchar, i); + } + else if (PyTuple_Check(argchar)) { + if (PyTuple_GET_SIZE(argchar) != 3) + goto argerr; + for (i = 0; i < 3; i ++) + argelems[i] = PyTuple_GET_ITEM(argchar, i); + } + else { +argerr: PyErr_Format(PyExc_ValueError, "need list or tuple with 3 unicode elements"); + return NULL; + } + + for (i = 0; i < 3; i ++) { + if ((uobj = PyUnicode_AsUnicode(argelems[i])) == NULL) + goto argerr; + if (PyUnicode_GET_SIZE(argelems[i])) + elems[i] = *uobj; + else + elems[i] = NULL; + } + + if ( (elems[0] && (!isJaeum(elems[0]) || !isChosung(elems[0]))) /* Chosung validity */ + || (elems[1] && (!isMoeum(elems[1]))) /* Jungsung validity */ + || (elems[2] && (!isJaeum(elems[2]) || !isJongsung(elems[2]))) ) { + PyErr_Format(ErrorObject, "not valid jamo combination"); + return NULL; + } + + if ((!elems[0] || !elems[1]) && elems[2]) { + PyErr_Format(ErrorObject, "trying to assemble character which " + "is not in unicode map"); + return NULL; + } + else if (elems[0] && !elems[1]) { + Py_INCREF(argelems[0]); + return argelems[0]; + } + else if (elems[1] && !elems[0]) { + Py_INCREF(argelems[1]); + return argelems[1]; + } + else if (!elems[0]) { /* [Null, Null, Null] */ + Py_INCREF(UniSpace); + return UniSpace; + } + else { + Py_UNICODE code; + + code = ((getChosungOrder(elems[0]) * NJUNGSUNG) + getJungsungOrder(elems[1])) * + NJONGSUNG + getJongsungOrder(elems[2]) + HANGUL_BOTTOM; + return PyUnicode_FromUnicode(&code, 1); + } +} + +static char Py_split__doc__[] = "split(code): Disassemble hangul syllable into jamos."; + +static PyObject * +Py_split(PyObject *self, PyObject *args) +{ + Py_UNICODE *code; PyObject *r; + int codelen; + + if (!PyArg_ParseTuple(args, "u#:split", &code, &codelen)) + return NULL; + + if (codelen < 1) { + PyErr_Format(PyExc_ValueError, "need not null unicode string"); + return NULL; + } + + if (isHangulSyllable(*code)) { + Py_UNICODE cho, jung, jong; + PyObject *jongobj; + Py_UNICODE hseq, t; + + hseq = *code - HANGUL_BOTTOM; + + cho = jamo_chosung[hseq / (NJUNGSUNG*NJONGSUNG)]->code; + jung = jamo_jungsung[(hseq / NJONGSUNG) % NJUNGSUNG]->code; + + if ((t = hseq % NJONGSUNG) != NULL) { + jong = jamo_jongsung[t]->code; + jongobj = PyUnicode_FromUnicode(&jong, 1); + } else { + jongobj = UniNull; + Py_INCREF(UniNull); + } + + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, PyUnicode_FromUnicode(&cho, 1)); + PyTuple_SET_ITEM(r, 1, PyUnicode_FromUnicode(&jung, 1)); + PyTuple_SET_ITEM(r, 2, jongobj); - if (!PyArg_ParseTuple(args, "u#|z:cp949_encode", &argptr, &arglen, &errors)) + return r; + } + else if (isJaeum(*code)) { + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, PyUnicode_FromUnicode(code, 1)); + PyTuple_SET_ITEM(r, 1, UniNull); Py_INCREF(UniNull); + PyTuple_SET_ITEM(r, 2, UniNull); Py_INCREF(UniNull); + return r; + } + else if (isMoeum(*code)) { + r = PyTuple_New(3); + PyTuple_SET_ITEM(r, 0, UniNull); Py_INCREF(UniNull); + PyTuple_SET_ITEM(r, 1, PyUnicode_FromUnicode(code, 1)); + PyTuple_SET_ITEM(r, 2, UniNull); Py_INCREF(UniNull); + return r; + } + else { + PyErr_Format(ErrorObject, "not a hangul code"); return NULL; + } +} + +static char Py_conjoin__doc__[] = "conjoin(unicodestring): conjoin unicode johab string into unicode syllable string"; + +static PyObject * +Py_conjoin(PyObject *self, PyObject *args) +{ + PyObject *r; + Py_UNICODE *code, *dst, *dstorg, c; + int cho, jung, jong; + int codelen, i; - errtype = error_type(errors); - if (errtype == error_undef) + if (!PyArg_ParseTuple(args, "u#:conjoin", &code, &codelen)) return NULL; - destcur = destptr = PyMem_New(unsigned char, arglen*2+1); - for (srccur = argptr, srcend = argptr + arglen; srccur < srcend; srccur++) { - if (*srccur <= 0x7F) - *(destcur++) = *srccur; - else { - decbuf = _ksc5601_encode(*srccur); - if (!decbuf) - decbuf = _uhc_encode(*srccur); - if(decbuf == 0) { - switch (errtype) { - case error_strict: - PyMem_Del(destptr); - PyErr_Format(PyExc_UnicodeError, - "CP949 encoding error: invalid character \\u%04x", - *srccur); - return NULL; - break; - case error_replace: - *(destcur++) = 0xa1; - *(destcur++) = 0xa1; - break; - /* case error_ignore: break; */ + dstorg = dst = PyMem_New(Py_UNICODE, codelen); + + for (i = 0; i < codelen; i++) { + c = code[i]; + if ((JBASE_CHOSUNG <= c && c <= 0x1112) || c == CHOSUNG_FILLER) { + if (codelen > i+1 && JUNGSUNG_FILLER <= code[i+1] && code[i+1] <= 0x1175) { + if (c == CHOSUNG_FILLER) cho = -1; + else cho = c - JBASE_CHOSUNG; + if (code[i+1] == JUNGSUNG_FILLER) jung = -1; + else jung = code[i+1] - JBASE_JUNGSUNG; + + if (codelen > i+2 && JBASE_JONGSUNG <= code[i+2] && code[i+2] <= 0x11c2) { + jong = code[i+2] - JBASE_JONGSUNG + 1; + i += 2; + } + else { + jong = 0; i++; + } + + if (jong && (cho == -1 || jung == -1)) { /* can't trans to syllable */ + if (cho >= 0) *(dst++) = jamo_chosung[cho]->code; + if (jung >= 0) *(dst++) = jamo_jungsung[jung]->code; + *(dst++) = jamo_jongsung[jong]->code; } - } else { - *(destcur++) = decbuf[0]; - *(destcur++) = decbuf[1]; + else if (cho == -1) /* jungsung only */ + *(dst++) = jamo_jungsung[jung]->code; + else if (jung == -1) /* chosung only */ + *(dst++) = jamo_chosung[cho]->code; + else /* full set */ + *(dst++) = HANGUL_BOTTOM + (cho * NJUNGSUNG + jung) * NJONGSUNG + jong; } + else if (c != CHOSUNG_FILLER) /* chosung only */ + *(dst++) = jamo_chosung[c-JBASE_CHOSUNG]->code; } + else if (JBASE_JUNGSUNG <= c && c <= 0x1175) /* jungsung only */ + *(dst++) = jamo_jungsung[c-JBASE_JUNGSUNG]->code; + else + *(dst++) = c; } - r = codec_tuple(PyString_FromStringAndSize((char*)destptr, destcur - destptr), arglen); - PyMem_Del(destptr); + r = PyUnicode_FromUnicode(dstorg, dst-dstorg); + PyMem_Del(dstorg); + return r; } -#endif +static char Py_disjoint__doc__[] = "disjoint(unicodestring): disjoint unicode syllable string into unicode johab string"; + +static PyObject * +Py_disjoint(PyObject *self, PyObject *args) +{ + Py_UNICODE *code, *dst, *dstorg, c; + PyObject *r; + int codelen, i; + + if (!PyArg_ParseTuple(args, "u#:split", &code, &codelen)) + return NULL; + + dstorg = dst = PyMem_New(Py_UNICODE, codelen*3); + + for (i = 0; i < codelen; i++) { + c = code[i]; + if (isHangulSyllable(c)) { + int hseq; + Py_UNICODE jong; + + hseq = c - HANGUL_BOTTOM; + jong = hseq % NJONGSUNG; + + *(dst++) = hseq / (NJUNGSUNG * NJONGSUNG) + JBASE_CHOSUNG; + *(dst++) = (hseq / NJONGSUNG) % NJUNGSUNG + JBASE_JUNGSUNG; + if (jong) + *(dst++) = jong + JBASE_JONGSUNG - 1; + } + else if (isJaeum(c) && isChosung(c)) { + *(dst++) = getChosungOrder(c) + JBASE_CHOSUNG; + *(dst++) = JUNGSUNG_FILLER; + } + else if (isMoeum(c)) { + *(dst++) = CHOSUNG_FILLER; + *(dst++) = getJungsungOrder(c) + JBASE_JUNGSUNG; + } else + *(dst++) = c; + } + + r = PyUnicode_FromUnicode(dstorg, dst-dstorg); + PyMem_Del(dstorg); + + return r; +} + /* List of methods defined in the module */ #define meth(name, func, doc) {name, (PyCFunction)func, METH_VARARGS, doc} static struct PyMethodDef hangul_methods[] = { - meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), - meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), + meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), + meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), + meth("ishangul", Py_ishangul, Py_ishangul__doc__), + meth("join", Py_join, Py_join__doc__), + meth("split", Py_split, Py_split__doc__), + meth("conjoin", Py_conjoin, Py_conjoin__doc__), + meth("disjoint", Py_disjoint, Py_disjoint__doc__), {NULL, NULL}, }; @@ -230,6 +453,11 @@ /* Create the module and add the functions */ m = Py_InitModule("hangul", hangul_methods); + UniNull = PyUnicode_FromUnicode(NULL, 0); + tuni[0] = 0x3000; /* Unicode Double-wide Space */ + UniSpace = PyUnicode_FromUnicode(tuni, 1); + Py_INCREF(UniSpace); + /* Add some symbolic constants to the module */ d = PyModule_GetDict(m); SET_INTCONSTANT(d, NCHOSUNG); @@ -253,7 +481,9 @@ PyDict_SetItemString(d, "Chosung", Chosung); PyDict_SetItemString(d, "Jungsung", Jungsung); PyDict_SetItemString(d, "Jongsung", Jongsung); - PyList_SET_ITEM(Jongsung, cur_jong++, PyUnicode_FromUnicode(NULL, 0)); + jamo_jongsung[cur_jong] = NULL; + Py_INCREF(UniNull); + PyList_SET_ITEM(Jongsung, cur_jong++, UniNull); /* Create Jaeum and Moeum meta class */ JaeumDict = PyDict_New(); @@ -294,20 +524,27 @@ PyDict_SetItemString(d, jamo->name, unijamo); Py_INCREF(unijamo); /* PuTyple_SET_ITEM steals reference */ - if (jamo->flags & F_JAEUM) { + if (isJaeum(jamo->code)) { PyTuple_SET_ITEM(JaeumCodes, cur_jaeum++, unijamo); - if (jamo->flags & F_CHOSUNG) { + if (isChosung(jamo->code)) { + jamo->orders[0] = cur_cho; + jamo_chosung[cur_cho] = jamo; PyList_SET_ITEM(Chosung, cur_cho++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } - if (jamo->flags & F_JONGSUNG) { + if (isJongsung(jamo->code)) { + jamo->orders[2] = cur_jong; + jamo_jongsung[cur_jong] = jamo; PyList_SET_ITEM(Jongsung, cur_jong++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } multicls = JaeumMulti; - } else { /* Moeum */ + } + else { /* Moeum */ PyTuple_SET_ITEM(MoeumCodes, cur_moeum++, unijamo); - if (jamo->flags & F_JUNGSUNG) { + if (isJungsung(jamo->code)) { + jamo->orders[1] = cur_jung; + jamo_jungsung[cur_jung] = jamo; PyList_SET_ITEM(Jungsung, cur_jung++, unijamo); PyDict_SetItemString(MoeumDict, jamo->name, unijamo); } @@ -347,8 +584,12 @@ PyDict_SetItemString(d, "CHOSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = JUNGSUNG_FILLER; PyDict_SetItemString(d, "JUNGSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); + PyDict_SetItemString(d, "Null", UniNull); PyDict_SetItemString(d, "version", PyString_FromString(version)); + + ErrorObject = PyErr_NewException("hangul.UnicodeHangulError", NULL, NULL); + PyDict_SetItemString(d, "UnicodeHangulError", ErrorObject); /* Check for errors */ if (PyErr_Occurred()) |
From: Chang <pe...@us...> - 2002-04-25 03:46:37
|
perky 02/04/24 20:46:34 Modified: korean/python hangul.py Log: - Clean up namespace - Change hangul.split's return type to Tuple (make compatible with c.hangul) Revision Changes Path 1.4 +8 -7 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- hangul.py 24 Apr 2002 07:20:27 -0000 1.3 +++ hangul.py 25 Apr 2002 03:46:34 -0000 1.4 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.3 2002/04/24 07:20:27 perky Exp $ +# $Id: hangul.py,v 1.4 2002/04/25 03:46:34 perky Exp $ # class UnicodeHangulError(Exception): @@ -83,6 +83,7 @@ for name, code in Jaeum.__dict__.items() + Moeum.__dict__.items(): if name.isupper() and len(name) <= 3: exec "%s = %s" % (name, repr(code)) +del name, code isJaeum = lambda c: c in Jaeum.Codes isMoeum = lambda c: c in Moeum.Codes @@ -105,7 +106,7 @@ code in Moeum.Codes ) -# Alternative Suffixes +# Alternative Suffixes : do not use outside ALT_SUFFIXES = { u'\uc744': (u'\ub97c', u'\uc744'), # reul, eul u'\ub97c': (u'\ub97c', u'\uc744'), # reul, eul @@ -117,7 +118,7 @@ u'\uacfc': (u'\uc640', u'\uacfc'), # wa, gwa } -# Ida-Varitaion Suffixes +# Ida-Varitaion Suffixes : do not use outside IDA_SUFFIXES = { u'(\uc774)': (u'', u'\uc774'), # (yi)da u'(\uc785)': (17, u'\uc785'), # (ip)nida @@ -143,16 +144,16 @@ if len(code) != 1 or not ishangul(code): raise UnicodeHangulError("needs 1 hangul letter") if code in Jaeum.Codes: - return [code, Null, Null] + return (code, Null, Null) if code in Moeum.Codes: - return [Null, code, Null] + return (Null, code, Null) code = ord(code) - 0xac00 - return [ + return ( Chosung[int(code / (NJUNGSUNG*NJONGSUNG))], # Python3000 safe Jungsung[int(code / NJONGSUNG) % NJUNGSUNG], Jongsung[code % NJONGSUNG] - ] + ) def conjoin(s): obuff = [] |
From: Chang <pe...@us...> - 2002-04-25 03:46:37
|
perky 02/04/24 20:46:35 Modified: test test_hangul.py Log: - Clean up namespace - Change hangul.split's return type to Tuple (make compatible with c.hangul) Revision Changes Path 1.5 +4 -4 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_hangul.py 24 Apr 2002 07:20:16 -0000 1.4 +++ test_hangul.py 25 Apr 2002 03:46:35 -0000 1.5 @@ -8,11 +8,11 @@ def test_joinsplit(self): self.assertEqual(join([Jaeum.J, Moeum.WA, Jaeum.L]), u'\uc894') self.assertEqual(join([Jaeum.JJ, Null, Null]), u'\u3149') - self.assertEqual(join([Null, Moeum.YI, Null]), u'\u3162') + self.assertEqual(join((Null, Moeum.YI, Null)), u'\u3162') - self.assertEqual(split(u'\uc894'), [Jaeum.J, Moeum.WA, Jaeum.L]) - self.assertEqual(split(u'\u3149'), [Jaeum.JJ, Null, Null]) - self.assertEqual(split(u'\u3162'), [Null, Moeum.YI, Null]) + self.assertEqual(split(u'\uc894'), (Jaeum.J, Moeum.WA, Jaeum.L)) + self.assertEqual(split(u'\u3149'), (Jaeum.JJ, Null, Null)) + self.assertEqual(split(u'\u3162'), (Null, Moeum.YI, Null)) def test_dividestring(self): self.assertEqual( |
From: Chang <pe...@us...> - 2002-04-24 14:17:01
|
perky 02/04/24 07:16:56 Modified: src Setup.in Added: src hangul.c Log: - Add ROUGH implementation of korean.c.hangul module Revision Changes Path 1.2 +1 -0 KoreanCodecs/src/Setup.in Index: Setup.in =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/src/Setup.in,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- Setup.in 14 Mar 2002 21:10:53 -0000 1.1 +++ Setup.in 24 Apr 2002 14:16:56 -0000 1.2 @@ -1,2 +1,3 @@ *shared* _koco _koco.c +hangul hangul.c 1.1 KoreanCodecs/src/hangul.c Index: hangul.c =================================================================== /* * hangul.c * * KoreanCodecs Hangul Module C Implementation * * Author : Hye-Shik Chang <pe...@fa...> * Date : $Date: 2002/04/24 14:16:56 $ * Created : 25 April 2002 * * $Revision: 1.1 $ */ static char *version = "$Id: hangul.c,v 1.1 2002/04/24 14:16:56 perky Exp $"; #include "Python.h" enum { /* Jaeum Codes on U+3100 */ G = 0x3131, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B, BB, BS, S, SS, NG, J, JJ, C, K, T, P, H }; enum { /* Moeum Codes on U+3100 */ A = 0x314f, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO, U, WEO, WE, WI, YU, EU, YI, I }; #define NCHOSUNG 19 #define NJUNGSUNG 21 #define NJONGSUNG 28 #define NJAEUM 30 #define NMOEUM 21 #define JAEUM_BOTTOM G #define JAEUM_TOP H #define MOEUM_BOTTOM A #define MOEUM_TOP I #define HANGUL_BOTTOM 0xac00 #define HANGUL_TOP 0xd7a3 #define JBASE_CHOSUNG 0x1100 #define JBASE_JUNGSUNG 0x1161 #define JBASE_JONGSUNG 0x11A8 #define CHOSUNG_FILLER 0x115f #define JUNGSUNG_FILLER 0x1160 #define F_JAEUM 0x01 #define F_MOEUM 0x02 #define F_CHOSUNG 0x04 #define F_JUNGSUNG 0x08 #define F_JONGSUNG 0x10 #define MAX_MULTIJAMO 3 typedef struct _jamotype { char *name; Py_UNICODE code; int multi[MAX_MULTIJAMO]; int flags; } jamotype; #define CODE(c) #c,c #define NOMULTI {0,0,0} #define JC (F_JAEUM | F_CHOSUNG) #define JJ (F_JAEUM | F_JONGSUNG) #define JCJ (F_JAEUM | F_CHOSUNG | F_JONGSUNG) #define MJ (F_MOEUM | F_JUNGSUNG) jamotype jamos[] = { /* JAEUM */ { CODE(G), NOMULTI, JCJ }, { CODE(GG), {G, G,}, JCJ }, { CODE(GS), {G, S,}, JJ }, { CODE(N), NOMULTI, JCJ }, { CODE(NJ), {N, J,}, JJ }, { CODE(NH), {N, H,}, JJ }, { CODE(D), NOMULTI, JCJ }, { CODE(DD), {D, D,}, JC }, { CODE(L), NOMULTI, JCJ }, { CODE(LG), {L, G,}, JJ }, { CODE(LM), {L, M,}, JJ }, { CODE(LB), {L, B,}, JJ }, { CODE(LS), {L, S,}, JJ }, { CODE(LT), {L, T,}, JJ }, { CODE(LP), {L, P,}, JJ }, { CODE(LH), {L, H,}, JJ }, { CODE(M), NOMULTI, JCJ }, { CODE(B), NOMULTI, JCJ }, { CODE(BB), {B, B,}, JC }, { CODE(BS), {B, S,}, JJ }, { CODE(S), NOMULTI, JCJ }, { CODE(SS), {S, S,}, JCJ }, { CODE(NG), NOMULTI, JCJ }, { CODE(J), NOMULTI, JCJ }, { CODE(JJ), {J, J,}, JC }, { CODE(C), NOMULTI, JCJ }, { CODE(K), NOMULTI, JCJ }, { CODE(T), NOMULTI, JCJ }, { CODE(P), NOMULTI, JCJ }, { CODE(H), NOMULTI, JCJ }, /* MOEUM */ { CODE(A), NOMULTI, MJ }, { CODE(AE), {A, I,}, MJ }, { CODE(YA), NOMULTI, MJ }, { CODE(YAE), {YA,I}, MJ }, { CODE(EO), NOMULTI, MJ }, { CODE(E), NOMULTI, MJ }, { CODE(YEO), NOMULTI, MJ }, { CODE(YE), {YEO,I}, MJ }, { CODE(O), NOMULTI, MJ }, { CODE(WA), {O, A}, MJ }, { CODE(WAE), {O,A,I}, MJ }, { CODE(OE), {O, I}, MJ }, { CODE(YO), NOMULTI, MJ }, { CODE(U), NOMULTI, MJ }, { CODE(WEO), {U, EO}, MJ }, { CODE(WE), {U, E}, MJ }, { CODE(WI), {U, I}, MJ }, { CODE(YU), NOMULTI, MJ }, { CODE(EU), NOMULTI, MJ }, { CODE(YI), {EU, I}, MJ }, { CODE(I), NOMULTI, MJ }, /* END MARKER */ { 0, 0, NOMULTI, 0 }, }; #undef JC, JJ, JCJ, MJ, NOMULTI, CODE #define isJaeum(c) (JAEUM_BOTTOM <= (c) && (c) <= JAEUM_TOP) #define isMoeum(c) (MOEUM_BOTTOM <= (c) && (c) <= MOEUM_TOP) static char Py_isJaeum__doc__[] = "isJaeum(code): Verify whether the code is Jaeum."; static PyObject * Py_isJaeum(PyObject *self, PyObject *args) { Py_UNICODE *code; int codelen; if (!PyArg_ParseTuple(args, "u#:isJaeum", &code, &codelen)) return NULL; if (codelen < 1) { PyErr_Format(PyExc_ValueError, "need not null unicode string"); return NULL; } if (isJaeum(*code)) { Py_INCREF(Py_True); return Py_True; } else { Py_INCREF(Py_False); return Py_False; } } static char Py_isMoeum__doc__[] = "isMoeum(code): Verify whether the code is Moeum."; static PyObject * Py_isMoeum(PyObject *self, PyObject *args) { Py_UNICODE *code; int codelen; if (!PyArg_ParseTuple(args, "u#:isMoeum", &code, &codelen)) return NULL; if (codelen < 1) { PyErr_Format(PyExc_ValueError, "need not null unicode string"); return NULL; } if (isMoeum(*code)) { Py_INCREF(Py_True); return Py_True; } else { Py_INCREF(Py_False); return Py_False; } } #if 0 static char cp949_encode__doc__[] = "CP949 encoder"; static PyObject * cp949_encode(PyObject *self, PyObject *args) { Py_UNICODE *argptr, *srccur, *srcend; int arglen, errtype = error_strict; char *errors = NULL; unsigned char *destptr, *destcur, *decbuf; PyObject *r; if (!PyArg_ParseTuple(args, "u#|z:cp949_encode", &argptr, &arglen, &errors)) return NULL; errtype = error_type(errors); if (errtype == error_undef) return NULL; destcur = destptr = PyMem_New(unsigned char, arglen*2+1); for (srccur = argptr, srcend = argptr + arglen; srccur < srcend; srccur++) { if (*srccur <= 0x7F) *(destcur++) = *srccur; else { decbuf = _ksc5601_encode(*srccur); if (!decbuf) decbuf = _uhc_encode(*srccur); if(decbuf == 0) { switch (errtype) { case error_strict: PyMem_Del(destptr); PyErr_Format(PyExc_UnicodeError, "CP949 encoding error: invalid character \\u%04x", *srccur); return NULL; break; case error_replace: *(destcur++) = 0xa1; *(destcur++) = 0xa1; break; /* case error_ignore: break; */ } } else { *(destcur++) = decbuf[0]; *(destcur++) = decbuf[1]; } } } r = codec_tuple(PyString_FromStringAndSize((char*)destptr, destcur - destptr), arglen); PyMem_Del(destptr); return r; } #endif /* List of methods defined in the module */ #define meth(name, func, doc) {name, (PyCFunction)func, METH_VARARGS, doc} static struct PyMethodDef hangul_methods[] = { meth("isJaeum", Py_isJaeum, Py_isJaeum__doc__), meth("isMoeum", Py_isMoeum, Py_isMoeum__doc__), {NULL, NULL}, }; #define SET_INTCONSTANT(dict, value) \ PyDict_SetItemString(dict, #value, PyInt_FromLong((long) value)) #define SET_STRCONSTANT(dict, value) \ PyDict_SetItemString(dict, #value, PyString_FromString(value)) #define SET_CHARCONSTANT(dict, value) \ PyDict_SetItemString(dict, #value, PyString_FromFormat("%c", value)) /* Initialization function for the module */ void inithangul(void) { PyObject *m, *d, *tmp; Py_UNICODE tuni[2]; int i; /* Create the module and add the functions */ m = Py_InitModule("hangul", hangul_methods); /* Add some symbolic constants to the module */ d = PyModule_GetDict(m); SET_INTCONSTANT(d, NCHOSUNG); SET_INTCONSTANT(d, NJUNGSUNG); SET_INTCONSTANT(d, NJONGSUNG); { PyObject *Chosung, *Jungsung, *Jongsung; PyObject *Jaeum, *Moeum; PyObject *JaeumDict, *MoeumDict; PyObject *JaeumCodes, *MoeumCodes; PyObject *JaeumMulti, *MoeumMulti; int cur_cho, cur_jung, cur_jong; int cur_jaeum, cur_moeum; jamotype *jamo; /* Bind Chosung, Jungsung, Jongsung lists */ cur_cho = cur_jung = cur_jong = 0; Chosung = PyList_New(NCHOSUNG); Jungsung = PyList_New(NJUNGSUNG); Jongsung = PyList_New(NJONGSUNG); PyDict_SetItemString(d, "Chosung", Chosung); PyDict_SetItemString(d, "Jungsung", Jungsung); PyDict_SetItemString(d, "Jongsung", Jongsung); PyList_SET_ITEM(Jongsung, cur_jong++, PyUnicode_FromUnicode(NULL, 0)); /* Create Jaeum and Moeum meta class */ JaeumDict = PyDict_New(); MoeumDict = PyDict_New(); tmp = PyString_FromString("Jaeum"); Jaeum = PyClass_New(NULL, JaeumDict, tmp); Py_DECREF(tmp); tmp = PyString_FromString("Moeum"); Moeum = PyClass_New(NULL, MoeumDict, tmp); Py_DECREF(tmp); /* Bind meta class members */ PyDict_SetItemString(d, "Jaeum", Jaeum); PyDict_SetItemString(d, "Moeum", Moeum); PyDict_SetItemString(JaeumDict, "Chosung", Chosung); PyDict_SetItemString(MoeumDict, "Jungsung", Jungsung); PyDict_SetItemString(JaeumDict, "Jongsung", Jongsung); /* Create Jaeum and Moeum Members */ JaeumCodes = PyTuple_New(NJAEUM); MoeumCodes = PyTuple_New(NMOEUM); JaeumMulti = PyDict_New(); MoeumMulti = PyDict_New(); cur_jaeum = cur_moeum = 0; PyDict_SetItemString(JaeumDict, "Codes", JaeumCodes); PyDict_SetItemString(MoeumDict, "Codes", MoeumCodes); PyDict_SetItemString(JaeumDict, "Width", PyInt_FromLong(NJAEUM)); PyDict_SetItemString(MoeumDict, "Width", PyInt_FromLong(NMOEUM)); PyDict_SetItemString(JaeumDict, "MultiElement", JaeumMulti); PyDict_SetItemString(MoeumDict, "MultiElement", MoeumMulti); for (jamo = jamos; jamo->name; jamo++) { PyObject *unijamo, *multicls; int tuplen; tuni[0] = jamo->code; unijamo = PyUnicode_FromUnicode(tuni, 1); PyDict_SetItemString(d, jamo->name, unijamo); Py_INCREF(unijamo); /* PuTyple_SET_ITEM steals reference */ if (jamo->flags & F_JAEUM) { PyTuple_SET_ITEM(JaeumCodes, cur_jaeum++, unijamo); if (jamo->flags & F_CHOSUNG) { PyList_SET_ITEM(Chosung, cur_cho++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } if (jamo->flags & F_JONGSUNG) { PyList_SET_ITEM(Jongsung, cur_jong++, unijamo); PyDict_SetItemString(JaeumDict, jamo->name, unijamo); } multicls = JaeumMulti; } else { /* Moeum */ PyTuple_SET_ITEM(MoeumCodes, cur_moeum++, unijamo); if (jamo->flags & F_JUNGSUNG) { PyList_SET_ITEM(Jungsung, cur_jung++, unijamo); PyDict_SetItemString(MoeumDict, jamo->name, unijamo); } multicls = MoeumMulti; } if (jamo->multi[0]) { tuplen = jamo->multi[2] ? 3 : 2; tmp = PyTuple_New(tuplen); for (i = 0; i < tuplen; i++) { tuni[0] = jamo->multi[i]; PyTuple_SET_ITEM(tmp, i, PyUnicode_FromUnicode(tuni, 1)); } PyDict_SetItem(multicls, unijamo, tmp); Py_DECREF(tmp); } } Py_DECREF(JaeumDict); Py_DECREF(MoeumDict); } tmp = PyTuple_New(2); tuni[0] = HANGUL_BOTTOM; PyTuple_SET_ITEM(tmp, 0, PyUnicode_FromUnicode(tuni, 1)); tuni[0] = HANGUL_TOP; PyTuple_SET_ITEM(tmp, 1, PyUnicode_FromUnicode(tuni, 1)); PyDict_SetItemString(d, "ZONE", tmp); Py_DECREF(tmp); tuni[0] = JBASE_CHOSUNG; PyDict_SetItemString(d, "JBASE_CHOSUNG", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = JBASE_JUNGSUNG; PyDict_SetItemString(d, "JBASE_JUNGSUNG", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = JBASE_JONGSUNG; PyDict_SetItemString(d, "JBASE_JONGSUNG", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = CHOSUNG_FILLER; PyDict_SetItemString(d, "CHOSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); tuni[0] = JUNGSUNG_FILLER; PyDict_SetItemString(d, "JUNGSUNG_FILLER", PyUnicode_FromUnicode(tuni, 1)); PyDict_SetItemString(d, "version", PyString_FromString(version)); /* Check for errors */ if (PyErr_Occurred()) Py_FatalError("can't initialize the hangul module"); } |
From: Chang <pe...@us...> - 2002-04-24 07:46:40
|
perky 02/04/24 00:46:38 Modified: . Makefile ChangeLog Log: - Remove -t option from generating changelog Revision Changes Path 1.2 +2 -2 KoreanCodecs/Makefile Index: Makefile =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/Makefile,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- Makefile 17 Apr 2002 10:10:43 -0000 1.1 +++ Makefile 24 Apr 2002 07:46:38 -0000 1.2 @@ -3,7 +3,7 @@ # # by Hye-Shik Chang <pe...@fa...> # -# $Id: Makefile,v 1.1 2002/04/17 10:10:43 perky Exp $ +# $Id: Makefile,v 1.2 2002/04/24 07:46:38 perky Exp $ # CVS2CL= /usr/local/bin/cvs2cl @@ -13,7 +13,7 @@ rm -rf build log: - ${CVS2CL} -f ChangeLog --accum -r -b -t -S --no-wrap -U mk/unames + ${CVS2CL} -f ChangeLog --accum -r -b -S --no-wrap -U mk/unames dist: ChangeLog ${PYTHON} setup.py sdist 1.2 +63 -2 KoreanCodecs/ChangeLog Index: ChangeLog =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/ChangeLog,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- ChangeLog 17 Apr 2002 10:10:43 -0000 1.1 +++ ChangeLog 24 Apr 2002 07:46:38 -0000 1.2 @@ -1,6 +1,67 @@ ------------------------------------------------------------------------------ -Version 2.0.3 (2002-04-xx) +2002-04-24 16:38 Hye-Shik Chang <pe...@fa...> + * korean/python/unijohab.py (1.4): + + - Change unijohab implementation to use newly introduced hangul.conjoin + and hangul.disjoint + + Reviewed by: unittest ;) + +2002-04-24 16:20 Hye-Shik Chang <pe...@fa...> + + * test/test_hangul.py (1.4), korean/python/hangul.py (1.3): + + - Add hangul.conjoin and hangul.disjoint functions + (this function set provides converter between U+AC00 and U+1100 pages) + +2002-04-24 14:00 Hye-Shik Chang <pe...@fa...> + + * korean/python/hangul.py (1.2): + + - Simpilify join, split function implementation. + +2002-04-24 13:19 Hye-Shik Chang <pe...@fa...> + + * test/test_hangul.py (1.3): + + - Add hangul format string test units + +2002-04-24 12:36 Hye-Shik Chang <pe...@fa...> + + * korean/python/hangul.py (1.1): + + - Move hangul python implementation into python/ + - Added hangul.format, the hangul adaptive formatter + +2002-04-19 06:45 Hye-Shik Chang <pe...@fa...> + + * src/_koco.c (1.12): + + - Make compatible with Intel C/C++ Compiler + +2002-04-17 19:56 Hye-Shik Chang <pe...@fa...> + + * test/test_hangul.py (1.2): + + - Sync with korean.hangul's name changes + +2002-04-17 19:43 Hye-Shik Chang <pe...@fa...> + + * setup.py (1.11), korean/__init__.py (1.4), korean/aliases.py + (1.1), misc/KoreanCodecs.pth (1.2), misc/korean.pth (1.1): + + (Catch up JapaneseCodecs 1.4.5's style) + + - Change pth filename same to package name + - Add --without-aliases option onto setup.py + - Move alias codes from __init__.py to aliases.py + +2002-04-17 19:10 Hye-Shik Chang <pe...@fa...> + + * ChangeLog (1.1), Makefile (1.1), mk/unames (1.1): + + - Add tools for generate ChangeLog + 2002-04-12 07:04 Hye-Shik Chang <pe...@fa...> * korean/python/qwerty2bul.py (1.4): |
From: Chang <pe...@us...> - 2002-04-24 07:38:13
|
perky 02/04/24 00:38:11 Modified: korean/python unijohab.py Log: - Change unijohab implementation to use newly introduced hangul.conjoin and hangul.disjoint Reviewed by: unittest ;) Revision Changes Path 1.4 +4 -92 KoreanCodecs/korean/python/unijohab.py Index: unijohab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/unijohab.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- unijohab.py 8 Apr 2002 12:49:57 -0000 1.3 +++ unijohab.py 24 Apr 2002 07:38:11 -0000 1.4 @@ -1,116 +1,28 @@ # Hye-Shik Chang <16 Feb 2002> -# $Id: unijohab.py,v 1.3 2002/04/08 12:49:57 perky Exp $ +# $Id: unijohab.py,v 1.4 2002/04/24 07:38:11 perky Exp $ import codecs - -from korean.hangul import Jaeum, Moeum, ishangul, split, join -encmap, decmap = {}, {} - -johab2uni_chosung = { - u'\u115f': u'', u'\u1100': Jaeum.G, u'\u1101': Jaeum.GG, - u'\u1102': Jaeum.N, u'\u1103': Jaeum.D, u'\u1104': Jaeum.DD, - u'\u1105': Jaeum.L, u'\u1106': Jaeum.M, u'\u1107': Jaeum.B, - u'\u1108': Jaeum.BB, u'\u1109': Jaeum.S, u'\u110a': Jaeum.SS, - u'\u110b': Jaeum.NG, u'\u110c': Jaeum.J, u'\u110d': Jaeum.JJ, - u'\u110e': Jaeum.C, u'\u110f': Jaeum.K, u'\u1110': Jaeum.T, - u'\u1111': Jaeum.P, u'\u1112': Jaeum.H -} -johab2uni_jungsung = { - u'\u1160': u'', u'\u1161': Moeum.A, u'\u1162': Moeum.AE, - u'\u1163': Moeum.YA, u'\u1164': Moeum.YAE, u'\u1165': Moeum.EO, - u'\u1166': Moeum.E, u'\u1167': Moeum.YEO, u'\u1168': Moeum.YE, - u'\u1169': Moeum.O, u'\u116a': Moeum.WA, u'\u116b': Moeum.WAE, - u'\u116c': Moeum.OE, u'\u116d': Moeum.YO, u'\u116e': Moeum.U, - u'\u116f': Moeum.WEO, u'\u1170': Moeum.WE, u'\u1171': Moeum.WI, - u'\u1172': Moeum.YU, u'\u1173': Moeum.EU, u'\u1174': Moeum.YI, - u'\u1175': Moeum.I -} -johab2uni_jongsung = { - u'': u'', u'\u11a8': Jaeum.G, u'\u11a9': Jaeum.GG, - u'\u11aa': Jaeum.GS, u'\u11ab': Jaeum.N, u'\u11ac': Jaeum.NJ, - u'\u11ad': Jaeum.NH, u'\u11ae': Jaeum.D, u'\u11af': Jaeum.L, - u'\u11b0': Jaeum.LG, u'\u11b1': Jaeum.LM, u'\u11b2': Jaeum.LB, - u'\u11b3': Jaeum.LS, u'\u11b4': Jaeum.LT, u'\u11b5': Jaeum.LP, - u'\u11b6': Jaeum.LH, u'\u11b7': Jaeum.M, u'\u11b8': Jaeum.B, - u'\u11b9': Jaeum.BS, u'\u11ba': Jaeum.S, u'\u11bb': Jaeum.SS, - u'\u11bc': Jaeum.NG, u'\u11bd': Jaeum.J, u'\u11be': Jaeum.C, - u'\u11bf': Jaeum.K, u'\u11c0': Jaeum.T, u'\u11c1': Jaeum.P, - u'\u11c2': Jaeum.H -} - -uni2johab_chosung = {} -uni2johab_jungsung = {} -uni2johab_jongsung = {} -for k, v in johab2uni_chosung.items(): - uni2johab_chosung[v] = k -for k, v in johab2uni_jungsung.items(): - uni2johab_jungsung[v] = k -for k, v in johab2uni_jongsung.items(): - uni2johab_jongsung[v] = k - +from korean.hangul import ishangul, disjoint, conjoin class Codec(codecs.Codec): # Unicode to character buffer def encode(self, data, errors='strict', supported_errors=('strict', 'ignore', 'replace')): - global encmap if errors not in supported_errors: raise UnicodeError, "unknown error handling" - buffer = [] - for c in data: - if ishangul(c): - cho, jung, jong = split(c) # all hangul can success - buffer.append( - uni2johab_chosung[cho] + - uni2johab_jungsung[jung] + - uni2johab_jongsung[jong] - ) - else: - buffer.append(c) - - return (u''.join(buffer).encode('utf-8', errors), len(data)) + return disjoint(data).encode('utf-8', errors), len(data) # character buffer to Unicode def decode(self, data, errors='strict', supported_errors=('strict', 'ignore', 'replace')): - global decmap if errors not in supported_errors: raise UnicodeError, "unknown error handling" - buffer = [] - data = unicode(data, 'utf-8', errors) - size = len(data) - p = 0 - while p < size: - if not u'\u1100' <= data[p] <= u'\u11FF': - buffer.append(data[p]) - p += 1 - else: - c = data[p:p+3] - try: - cho = johab2uni_chosung[c[0]] - jung = johab2uni_jungsung[c[1]] - if len(c)>2 and johab2uni_jongsung.has_key(c[2]): - jong = johab2uni_jongsung[c[2]] - p += 3 # this must locate end of this block - else: - jong = u'' - p += 2 # too. - except: - if errors == 'replace': - buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER - elif errors == 'strict': - raise UnicodeError, "unexpected byte \\u%04x found" % ord(c[0]) - p += 1 - else: - buffer.append(join([cho, jung, jong])) - - return (u''.join(buffer), size) - + return conjoin(unicode(data, 'utf-8', errors)), len(data) class StreamWriter(Codec, codecs.StreamWriter): pass |
From: Chang <pe...@us...> - 2002-04-24 07:33:13
|
perky 02/04/24 00:20:27 Modified: korean/python hangul.py Log: - Add hangul.conjoin and hangul.disjoint functions (this function set provides converter between U+AC00 and U+1100 pages) Revision Changes Path 1.3 +58 -9 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- hangul.py 24 Apr 2002 05:00:03 -0000 1.2 +++ hangul.py 24 Apr 2002 07:20:27 -0000 1.3 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.2 2002/04/24 05:00:03 perky Exp $ +# $Id: hangul.py,v 1.3 2002/04/24 07:20:27 perky Exp $ # class UnicodeHangulError(Exception): @@ -30,7 +30,7 @@ Null = u'' -class Jaeum: # XXX: 1100-1159 Old Jaeum need? +class Jaeum: Codes = (u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136', # G GG GS N NJ NH @@ -56,7 +56,7 @@ } -class Moeum: # XXX: 1161-117f Old Moeum need? +class Moeum: Codes = (u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154', # A AE YA YAE EO E @@ -75,7 +75,6 @@ OE: (O, I), WEO: (U, EO), WE: (U, E), WI: (U, I), YI: (EU, I) } - # Aliases for your convinience Chosung = Jaeum.Chosung Jungsung = Moeum.Jungsung @@ -89,14 +88,19 @@ isMoeum = lambda c: c in Moeum.Codes # Unicode Hangul Syllables Characteristics -zone = (u'\uAC00', u'\uD7A3') +ZONE = (u'\uAC00', u'\uD7A3') NCHOSUNG = len(Chosung) NJUNGSUNG = len(Jungsung) NJONGSUNG = len(Jongsung) +JBASE_CHOSUNG = u'\u1100' +JBASE_JUNGSUNG = u'\u1161' +JBASE_JONGSUNG = u'\u11A8' +CHOSUNG_FILLER = u'\u115F' +JUNGSUNG_FILLER = u'\u1160' ishangul = ( lambda code: - zone[0] <= code <= zone[1] or + ZONE[0] <= code <= ZONE[1] or code in Jaeum.Codes or code in Moeum.Codes ) @@ -150,10 +154,55 @@ Jongsung[code % NJONGSUNG] ] -def dividestring(str, intoelements=0): - if type(str) is not type(u''): - raise UnicodeHangulError("needs unicode string") +def conjoin(s): + obuff = [] + ncur = 0 + + while ncur < len(s): + c = s[ncur] + if JBASE_CHOSUNG <= c <= u'\u1112' or c == CHOSUNG_FILLER: # starts with chosung + if len(s) > ncur+1 and JUNGSUNG_FILLER <= s[ncur+1] <= u'\u1175': + cho = Chosung[ord(c) - ord(JBASE_CHOSUNG)] + jung = Jungsung[ord(s[ncur+1]) - ord(JBASE_JUNGSUNG)] + if len(s) > ncur+2 and JBASE_JONGSUNG <= s[ncur+2] <= u'\u11C2': + jong = Jongsung[ord(s[ncur+2]) - ord(JBASE_JONGSUNG) + 1] + ncur += 2 + else: + jong = Null + ncur += 1 + obuff.append(join([cho, jung, jong])) + else: + obuff.append(join([Chosung[ord(c) - ord(JBASE_CHOSUNG)], Null, Null])) + elif JBASE_JUNGSUNG <= c <= u'\u1175': + obuff.append(join([Null, Jungsung[ord(c) - ord(JBASE_JUNGSUNG)], Null])) + else: + obuff.append(c) + ncur += 1 + + return u''.join(obuff) +def disjoint(s): + obuff = [] + for c in s: + if ishangul(c): + cho, jung, jong = split(c) + if cho: + obuff.append( unichr(ord(JBASE_CHOSUNG) + Chosung.index(cho)) ) + else: + obuff.append( CHOSUNG_FILLER ) + + if jung: + obuff.append( unichr(ord(JBASE_JUNGSUNG) + Jungsung.index(jung)) ) + else: + obuff.append( JUNGSUNG_FILLER ) + + if jong: + obuff.append( unichr(ord(JBASE_JONGSUNG) + Jongsung.index(jong) - 1) ) + else: + obuff.append(c) + return u''.join(obuff) + +def dividestring(str, intoelements=0): r = u'' for char in str: if ishangul(char): |
From: Chang <pe...@us...> - 2002-04-24 07:20:21
|
perky 02/04/24 00:20:16 Modified: test test_hangul.py Log: - Add hangul.conjoin and hangul.disjoint functions (this function set provides converter between U+AC00 and U+1100 pages) Revision Changes Path 1.4 +11 -0 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- test_hangul.py 24 Apr 2002 04:19:45 -0000 1.3 +++ test_hangul.py 24 Apr 2002 07:20:16 -0000 1.4 @@ -70,6 +70,17 @@ self.assertEqual(format(fmt, { 'int': 1, 'str': u'hmm', 'float': 3.14 }), u'1\uc785\ub2c8\ub2e4. hmm\uc740 hmm\uc5d03.14\uc640') + def test_conjoin(self): + self.assertEqual(conjoin(u'\u1112\u1161\u11ab\u1100\u1173\u11af\u110b\u1175' + u' \u110c\u1169\u11c2\u110b\u1161\u110b\u116d.'), + u'\ud55c\uae00\uc774 \uc88b\uc544\uc694.') + + def test_disjoint(self): + self.assertEqual(disjoint(u'\ub9c8\ub140\ubc30\ub2ec\ubd80 \ud0a4\ud0a4'), + u'\u1106\u1161\u1102\u1167\u1107\u1162\u1103\u1161\u11af\u1107\u116e' + u' \u110f\u1175\u110f\u1175') + + if __name__ == '__main__': import sys sys.argv.insert(1, '-v') |
From: Chang <pe...@us...> - 2002-04-24 05:00:05
|
perky 02/04/23 22:00:03 Modified: korean/python hangul.py Log: - Simpilify join, split function implementation. Revision Changes Path 1.2 +16 -16 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/hangul.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- hangul.py 24 Apr 2002 03:36:01 -0000 1.1 +++ hangul.py 24 Apr 2002 05:00:03 -0000 1.2 @@ -15,7 +15,7 @@ # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # -# $Id: hangul.py,v 1.1 2002/04/24 03:36:01 perky Exp $ +# $Id: hangul.py,v 1.2 2002/04/24 05:00:03 perky Exp $ # class UnicodeHangulError(Exception): @@ -90,9 +90,9 @@ # Unicode Hangul Syllables Characteristics zone = (u'\uAC00', u'\uD7A3') -splitters = [ ( len(Jongsung)*len(Jungsung), Chosung ), - ( len(Jongsung), Jungsung ), - ( 1, Jongsung ) ] +NCHOSUNG = len(Chosung) +NJUNGSUNG = len(Jungsung) +NJONGSUNG = len(Jongsung) ishangul = ( lambda code: @@ -127,12 +127,12 @@ if not codes[0] or not codes[1]: # single jamo return codes[0] or codes[1] - r = ord(zone[0]) - codes = codes[:] # simple copy :D - for multiplier, codeset in splitters: - r = r + multiplier*codeset.index(codes.pop(0)) - - return unichr(r) + return unichr( + 0xac00 + ( + Chosung.index(codes[0])*NJUNGSUNG + + Jungsung.index(codes[1]) + )*NJONGSUNG + Jongsung.index(codes[2]) + ) def split(code): """ Split function which splits hangul syllable into jamos """ @@ -143,12 +143,12 @@ if code in Moeum.Codes: return [Null, code, Null] - code = ord(code) - ord(zone[0]) - r = [] - for divider, codeset in splitters: - value, code = code / divider, code % divider - r.append(codeset[value]) - return r + code = ord(code) - 0xac00 + return [ + Chosung[int(code / (NJUNGSUNG*NJONGSUNG))], # Python3000 safe + Jungsung[int(code / NJONGSUNG) % NJUNGSUNG], + Jongsung[code % NJONGSUNG] + ] def dividestring(str, intoelements=0): if type(str) is not type(u''): |
From: Chang <pe...@us...> - 2002-04-24 04:19:46
|
perky 02/04/23 21:19:45 Modified: test test_hangul.py Log: - Add hangul format string test units Revision Changes Path 1.3 +35 -0 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- test_hangul.py 17 Apr 2002 10:56:16 -0000 1.2 +++ test_hangul.py 24 Apr 2002 04:19:45 -0000 1.3 @@ -34,6 +34,41 @@ self.assertEqual(ishangul(u'\uc870'), 1) self.assertEqual(ishangul(u'\u382c'), 0) + def test_format_altsuffix(self): + fmt = u'%s\ub294 %s\ub97c %s\ud55c\ub2e4.' + obj1, obj2 = u'\ud61c\uc2dd', u'\uc544\ub77c' + self.assertEqual(format(fmt, (obj1, obj2, u'\u2661')), + u'\ud61c\uc2dd\uc740 \uc544\ub77c\ub97c \u2661\ud55c\ub2e4.') + self.assertEqual(format(fmt, (obj2, obj1, u'\uc2eb\uc5b4')), + u'\uc544\ub77c\ub294 \ud61c\uc2dd\uc744 \uc2eb\uc5b4\ud55c\ub2e4.') + + fmt = u'\ud0dc\ucd08\uc5d0 %s\uc640 %s\uac00 \uc788\uc5c8\ub2e4.' + self.assertEqual(format(fmt, (obj1, obj2)), + u'\ud0dc\ucd08\uc5d0 \ud61c\uc2dd\uacfc \uc544\ub77c\uac00' + u' \uc788\uc5c8\ub2e4.') + self.assertEqual(format(fmt, (obj2, obj1)), + u'\ud0dc\ucd08\uc5d0 \uc544\ub77c\uc640 \ud61c\uc2dd\uc774' + u' \uc788\uc5c8\ub2e4.') + + obj1, obj2 = u'Julian', u'Julie' + self.assertEqual(format(fmt, (obj1, obj2)), + u'\ud0dc\ucd08\uc5d0 Julian\uacfc Julie\uac00 \uc788\uc5c8\ub2e4.') + self.assertEqual(format(fmt, (obj2, obj1)), + u'\ud0dc\ucd08\uc5d0 Julie\uc640 Julian\uc774 \uc788\uc5c8\ub2e4.') + + def test_format_idasuffix(self): + fmt = u'%s(\uc785)\ub2c8\ub2e4, %s(\uc778)\ub370, %s(\uc774)\ub2e4' + self.assertEqual(format(fmt, (u'\uc18c\uc774',)*3), + u'\uc18c\uc785\ub2c8\ub2e4, \uc18c\uc778\ub370, \uc18c\uc774\ub2e4') + self.assertEqual(format(fmt, (u'\ub2e4\ub155',)*3), + u'\ub2e4\ub155\uc785\ub2c8\ub2e4, \ub2e4\ub155\uc778\ub370,' + u' \ub2e4\ub155\uc774\ub2e4') + + def test_format_argtypes(self): + fmt = u'%(int)d(\uc785)\ub2c8\ub2e4. %(str)s\uc740 %(str)s\uc5d0' \ + u'%(float).2f\uc640' + self.assertEqual(format(fmt, { 'int': 1, 'str': u'hmm', 'float': 3.14 }), + u'1\uc785\ub2c8\ub2e4. hmm\uc740 hmm\uc5d03.14\uc640') if __name__ == '__main__': import sys |
From: Chang <pe...@us...> - 2002-04-24 03:36:03
|
perky 02/04/23 20:36:01 Added: korean/python hangul.py Log: - Move hangul python implementation into python/ - Added hangul.format, the hangul adaptive formatter Revision Changes Path 1.1 KoreanCodecs/korean/python/hangul.py Index: hangul.py =================================================================== #!/usr/local/bin/python # ex:ts=4 # # Unicode hangul abstractive controller # # written by Hye-Shik Chang <pe...@fa...> # # # Unicode Hangul Code-Area Specifications: # http://www.unicode.org/charts/PDF/UAC00.pdf # # Jamo Short Name Conventions: # http://www.unicode.org/unicode/uni2book/ch04.pdf (section 4.4) # # Conjoining Jamo Behavior: # http://www.unicode.org/unicode/uni2book/ch03.pdf (section 3.11) # # $Id: hangul.py,v 1.1 2002/04/24 03:36:01 perky Exp $ # class UnicodeHangulError(Exception): def __init__ (self, msg): self.msg = msg def __repr__ (self): return self.msg __str__ = __repr__ Null = u'' class Jaeum: # XXX: 1100-1159 Old Jaeum need? Codes = (u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136', # G GG GS N NJ NH u'\u3137', u'\u3138', u'\u3139', u'\u313a', u'\u313b', u'\u313c', # D DD L LG LM LB u'\u313d', u'\u313e', u'\u313f', u'\u3140', u'\u3141', u'\u3142', # LS LT LP LH M B u'\u3143', u'\u3144', u'\u3145', u'\u3146', u'\u3147', u'\u3148', # BB BS S SS NG J u'\u3149', u'\u314a', u'\u314b', u'\u314c', u'\u314d', u'\u314e') # JJ C K T P H Width = len(Codes) G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B, \ BB, BS, S, SS, NG, J, JJ, C, K, T, P, H = Codes Chosung = [G, GG, N, D, DD, L, M, B, BB, S, SS, NG, J, JJ, C, K, T, P, H] Jongsung = [Null, G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT, \ LP, LH, M, B, BS, S, SS, NG, J, C, K, T, P, H] MultiElement = { GG: (G, G), GS: (G, S), NJ: (N, J), NH: (N, H), DD: (D, D), LG: (L, G), LM: (L, M), LB: (L, B), LS: (L, S), LT: (L, T), LP: (L, P), LH: (L, H), BB: (B, B), BS: (B, S), SS: (S, S), JJ: (J, J) } class Moeum: # XXX: 1161-117f Old Moeum need? Codes = (u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154', # A AE YA YAE EO E u'\u3155', u'\u3156', u'\u3157', u'\u3158', u'\u3159', u'\u315a', # YEO YE O WA WAE OE u'\u315b', u'\u315c', u'\u315d', u'\u315e', u'\u315f', u'\u3160', # YO U WEO WE WI YU u'\u3161', u'\u3162', u'\u3163') # EU YI I Width = len(Codes) A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO, \ U, WEO, WE, WI, YU, EU, YI, I = Codes Jungsung = list(Codes) MultiElement = { AE: (A, I), YAE: (YA, I), YE: (YEO, I), WA: (O, A), WAE: (O, A, I), OE: (O, I), WEO: (U, EO), WE: (U, E), WI: (U, I), YI: (EU, I) } # Aliases for your convinience Chosung = Jaeum.Chosung Jungsung = Moeum.Jungsung Jongsung = Jaeum.Jongsung for name, code in Jaeum.__dict__.items() + Moeum.__dict__.items(): if name.isupper() and len(name) <= 3: exec "%s = %s" % (name, repr(code)) isJaeum = lambda c: c in Jaeum.Codes isMoeum = lambda c: c in Moeum.Codes # Unicode Hangul Syllables Characteristics zone = (u'\uAC00', u'\uD7A3') splitters = [ ( len(Jongsung)*len(Jungsung), Chosung ), ( len(Jongsung), Jungsung ), ( 1, Jongsung ) ] ishangul = ( lambda code: zone[0] <= code <= zone[1] or code in Jaeum.Codes or code in Moeum.Codes ) # Alternative Suffixes ALT_SUFFIXES = { u'\uc744': (u'\ub97c', u'\uc744'), # reul, eul u'\ub97c': (u'\ub97c', u'\uc744'), # reul, eul u'\uc740': (u'\ub294', u'\uc740'), # neun, eun u'\ub294': (u'\ub294', u'\uc740'), # neun, eun u'\uc774': (u'\uac00', u'\uc774'), # yi, ga u'\uac00': (u'\uac00', u'\uc774'), # yi, ga u'\uc640': (u'\uc640', u'\uacfc'), # wa, gwa u'\uacfc': (u'\uc640', u'\uacfc'), # wa, gwa } # Ida-Varitaion Suffixes IDA_SUFFIXES = { u'(\uc774)': (u'', u'\uc774'), # (yi)da u'(\uc785)': (17, u'\uc785'), # (ip)nida u'(\uc778)': (4, u'\uc778'), # (in)- } def join(codes): """ Join function which makes hangul syllable from jamos """ if len(codes) is not 3: raise UnicodeHangulError("needs 3-element tuple") if not codes[0] or not codes[1]: # single jamo return codes[0] or codes[1] r = ord(zone[0]) codes = codes[:] # simple copy :D for multiplier, codeset in splitters: r = r + multiplier*codeset.index(codes.pop(0)) return unichr(r) def split(code): """ Split function which splits hangul syllable into jamos """ if len(code) != 1 or not ishangul(code): raise UnicodeHangulError("needs 1 hangul letter") if code in Jaeum.Codes: return [code, Null, Null] if code in Moeum.Codes: return [Null, code, Null] code = ord(code) - ord(zone[0]) r = [] for divider, codeset in splitters: value, code = code / divider, code % divider r.append(codeset[value]) return r def dividestring(str, intoelements=0): if type(str) is not type(u''): raise UnicodeHangulError("needs unicode string") r = u'' for char in str: if ishangul(char): elems = split(char) for elem in elems: for htype in (Jaeum, Moeum, None): if htype == None: r += elem elif intoelements and \ htype.MultiElement.has_key(elem): r += u''.join(htype.MultiElement[elem]) break else: r += char return r def _has_final(c): # for internal use only if u'\uac00' <= c <= u'\ud7a3': # hangul return 1, (ord(c) - 0xac00) % 28 > 0 else: return 0, c in u'013678.bklmnptMN' def format(fmtstr, args): if not isinstance(args, dict): argget = iter(args).next else: argget = lambda:args obuff = [] ncur = escape = fmtinpth = 0 ofmt = fmt = u'' while ncur < len(fmtstr): c = fmtstr[ncur] if escape: obuff.append(c) escape = 0 ofmt = u'' elif c == u'\\': escape = 1 elif fmt: fmt += c if not fmtinpth and c.isalpha(): ofmt = fmt % argget() obuff.append(ofmt) fmt = u'' elif fmtinpth and c == u')': fmtinpth = 0 elif c == u'(': fmtinpth = 1 elif c == u'%': obuff.append(u'%') elif c == u'%': fmt += c ofmt = u'' else: if ofmt and ALT_SUFFIXES.has_key(c): obuff.append(ALT_SUFFIXES[c][ _has_final(ofmt[-1])[1] and 1 or 0 ]) elif ofmt and IDA_SUFFIXES.has_key(fmtstr[ncur:ncur+3]): sel = IDA_SUFFIXES[fmtstr[ncur:ncur+3]] ishan, hasfinal = _has_final(ofmt[-1]) if hasfinal: obuff.append(sel[1]) elif ishan: if sel[0]: obuff[-1] = obuff[-1][:-1] + unichr(ord(ofmt[-1]) + sel[0]) else: obuff.append(sel[0] and sel[1]) ncur += 2 else: obuff.append(c) ofmt = u'' ncur += 1 return u''.join(obuff) if __name__ == '__main__': print ( join([Jaeum.P, Moeum.EO, Null]) + \ join([Jaeum.K, Moeum.I, Null]) + \ join([Jaeum.JJ, Moeum.A, Jaeum.NG]) ).encode("utf-8") while 1: code = raw_input(">>> ") print dividestring(unicode(code, "utf-8"), 1).encode("utf-8") |
From: Chang <pe...@us...> - 2002-04-24 03:36:03
|
perky 02/04/23 20:36:00 Modified: korean hangul.py Log: - Move hangul python implementation into python/ - Added hangul.format, the hangul adaptive formatter Revision Changes Path 1.3 +4 -165 KoreanCodecs/korean/hangul.py Index: hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/hangul.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- hangul.py 8 Apr 2002 12:41:41 -0000 1.2 +++ hangul.py 24 Apr 2002 03:36:00 -0000 1.3 @@ -1,165 +1,4 @@ -#!/usr/local/bin/python -# ex:ts=4 -# -# Unicode hangul abstractive controller -# -# written by Hye-Shik Chang <pe...@fa...> -# -# Unicode Hangul Code-Area Specifications: -# http://www.unicode.org/charts/PDF/UAC00.pdf -# -# Jamo Short Name property confirms to sections 3.1 and 4.4 of Unicode 3.2.0 -# ftp://ftp.unicode.org/Public/UNIDATA/Jamo.txt -# -# ---------------------------------------------------------------------------- -# "THE BEER-WARE LICENSE" (Revision 42): -# <pe...@fa...> wrote this file. As long as you retain this notice you -# can do whatever you want with this stuff. If we meet some day, and you think -# this stuff is worth it, you can buy me a beer in return. Hye-Shik Chang -# ---------------------------------------------------------------------------- -# -# $LinuxKorea: UnicodeHangul.py,v 1.3 2001/08/04 05:51:29 perky Exp $' -# $Id: hangul.py,v 1.2 2002/04/08 12:41:41 perky Exp $ -# - -class UnicodeHangulError(Exception): - - def __init__ (self, msg): - self.msg = msg - - def __repr__ (self): - return self.msg - - __str__ = __repr__ - -Null = u'' - -class Jaeum: # XXX: 1100-1159 Old Jaeum need? - - Codes = (u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136', - # G GG GS N NJ NH - u'\u3137', u'\u3138', u'\u3139', u'\u313a', u'\u313b', u'\u313c', - # D DD L LG LM LB - u'\u313d', u'\u313e', u'\u313f', u'\u3140', u'\u3141', u'\u3142', - # LS LT LP LH M B - u'\u3143', u'\u3144', u'\u3145', u'\u3146', u'\u3147', u'\u3148', - # BB BS S SS NG J - u'\u3149', u'\u314a', u'\u314b', u'\u314c', u'\u314d', u'\u314e') - # JJ C K T P H - Width = len(Codes) - G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B, \ - BB, BS, S, SS, NG, J, JJ, C, K, T, P, H = Codes - Chosung = [G, GG, N, D, DD, L, M, B, BB, S, SS, NG, J, JJ, C, K, T, P, H] - Jongsung = [Null, G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT, \ - LP, LH, M, B, BS, S, SS, NG, J, C, K, T, P, H] - MultiElement = { - GG: (G, G), GS: (G, S), NJ: (N, J), NH: (N, H), DD: (D, D), - LG: (L, G), LM: (L, M), LB: (L, B), LS: (L, S), LT: (L, T), - LP: (L, P), LH: (L, H), BB: (B, B), BS: (B, S), SS: (S, S), - JJ: (J, J) - } - - -class Moeum: # XXX: 1161-117f Old Moeum need? - - Codes = (u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154', - # A AE YA YAE EO E - u'\u3155', u'\u3156', u'\u3157', u'\u3158', u'\u3159', u'\u315a', - # YEO YE O WA WAE OE - u'\u315b', u'\u315c', u'\u315d', u'\u315e', u'\u315f', u'\u3160', - # YO U WEO WE WI YU - u'\u3161', u'\u3162', u'\u3163') - # EU YI I - Width = len(Codes) - A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO, \ - U, WEO, WE, WI, YU, EU, YI, I = Codes - Jungsung = list(Codes) - MultiElement = { - AE: (A, I), YAE: (YA, I), YE: (YEO, I), WA: (O, A), WAE: (O, A, I), - OE: (O, I), WEO: (U, EO), WE: (U, E), WI: (U, I), YI: (EU, I) - } - - -# Aliases for your convinience -Chosung = Jaeum.Chosung -Jungsung = Moeum.Jungsung -Jongsung = Jaeum.Jongsung - -isJaeum = lambda c: c in Jaeum.Codes -isMoeum = lambda c: c in Moeum.Codes - -# Unicode Hangul Syllables Characteristics -zone = (u'\uAC00', u'\uD7A3') -splitters = [ ( len(Jongsung)*len(Jungsung), Chosung ), - ( len(Jongsung), Jungsung ), - ( 1, Jongsung ) ] - -ishangul = ( - lambda code: - zone[0] <= code <= zone[1] or - code in Jaeum.Codes or - code in Moeum.Codes -) - -def join(codes): - """ Join function which makes hangul syllable from jamos """ - if len(codes) is not 3: - raise UnicodeHangulError("needs 3-element tuple") - if not codes[0] or not codes[1]: # single jamo - return codes[0] or codes[1] - - r = ord(zone[0]) - codes = codes[:] # simple copy :D - for multiplier, codeset in splitters: - r = r + multiplier*codeset.index(codes.pop(0)) - - return unichr(r) - -def split(code): - """ Split function which splits hangul syllable into jamos """ - if len(code) != 1 or not ishangul(code): - raise UnicodeHangulError("needs 1 hangul letter") - if code in Jaeum.Codes: - return [code, Null, Null] - if code in Moeum.Codes: - return [Null, code, Null] - - code = ord(code) - ord(zone[0]) - r = [] - for divider, codeset in splitters: - value, code = code / divider, code % divider - r.append(codeset[value]) - return r - -def dividestring(str, intoelements=0): - if type(str) is not type(u''): - raise UnicodeHangulError("needs unicode string") - - r = u'' - for char in str: - if ishangul(char): - elems = split(char) - for elem in elems: - for htype in (Jaeum, Moeum, None): - if htype == None: - r += elem - elif intoelements and \ - htype.MultiElement.has_key(elem): - r += u''.join(htype.MultiElement[elem]) - break - else: - r += char - - return r - - -if __name__ == '__main__': - - print ( join([Jaeum.P, Moeum.EO, Null]) + \ - join([Jaeum.K, Moeum.I, Null]) + \ - join([Jaeum.JJ, Moeum.A, Jaeum.NG]) ).encode("utf-8") - - while 1: - code = raw_input(">>> ") - print dividestring(unicode(code, "utf-8"), 1).encode("utf-8") - +try: + from korean.c.hangul import * +except: + from korean.python.hangul import * |