koco-cvs Mailing List for Python Korean Codecs (Page 14)
Brought to you by: perky
From: Hye-Shik C. <pe...@us...> - 2003-01-13 07:25:27
perky    03/01/12 23:25:26

  Added:       korean error_callback.py
  Log:
  Add PEP293 emulation codes.

  Revision  Changes    Path
  1.1                  KoreanCodecs/korean/error_callback.py

Index: error_callback.py
===================================================================
#
# This file is part of KoreanCodecs.
#
# Copyright(C) 2002-2003 Hye-Shik Chang <pe...@Fr...>.
#
# KoreanCodecs is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# KoreanCodecs is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with KoreanCodecs; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id: error_callback.py,v 1.1 2003/01/13 07:25:26 perky Exp $
#

try:
    from __builtin__ import UnicodeEncodeError, UnicodeDecodeError
    from codecs import lookup_error
except:
    # implementations from PEP293 for ancient systems

    class UnicodeEncodeError(UnicodeError):
        def __init__(self, encoding, object, start, end, reason):
            UnicodeError.__init__(self, "encoding '%s' can't encode characters " +
                "in positions %d-%d: %s" % (encoding, start, end-1, reason))
            self.encoding = encoding
            self.object = object
            self.start = start
            self.end = end
            self.reason = reason

    class UnicodeDecodeError(UnicodeError):
        def __init__(self, encoding, object, start, end, reason):
            UnicodeError.__init__(self, "encoding '%s' can't decode characters " +
                "in positions %d-%d: %s" % (encoding, start, end-1, reason))
            self.encoding = encoding
            self.object = object
            self.start = start
            self.end = end
            self.reason = reason

    def errcb_strict(exc):
        raise exc

    def errcb_ignore(exc):
        if isinstance(exc, UnicodeError):
            return (u"", exc.end)
        else:
            raise TypeError("can't handle %s" % exc.__name__)

    def errcb_replace(exc):
        if isinstance(exc, UnicodeEncodeError):
            return ((exc.end-exc.start)*u"?", exc.end)
        elif isinstance(exc, UnicodeDecodeError):
            return (u"\ufffd", exc.end)
        else:
            raise TypeError("can't handle %s" % exc.__name__)

    error_callbacks = {
        'strict': errcb_strict,
        'ignore': errcb_ignore,
        'replace': errcb_replace,
    }

    def lookup_error(name):
        cb = error_callbacks.get(name)
        if cb:
            return cb
        else:
            raise LookupError, "unknown error handler name '%s'" % name

# ex: ts=8 sts=4 et
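For context, a minimal usage sketch (not from the commit) of how a PEP 293 handler with the same shape as errcb_replace above is wired in on Python 2.3+, where codecs.register_error and codecs.lookup_error are built in; the handler name 'hexreplace' and the sample bytes are invented for illustration:

    import codecs

    def errcb_hex(exc):
        # same return shape as errcb_replace above: (replacement, resume position)
        if isinstance(exc, UnicodeDecodeError):
            bad = exc.object[exc.start:exc.end]
            return (u''.join([u'\\x%02x' % ord(b) for b in bad]), exc.end)
        raise TypeError("can't handle %s" % exc.__class__.__name__)

    codecs.register_error('hexreplace', errcb_hex)    # hypothetical handler name
    assert codecs.lookup_error('hexreplace') is errcb_hex

    print 'sp\xc4m'.decode('ascii', 'hexreplace')     # prints: sp\xc4m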
From: Hye-Shik C. <pe...@us...> - 2003-01-13 00:23:25
perky 03/01/12 16:23:24 Modified: . README.en README.ko Log: python codecs is removed now. Revision Changes Path 1.29 +1 -3 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.28 retrieving revision 1.29 diff -u -r1.28 -r1.29 --- README.en 12 Jan 2003 23:46:35 -0000 1.28 +++ README.en 13 Jan 2003 00:23:24 -0000 1.29 @@ -2,7 +2,7 @@ ============================ Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.en,v 1.28 2003/01/12 23:46:35 perky Exp $ +$Id: README.en,v 1.29 2003/01/13 00:23:24 perky Exp $ Introduction @@ -99,8 +99,6 @@ o Version 2.1.0 - January 2003 - As a result of refactoring wansung encoders, C codec consumes about 1/3 of memory than before. - - Fixed a bug that korean.python.cp949 codec taints korean.python.euc_kr - codec's decoding and encoding mappings on its loading time. - Added MacKorean codec which is used by MacOS 7 and above. - Reimplemented ISO-2022-KR codec and it can handle ksc5601 designated on G0 area, now. (MULE Compatible) 1.27 +1 -3 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.26 retrieving revision 1.27 diff -u -r1.26 -r1.27 --- README.ko 12 Jan 2003 23:46:36 -0000 1.26 +++ README.ko 13 Jan 2003 00:23:24 -0000 1.27 @@ -2,7 +2,7 @@ ===================== Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.ko,v 1.26 2003/01/12 23:46:36 perky Exp $ +$Id: README.ko,v 1.27 2003/01/13 00:23:24 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ @@ -103,8 +103,6 @@ o ¹öÁ¯ 2.1.0 2003³â 1¿ù - ³»ºÎ ¿Ï¼ºÇü ÀÎÄÚ´õÀÇ ÃÖÀûÈ·Î C ÄÚµ¦ÀÇ ¸Þ¸ð¸® ¼Òºñ°¡ 1/3 Á¤µµ·Î ÁÙ¾î µé¾ú½À´Ï´Ù. - - korean.python.cp949 ÄÚµ¦ÀÌ ·ÎµåµÇ¸é¼ korean.python.euc_kr ÀÇ ¸ÅÇο¡ - cp949¸¦ Ãß°¡Çعö¸®´Â ¹ö±×°¡ ¼öÁ¤µÇ¾ú½À´Ï´Ù. - MacOS 7 À̻󿡼 »ç¿ëµÇ´Â MacKorean ÄÚµ¦ÀÌ Ãß°¡µÇ¾ú½À´Ï´Ù. - »õ·Î ±¸ÇöµÈ ISO-2022-KR ÄÚµ¦Àº G0 ¿µ¿ª¿¡ ÁöÁ¤µÈ KSC5601µµ ´Ù·ê ¼ö ÀÖ°Ô µÇ¾ú½À´Ï´Ù. (MULE ȣȯ) |
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:46:37
perky 03/01/12 15:46:36 Modified: . README.en README.ko setup.py Log: Announce this version as 2.1.0a1 Revision Changes Path 1.28 +3 -3 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.27 retrieving revision 1.28 diff -u -r1.27 -r1.28 --- README.en 12 Jan 2003 23:04:55 -0000 1.27 +++ README.en 12 Jan 2003 23:46:35 -0000 1.28 @@ -1,8 +1,8 @@ -KoreanCodecs version 2.1.0dev -============================= +KoreanCodecs version 2.1.0a1 +============================ Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.en,v 1.27 2003/01/12 23:04:55 perky Exp $ +$Id: README.en,v 1.28 2003/01/12 23:46:35 perky Exp $ Introduction 1.26 +3 -3 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.25 retrieving revision 1.26 diff -u -r1.25 -r1.26 --- README.ko 12 Jan 2003 23:04:55 -0000 1.25 +++ README.ko 12 Jan 2003 23:46:36 -0000 1.26 @@ -1,8 +1,8 @@ -ÇѱÛÄÚµ¦ ¹öÁ¯ 2.1.0dev -====================== +ÇѱÛÄÚµ¦ ¹öÁ¯ 2.1.0a1 +===================== Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.ko,v 1.25 2003/01/12 23:04:55 perky Exp $ +$Id: README.ko,v 1.26 2003/01/12 23:46:36 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ 1.33 +2 -2 KoreanCodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/setup.py,v retrieving revision 1.32 retrieving revision 1.33 diff -u -r1.32 -r1.33 --- setup.py 12 Jan 2003 22:54:11 -0000 1.32 +++ setup.py 12 Jan 2003 23:46:36 -0000 1.33 @@ -18,7 +18,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: setup.py,v 1.32 2003/01/12 22:54:11 perky Exp $ +# $Id: setup.py,v 1.33 2003/01/12 23:46:36 perky Exp $ # import sys @@ -51,7 +51,7 @@ org_install_lib or self.install_purelib setup (name = "KoreanCodecs", - version = "2.1.0dev", + version = "2.1.0a1", description = "Korean Codecs for Python Unicode Support", long_description = "This package provides Unicode codecs that " "make Python aware of Korean character encodings such as " |
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:22:35
perky    03/01/12 15:22:35

  Removed:     korean/mappings ksc5601_hangul.py ksc5601_ideograph.py
               ksc5601_misc.py uhc.py
  Log:
  Remove ksc5601, uhc pure python mappings. We save 1000KB now! :)
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:22:33
perky 03/01/12 15:22:32 Modified: korean mac_korean.py Log: Remove ksc5601, uhc pure python mappings. We save 1000KB now! :) Revision Changes Path 1.2 +12 -48 KoreanCodecs/korean/mac_korean.py Index: mac_korean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/mac_korean.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- mac_korean.py 12 Jan 2003 23:04:56 -0000 1.1 +++ mac_korean.py 12 Jan 2003 23:22:32 -0000 1.2 @@ -17,15 +17,15 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: mac_korean.py,v 1.1 2003/01/12 23:04:56 perky Exp $ +# $Id: mac_korean.py,v 1.2 2003/01/12 23:22:32 perky Exp $ # import codecs -from korean.mappings import ksc5601_hangul, appleextension -encmap_hangul, decmap_hangul = ksc5601_hangul.encoding_map, ksc5601_hangul.decoding_map -encmap_apple, decmap_apple = appleextension.encoding_map, appleextension.decoding_map -encmap_ideo, decmap_ideo = {}, {} -encmap_misc, decmap_misc = {}, {} +from korean.mappings import appleextension +encmap_apple = appleextension.encoding_map +decmap_apple = appleextension.decoding_map + +KSC5601_CODEC = 'korean.euc-kr' class Codec(codecs.Codec): @@ -55,30 +55,10 @@ if c < u'\u0080': buffer.append(c.encode("ascii", errors)) - elif encmap_hangul.has_key(c): - buffer.append(encmap_hangul[c]) elif encmap_apple.has_key(c): buffer.append(encmap_apple[c]) else: - if not encmap_misc: - from korean.mappings import ksc5601_misc - encmap_misc = ksc5601_misc.encoding_map - if encmap_misc.has_key(c): - buffer.append(encmap_misc[c]) - continue - - if not encmap_ideo: - from korean.mappings import ksc5601_ideograph - encmap_ideo = ksc5601_ideograph.encoding_map - if encmap_ideo.has_key(c): - buffer.append(encmap_ideo[c]) - continue - - if errors == 'replace': - buffer.append('\xa1\xa1') - elif errors == 'strict': - raise UnicodeError, ("cannot map " - "\\u%04x to MacKorean") % ord(c) + buffer.append(c.encode(KSC5601_CODEC, errors)) return (''.join(buffer), len(data)) @@ -113,28 +93,11 @@ c = data[p:p+2] p += 2 if len(c) == 2: - if decmap_hangul.has_key(c): - buffer.append(decmap_hangul[c]) - continue - elif decmap_apple.has_key(c): + if decmap_apple.has_key(c): buffer.append(decmap_apple[c]) - continue - - if not decmap_misc: - from korean.mappings import ksc5601_misc - decmap_misc = ksc5601_misc.decoding_map - if decmap_misc.has_key(c): - buffer.append(decmap_misc[c]) - continue - - if not decmap_ideo: - from korean.mappings import ksc5601_ideograph - decmap_ideo = ksc5601_ideograph.decoding_map - if decmap_ideo.has_key(c): - buffer.append(decmap_ideo[c]) - continue - - if errors == 'replace': + else: + buffer.append(unicode(c, KSC5601_CODEC, errors)) + elif errors == 'replace': buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER elif errors == 'strict': raise UnicodeError, "unexpected byte 0x%s found" % ( @@ -192,6 +155,7 @@ buffer.append(data[end:pos+1]) end = pos+1 return buffer + def reset(self): self.data = '' |
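The diff above reduces mac_korean.py to a two-level lookup: try the Apple extension map first, then defer everything else to the wansung codec. A hedged sketch of that fallback pattern (the map entry below is a stand-in, and 'korean.euc-kr' assumes the KoreanCodecs package is installed):

    FALLBACK_CODEC = 'korean.euc-kr'          # KSC5601_CODEC in the real module
    encmap_apple = {u'\u2460': '\xa9\xa1'}    # hypothetical Apple-extension entry

    def encode_char(c, errors='strict'):
        if c < u'\u0080':
            return c.encode('ascii', errors)         # ASCII passes straight through
        elif c in encmap_apple:
            return encmap_apple[c]                   # Apple extension area first
        else:
            return c.encode(FALLBACK_CODEC, errors)  # fall back to KSC5601/EUC-KR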
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:12:49
perky 03/01/12 15:12:49 Added: tools generate_codec_mapping.py Log: Move src/tablegen.py to tools/generate_codec_mapping.py Revision Changes Path 1.1 KoreanCodecs/tools/generate_codec_mapping.py Index: generate_codec_mapping.py =================================================================== # # generate_codec_mapping.py - $Revision: 1.1 $ # # Code Table Generator # # Author: Hye-Shik Chang <pe...@Fr...> # Date : $Date: 2003/01/12 23:12:48 $ # # # This file is part of KoreanCodecs. # # KoreanCodecs is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published # by the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # KoreanCodecs is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # import time UNICODE_INVALID = "UNIINV," COPYRIGHT_HEADER = """\ /* * This file is part of KoreanCodecs. * * KoreanCodecs is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * KoreanCodecs is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public License * along with KoreanCodecs; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Generated by generate_codec_mapping.py on %s * $Id: generate_codec_mapping.py,v 1.1 2003/01/12 23:12:48 perky Exp $ */ """ % time.asctime(time.gmtime()) def tohex(s): return "\\x%02x\\x%02x" % tuple(map(ord, s)) def decodemapgen(fo, prefix, bottom, top, m, print_region=1, print_index=1): fmap = {} for oco, dco in m.items(): fmap.setdefault(ord(oco[0]), {}) fmap[ord(oco[0])][ord(oco[1])] = dco firstkeys = fmap.keys() firstkeys.sort() if print_region: print >> fo, """\ #define %s_bottom %d #define %s_top %d """ % (prefix, bottom, prefix, top) for fk in firstkeys: seckeys = fmap[fk].keys() seckeys.sort() print >> fo, "static const Py_UNICODE %s_%02X[%d] = { /* %02X::%02X-%02X */" \ % (prefix, fk, top-bottom+1, fk, bottom, top) seckeys = range(bottom, top+1) while seckeys: dp = seckeys[:8] del seckeys[:8] print >> fo, " ", ' '.join([ fmap[fk].has_key(i) and ("0x%04x," % ord(fmap[fk][i])) or UNICODE_INVALID for i in dp ]) print >> fo, "};" print >> fo if print_index: decmapindex(fo, prefix, fmap) def decmapindex(fo, prefix, fmap): print >> fo, "static const Py_UNICODE *%s_map[128] = {" % (prefix) for i in range(128, 256): if fmap.has_key(i): print >> fo, " %s_%02X, /* 0x%02X */" % (prefix, i, i) else: print >> fo, " 0, /* 0x%02X */" % i print >> fo, "};" print >> fo def encodemapgen(fo, prefix, m, threshold): ecodes = m.keys() ecodes.sort() eblocks = [[-99999, -99999, {}]] for i in ecodes: if eblocks[-1][1] + threshold < ord(i): eblocks.append([ord(i), ord(i), {ord(i):m[i]}]) else: eblocks[-1][1] = ord(i) eblocks[-1][2][ord(i)] = m[i] blkcount = 0 for blk in eblocks[1:]: print >> fo, "static const DBYTECHAR %s_page%d[%d] = { /* 0x%04x - 0x%04x */" % \ (prefix, blkcount, blk[1]-blk[0]+1, blk[0], blk[1]) blkcount += 1 obl = range(blk[0], blk[1]+1) while obl: dp = obl[:8] del obl[:8] print >> fo, " ", " ".join([ blk[2].has_key(ok) and '0x%02x%02x,' % tuple(map(ord, blk[2][ok])) or "NOCHAR," for ok in dp ]) print >> fo, "};" print >> fo blkcount = 0 print >> fo, "#define _%s(uni) ( \\" % prefix for blk in eblocks[1:]: print >> fo, " uni >= 0x%04x && uni <= 0x%04x ? 
%s_page%d[uni-0x%04x] : \\" % ( blk[0], blk[1], prefix, blkcount, blk[0] ) blkcount += 1 print >> fo, " NOCHAR \\" print >> fo, ")" print >> fo def hintgen(fo, prefix, m): k = range(256) print >> fo, "static const char %s_hint[256] = {" % prefix while k: n = k[:16] del k[:16] print >> fo, " ", " ".join(['%d,' % m[i] for i in n]) print >> fo, "};" print >> fo ksc5601 = open("_koco_ksc5601.h", "w") print >> ksc5601, COPYRIGHT_HEADER # johab_ideograph ksc5601_hangul ksc5601_ideograph ksc5601_misc uhc.py from korean.mappings import ksc5601_hangul, ksc5601_ideograph, ksc5601_misc ksc5601_decoding = {} ksc5601_decoding.update(ksc5601_hangul.decoding_map) ksc5601_decoding.update(ksc5601_ideograph.decoding_map) ksc5601_decoding.update(ksc5601_misc.decoding_map) decodemapgen(ksc5601, "ksc5601_decode", 0xa1, 0xfe, ksc5601_decoding) print >> ksc5601 ksc5601_encoding = {} ksc5601_encoding.update(ksc5601_hangul.encoding_map) ksc5601_encoding.update(ksc5601_ideograph.encoding_map) ksc5601_encoding.update(ksc5601_misc.encoding_map) del ksc5601 del ksc5601_decoding, ksc5601_hangul, ksc5601_ideograph, ksc5601_misc uhctable = open("_koco_uhc.h", "w") print >> uhctable, COPYRIGHT_HEADER print >> uhctable, """\ #define uhc_page0_bottom 0x41 #define uhc_page0_top 0xfe #define uhc_page1_bottom 0x41 #define uhc_page1_top 0xa0 """ from korean.mappings import uhc uhcpage0 = {} uhcpage1 = {} fmap = {} # Hmm, I need dictionary comprehension.... for code, uni in uhc.decoding_map.items(): if code[0] <= '\xa0': # page 0 uhcpage0[code] = uni else: uhcpage1[code] = uni fmap[ord(code[0])] = None # just for index decodemapgen(uhctable, "uhc_decode", 0x41, 0xfe, uhcpage0, 0, 0) decodemapgen(uhctable, "uhc_decode", 0x41, 0xa0, uhcpage1, 0, 0) decmapindex(uhctable, "uhc_decode", fmap) hintarray = [] for i in range(256): if chr(i).isalpha() or 0x81 <= i <= 0xA0: hintarray.append(1) else: hintarray.append(0) hintgen(uhctable, "uhc_decode", hintarray) encmapfile = open("_koco_wansungenc.h", "w") print >> encmapfile, COPYRIGHT_HEADER ksc5601_encoding.update(uhc.encoding_map) encodemapgen(encmapfile, "wansung_encode", ksc5601_encoding, 512) # # $Id: generate_codec_mapping.py,v 1.1 2003/01/12 23:12:48 perky Exp $ # # -*- End-Of-File -*- |
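As a quick illustration of the generator above, decodemapgen() takes a mapping of two-byte strings to unicode characters and emits a C decode table. A toy call might look like the following; the prefix and mapping entries are only examples, and decodemapgen() is assumed to have been pasted into the session rather than imported, since importing the script runs its table generation at module level:

    import sys

    toy_map = {'\xa1\xa1': u'\u3000',   # example: 0xA1A1 -> IDEOGRAPHIC SPACE
               '\xa1\xa2': u'\u3001'}   # example: 0xA1A2 -> IDEOGRAPHIC COMMA

    # writes the bottom/top #defines, a Py_UNICODE demo_decode_A1[] row, and
    # the 128-slot pointer index to stdout
    decodemapgen(sys.stdout, "demo_decode", 0xa1, 0xfe, toy_map)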
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:12:49
perky    03/01/12 15:12:48

  Removed:     src tablegen.py
  Log:
  Move src/tablegen.py to tools/generate_codec_mapping.py
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:04:57
perky 03/01/12 15:04:56 Modified: test test_mackorean.py Log: Rename mackorean codec to mac_korean according to python standard codec's name convention Revision Changes Path 1.6 +2 -2 KoreanCodecs/test/test_mackorean.py Index: test_mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_mackorean.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- test_mackorean.py 12 Jan 2003 22:54:13 -0000 1.5 +++ test_mackorean.py 12 Jan 2003 23:04:56 -0000 1.6 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_mackorean.py,v 1.5 2003/01/12 22:54:13 perky Exp $ +# $Id: test_mackorean.py,v 1.6 2003/01/12 23:04:56 perky Exp $ # import CodecTestBase @@ -25,7 +25,7 @@ return u''.join(map(unichr, map(eval, s.split('+')))) class TestMacKorean(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): - encoding = 'korean.mackorean' + encoding = 'korean.mac_korean' textfile_chunk = ('texts/mackorean', 'texts/mackorean.utf-8') errortests = ( # invalid bytes |
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:04:56
perky 03/01/12 15:04:56 Modified: korean aliases.py Added: korean mac_korean.py Removed: korean mackorean.py Log: Rename mackorean codec to mac_korean according to python standard codec's name convention Revision Changes Path 1.11 +3 -3 KoreanCodecs/korean/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/aliases.py,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- aliases.py 10 Jan 2003 06:50:44 -0000 1.10 +++ aliases.py 12 Jan 2003 23:04:56 -0000 1.11 @@ -17,7 +17,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: aliases.py,v 1.10 2003/01/10 06:50:44 perky Exp $ +# $Id: aliases.py,v 1.11 2003/01/12 23:04:56 perky Exp $ # import encodings.aliases @@ -41,6 +41,6 @@ 'qwerty2bul': 'korean.qwerty2bul', 'unijohab': 'korean.unijohab', 'macjohab': 'korean.unijohab', - 'mackorean': 'korean.mackorean', - 'macwansung': 'korean.mackorean', + 'mackorean': 'korean.mac_korean', + 'mac_korean': 'korean.mac_korean', }) 1.1 KoreanCodecs/korean/mac_korean.py Index: mac_korean.py =================================================================== # # This file is part of KoreanCodecs. # # Copyright(C) 2002-2003 Hye-Shik Chang <pe...@Fr...>. # # KoreanCodecs is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published # by the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # KoreanCodecs is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public License # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # $Id: mac_korean.py,v 1.1 2003/01/12 23:04:56 perky Exp $ # import codecs from korean.mappings import ksc5601_hangul, appleextension encmap_hangul, decmap_hangul = ksc5601_hangul.encoding_map, ksc5601_hangul.decoding_map encmap_apple, decmap_apple = appleextension.encoding_map, appleextension.decoding_map encmap_ideo, decmap_ideo = {}, {} encmap_misc, decmap_misc = {}, {} class Codec(codecs.Codec): # Unicode to character buffer def encode(self, data, errors='strict'): global encmap_ideo, encmap_misc if errors not in ('strict', 'ignore', 'replace'): raise ValueError, "unknown error handling" buffer = [] p = 0 size = len(data) while p < size: aemap = appleextension.multilevel_encmap relp = 0 while p + relp < size and aemap.has_key(data[p + relp]): aemap = aemap[data[p + relp]] relp += 1 if aemap.has_key(None): buffer.append(aemap[None]) p += relp continue c = data[p] p += 1 if c < u'\u0080': buffer.append(c.encode("ascii", errors)) elif encmap_hangul.has_key(c): buffer.append(encmap_hangul[c]) elif encmap_apple.has_key(c): buffer.append(encmap_apple[c]) else: if not encmap_misc: from korean.mappings import ksc5601_misc encmap_misc = ksc5601_misc.encoding_map if encmap_misc.has_key(c): buffer.append(encmap_misc[c]) continue if not encmap_ideo: from korean.mappings import ksc5601_ideograph encmap_ideo = ksc5601_ideograph.encoding_map if encmap_ideo.has_key(c): buffer.append(encmap_ideo[c]) continue if errors == 'replace': buffer.append('\xa1\xa1') elif errors == 'strict': raise UnicodeError, ("cannot map " "\\u%04x to MacKorean") % ord(c) return (''.join(buffer), len(data)) # character buffer to Unicode def decode(self, data, errors='strict'): global decmap_ideo, decmap_misc if errors not in ('strict', 'ignore', 'replace'): raise ValueError, "unknown error handling" buffer = [] data = str(data) # character buffer compatible object size = len(data) p = 0 while p < size: if data[p] < '\x80': buffer.append(unicode(data[p], "ascii", errors)) p += 1 elif data[p] <= '\xa0' or data[p] == '\xff': if decmap_apple.has_key(data[p]): buffer.append(decmap_apple[data[p]]) p += 1 continue if errors == 'replace': buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER elif errors == 'strict': raise UnicodeError, "unexpected byte %s found" % ( hex(ord(data[p]))) p += 1 else: c = data[p:p+2] p += 2 if len(c) == 2: if decmap_hangul.has_key(c): buffer.append(decmap_hangul[c]) continue elif decmap_apple.has_key(c): buffer.append(decmap_apple[c]) continue if not decmap_misc: from korean.mappings import ksc5601_misc decmap_misc = ksc5601_misc.decoding_map if decmap_misc.has_key(c): buffer.append(decmap_misc[c]) continue if not decmap_ideo: from korean.mappings import ksc5601_ideograph decmap_ideo = ksc5601_ideograph.decoding_map if decmap_ideo.has_key(c): buffer.append(decmap_ideo[c]) continue if errors == 'replace': buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER elif errors == 'strict': raise UnicodeError, "unexpected byte 0x%s found" % ( ''.join(["%02x"%ord(x) for x in c]) ) return (u''.join(buffer), size) class StreamWriter(Codec, codecs.StreamWriter): pass class StreamReader(Codec, codecs.StreamReader): def __init__(self, stream, errors='strict'): codecs.StreamReader.__init__(self, stream, errors) self.data = '' def _read(self, func, size): if size == 0: return u'' if size is None or size < 0: data = 
self.data + func() self.data = '' else: data = self.data + func(max(size, 2) - len(self.data)) size = len(data) p = 0 while p < size: if data[p] < "\xa1" or data[p] == "\xff": p = p + 1 elif p + 2 <= size: p = p + 2 else: break data, self.data = data[:p], data[p:] return self.decode(data)[0] def read(self, size=-1): return self._read(self.stream.read, size) def readline(self, size=-1): return self._read(self.stream.readline, size) def readlines(self, size=-1): data = self._read(self.stream.read, size) buffer = [] end = 0 while 1: pos = data.find(u'\n', end) if pos < 0: if end < len(data): buffer.append(data[end:]) break buffer.append(data[end:pos+1]) end = pos+1 return buffer def reset(self): self.data = '' def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) # ex: ts=8 sts=4 et |
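The getregentry() function at the end of mac_korean.py is the standard Python 2 codec entry point. How KoreanCodecs itself registers these modules is not shown in this message, so the search function below is only a guessed sketch of one way such a module could be hooked into the codec registry; the round-trip value comes from the test expectations later in this list:

    import codecs
    from korean import mac_korean      # assumes the KoreanCodecs package is installed

    def _search(name):
        # minimal search function: hand back the module's
        # (encoder, decoder, StreamReader, StreamWriter) tuple for one name
        if name == 'korean.mac_korean':
            return mac_korean.getregentry()
        return None

    codecs.register(_search)
    print repr(u'\uc894'.encode('korean.mac_korean'))   # -> '\xc1\xc4'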
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:04:56
perky 03/01/12 15:04:55 Modified: . README.en README.ko Log: Rename mackorean codec to mac_korean according to python standard codec's name convention Revision Changes Path 1.27 +3 -3 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.26 retrieving revision 1.27 diff -u -r1.26 -r1.27 --- README.en 12 Jan 2003 22:54:11 -0000 1.26 +++ README.en 12 Jan 2003 23:04:55 -0000 1.27 @@ -2,7 +2,7 @@ ============================= Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.en,v 1.26 2003/01/12 22:54:11 perky Exp $ +$Id: README.en,v 1.27 2003/01/12 23:04:55 perky Exp $ Introduction @@ -53,7 +53,7 @@ "korean.euc-kr" "korean.cp949" - "korean.mackorean" + "korean.mac_korean" "korean.johab" "korean.iso-2022-kr" @@ -67,7 +67,7 @@ o Wansung - korean.euc-kr - korean.cp949 - - korean.mackorean + - korean.mac_korean o Johab - korean.johab 1.25 +3 -3 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.24 retrieving revision 1.25 diff -u -r1.24 -r1.25 --- README.ko 12 Jan 2003 22:54:11 -0000 1.24 +++ README.ko 12 Jan 2003 23:04:55 -0000 1.25 @@ -2,7 +2,7 @@ ====================== Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.ko,v 1.24 2003/01/12 22:54:11 perky Exp $ +$Id: README.ko,v 1.25 2003/01/12 23:04:55 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ @@ -56,7 +56,7 @@ "korean.euc-kr" "korean.cp949" - "korean.mackorean" + "korean.mac_korean" "korean.johab" "korean.iso-2022-kr" @@ -69,7 +69,7 @@ o ¿Ï¼ºÇü - korean.euc-kr : KS5601 ¿Ï¼ºÇü - korean.cp949 : Microsoft È®Àå¿Ï¼ºÇü - - korean.mackorean : Apple È®Àå¿Ï¼ºÇü + - korean.mac_korean : Apple È®Àå¿Ï¼ºÇü o Á¶ÇÕÇü - korean.johab : »ó¿ë Á¶ÇÕÇü (8ºñÆ®) |
From: Hye-Shik C. <pe...@us...> - 2003-01-12 23:01:36
perky 03/01/12 15:01:35 Modified: korean iso_2022_kr.py Log: Style fix: Use global constant, remove useless comments. Revision Changes Path 1.6 +6 -5 KoreanCodecs/korean/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/iso_2022_kr.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- iso_2022_kr.py 12 Jan 2003 22:54:12 -0000 1.5 +++ iso_2022_kr.py 12 Jan 2003 23:01:34 -0000 1.6 @@ -17,7 +17,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: iso_2022_kr.py,v 1.5 2003/01/12 22:54:12 perky Exp $ +# $Id: iso_2022_kr.py,v 1.6 2003/01/12 23:01:34 perky Exp $ # import codecs @@ -48,10 +48,12 @@ def encode(self, data, errors='strict'): if errors not in ('strict', 'ignore', 'replace'): raise ValueError, "unknown error handling" + buffer = [] designation = [US_ASCII, US_ASCII] new_designation = designation[:] new_shiftstate = shiftstate = 0 + for c in data: if c in ('\n', '\r'): new_shiftstate = 0 @@ -63,7 +65,7 @@ else: new_shiftstate = 1 new_designation[1] = KSC5601_1987 - s = c.encode('korean.euc_kr', errors) + s = c.encode(KSC5601_CODEC, errors) if designation[0] != new_designation[0]: buffer.append(DESIGNATION_MARK[(G0, new_designation[0])]) @@ -144,15 +146,14 @@ return (u''.join(buffer), len(data)) + class StreamWriter(Codec, codecs.StreamWriter): pass + class StreamReader(Codec, codecs.StreamReader): pass - # not implemented. - # (JapaneseCodecs's implementation is so different to adopt.) -### encodings module API def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) |
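Following the encoder logic above (DESIGNATION_MARK, SO/SI, and the high-bit masking), a single Hangul syllable comes out wrapped in ISO-2022 framing. A small worked example, using the EUC-KR value 0xC1C4 for u'\uc894' that the test suite elsewhere in this list relies on; the exact byte string is derived from the code shown, not from a spec, and assumes KoreanCodecs is installed:

    # ESC $ ) C : designate KSC5601-1987 to G1
    # SO        : shift out to G1
    # 'A' 'D'   : the EUC-KR bytes 0xC1 0xC4 with the high bit cleared
    # SI        : shift back in at end of output
    expected = '\x1b$)C' + '\x0e' + 'AD' + '\x0f'
    assert u'\uc894'.encode('korean.iso-2022-kr') == expected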
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:57:21
perky 03/01/12 14:57:20 Modified: korean qwerty2bul.py Log: Style fix: respect terminal with 80 columns. :) Revision Changes Path 1.6 +24 -20 KoreanCodecs/korean/qwerty2bul.py Index: qwerty2bul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/qwerty2bul.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- qwerty2bul.py 12 Jan 2003 22:54:12 -0000 1.5 +++ qwerty2bul.py 12 Jan 2003 22:57:19 -0000 1.6 @@ -17,7 +17,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: qwerty2bul.py,v 1.5 2003/01/12 22:54:12 perky Exp $ +# $Id: qwerty2bul.py,v 1.6 2003/01/12 22:57:19 perky Exp $ # import codecs @@ -25,29 +25,29 @@ from korean.hangul import ishangul, join, split, isJaeum, isMoeum codekeymap = { - Jaeum.G: 'r', Jaeum.GG: 'R', Jaeum.GS: 'rt', - Jaeum.N: 's', Jaeum.NJ:'sw', Jaeum.NH: 'sg', Jaeum.D: 'e', - Jaeum.DD:'E', Jaeum.L: 'f', Jaeum.LG: 'fr', Jaeum.LM: 'fa', - Jaeum.LB:'fq', Jaeum.LS:'ft', Jaeum.LT: 'fx', Jaeum.LP: 'fv', - Jaeum.LH:'fg', Jaeum.M: 'a', Jaeum.B: 'q', Jaeum.BB: 'Q', - Jaeum.BS:'qt', Jaeum.S: 't', Jaeum.SS: 'T', Jaeum.NG: 'd', - Jaeum.J: 'w', Jaeum.JJ:'W', Jaeum.C: 'c', Jaeum.K: 'z', - Jaeum.T: 'x', Jaeum.P: 'v', Jaeum.H: 'g', - - Moeum.A: 'k', Moeum.AE:'o', Moeum.YA: 'i', Moeum.YAE:'O', - Moeum.EO:'j', Moeum.E: 'p', Moeum.YEO:'u', Moeum.YE: 'P', - Moeum.O: 'h', Moeum.WA:'hk', Moeum.WAE:'ho', Moeum.OE: 'hl', - Moeum.YO:'y', Moeum.U: 'n', Moeum.WEO:'nj', Moeum.WE: 'np', - Moeum.WI:'nl', Moeum.YU:'b', Moeum.EU: 'm', Moeum.YI: 'ml', - Moeum.I: 'l', + Jaeum.G: 'r', Jaeum.GG:'R', Jaeum.GS: 'rt', + Jaeum.N: 's', Jaeum.NJ:'sw', Jaeum.NH: 'sg', Jaeum.D: 'e', + Jaeum.DD:'E', Jaeum.L: 'f', Jaeum.LG: 'fr', Jaeum.LM: 'fa', + Jaeum.LB:'fq', Jaeum.LS:'ft', Jaeum.LT: 'fx', Jaeum.LP: 'fv', + Jaeum.LH:'fg', Jaeum.M: 'a', Jaeum.B: 'q', Jaeum.BB: 'Q', + Jaeum.BS:'qt', Jaeum.S: 't', Jaeum.SS: 'T', Jaeum.NG: 'd', + Jaeum.J: 'w', Jaeum.JJ:'W', Jaeum.C: 'c', Jaeum.K: 'z', + Jaeum.T: 'x', Jaeum.P: 'v', Jaeum.H: 'g', + + Moeum.A: 'k', Moeum.AE:'o', Moeum.YA: 'i', Moeum.YAE:'O', + Moeum.EO:'j', Moeum.E: 'p', Moeum.YEO:'u', Moeum.YE: 'P', + Moeum.O: 'h', Moeum.WA:'hk', Moeum.WAE:'ho', Moeum.OE: 'hl', + Moeum.YO:'y', Moeum.U: 'n', Moeum.WEO:'nj', Moeum.WE: 'np', + Moeum.WI:'nl', Moeum.YU:'b', Moeum.EU: 'm', Moeum.YI: 'ml', + Moeum.I: 'l', - u'': '', + u'': '', } keycodemap = {} for k, v in codekeymap.items(): - keycodemap[v] = k - keycodemap.setdefault(v.upper(), k) + keycodemap[v] = k + keycodemap.setdefault(v.upper(), k) keycodes = ''.join(keycodemap.keys()) del k, v @@ -62,7 +62,9 @@ def pushcomp(self): if self.chosung and not self.jungsung: self.word_valid = 0 - self.word_comp.append(join([self.chosung, self.jungsung, self.jongsung])) + self.word_comp.append(join([ + self.chosung, self.jungsung, self.jongsung + ])) self.clearcomp() def clearcomp(self): @@ -196,3 +198,5 @@ def getregentry(): return (Codec().encode, Codec().decode, StreamReader, StreamWriter) + +# ex: ts=8 sts=4 et |
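Reading the codekeymap above (H→'g', A→'k', N→'s', G→'r', EU→'m', L→'f'), the codec turns Hangul syllables into the 2-beolsik keystrokes that would produce them. A small hedged example, assuming the package is installed so the codec name resolves:

    # U+D55C U+AE00 ("han-geul") split into jamo and mapped through codekeymap;
    # the decode direction runs the keystroke automaton the other way.
    assert u'\ud55c\uae00'.encode('korean.qwerty2bul') == 'gksrmf'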
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:14
perky 03/01/12 14:54:13 Modified: test CodecTestBase.py test_all.py test_cp949.py test_euc_kr.py test_hangul.py test_iso_2022_kr.py test_johab.py test_mackorean.py test_qwerty2bul.py test_unijohab.py Log: Remove selective framework for two implementations, 'C' and 'Python'. We'll maintain only 1 implementation from now. Accordingly, --with[out]-extension options is removed, too. Revision Changes Path 1.10 +6 -3 KoreanCodecs/test/CodecTestBase.py Index: CodecTestBase.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/CodecTestBase.py,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- CodecTestBase.py 10 Jan 2003 06:08:21 -0000 1.9 +++ CodecTestBase.py 12 Jan 2003 22:54:13 -0000 1.10 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: CodecTestBase.py,v 1.9 2003/01/10 06:08:21 perky Exp $ +# $Id: CodecTestBase.py,v 1.10 2003/01/12 22:54:13 perky Exp $ # import StringIO @@ -46,8 +46,11 @@ def setUp(self): if not self.textfile_chunk: - self.textfile_chunk = ('texts/' + self.encoding, - 'texts/%s.utf-8' % self.encoding) or self.textfile_stream + self.textfile_chunk = ('texts/' + + self.encoding.replace('korean.', ''), + 'texts/%s.utf-8' % + self.encoding.replace('korean.', '') + ) or self.textfile_stream if not self.textfile_stream: self.textfile_stream = self.textfile_chunk # checked above. :) 1.8 +5 -5 KoreanCodecs/test/test_all.py Index: test_all.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_all.py,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- test_all.py 10 Jan 2003 03:15:25 -0000 1.7 +++ test_all.py 12 Jan 2003 22:54:13 -0000 1.8 @@ -16,20 +16,20 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_all.py,v 1.7 2003/01/10 03:15:25 perky Exp $ +# $Id: test_all.py,v 1.8 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase -from test_cp949 import TestCP949_CExtension, TestCP949_PurePython -from test_euc_kr import TestEUCKR_CExtension, TestEUCKR_PurePython -from test_mackorean import TestMacKorean_PurePython +from test_cp949 import TestCP949 +from test_euc_kr import TestEUCKR +from test_mackorean import TestMacKorean from test_iso_2022_kr import TestISO_2022_KR from test_johab import TestJOHAB from test_qwerty2bul import TestQWERTY2BUL from test_unijohab import TestUNIJOHAB -from test_hangul import TestHangul_CExtension, TestHangul_PurePython +from test_hangul import TestHangul if __name__ == '__main__': CodecTestBase.main() 1.12 +40 -47 KoreanCodecs/test/test_cp949.py Index: test_cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_cp949.py,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- test_cp949.py 11 Jan 2003 15:01:56 -0000 1.11 +++ test_cp949.py 12 Jan 2003 22:54:13 -0000 1.12 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_cp949.py,v 1.11 2003/01/11 15:01:56 perky Exp $ +# $Id: test_cp949.py,v 1.12 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase @@ -24,53 +24,46 @@ def unichrs(s): return u''.join(map(unichr, map(eval, s.split('+')))) -class Shield: - class 
TestCP949Base(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): - encoding = 'cp949' - textfile_chunk = ('texts/cp949', 'texts/cp949.utf-8') - errortests = ( - # invalid bytes - ("abc\x80\x80\xc1\xc4", "strict", None), - ("abc\xc8", "strict", None), - ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"), - ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"), - ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"), - ) - - def test_mapping(self): - import sys, os - - if not os.access('CP949.TXT', os.R_OK): - sys.stdout.write('skipped -- CP949.TXT not found, download from' - ' http://www.unicode.org/Public/MAPPINGS' - '/VENDORS/MICSFT/WINDOWS/CP949.TXT ') - sys.stdout.flush() - return - - for line in open('CP949.TXT'): - if not line: - break - data = line.split('#')[0].strip().split() - if len(data) != 2: - continue - - cp949val = eval(data[0]) - if cp949val <= 0x7F: - cp949ch = chr(cp949val & 0xff) - elif cp949val >= 0x100: - cp949ch = chr(cp949val >> 8) + chr(cp949val & 0xff) - else: - continue - unich = unichrs(data[1]) +class TestCP949(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): + encoding = 'korean.cp949' + textfile_chunk = ('texts/cp949', 'texts/cp949.utf-8') + errortests = ( + # invalid bytes + ("abc\x80\x80\xc1\xc4", "strict", None), + ("abc\xc8", "strict", None), + ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"), + ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"), + ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"), + ) + + def test_mapping(self): + import sys, os + + if not os.access('CP949.TXT', os.R_OK): + sys.stdout.write('skipped -- CP949.TXT not found, download from' + ' http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/MICSFT/WINDOWS/CP949.TXT ') + sys.stdout.flush() + return + + for line in open('CP949.TXT'): + if not line: + break + data = line.split('#')[0].strip().split() + if len(data) != 2: + continue + + cp949val = eval(data[0]) + if cp949val <= 0x7F: + cp949ch = chr(cp949val & 0xff) + elif cp949val >= 0x100: + cp949ch = chr(cp949val >> 8) + chr(cp949val & 0xff) + else: + continue + unich = unichrs(data[1]) - self.assertEqual(unich.encode(self.encoding), cp949ch) - self.assertEqual(unicode(cp949ch, self.encoding), unich) - -class TestCP949_CExtension(Shield.TestCP949Base): - encoding = 'korean.c.cp949' - -class TestCP949_PurePython(Shield.TestCP949Base): - encoding = 'korean.python.cp949' + self.assertEqual(unich.encode(self.encoding), cp949ch) + self.assertEqual(unicode(cp949ch, self.encoding), unich) if __name__ == '__main__': CodecTestBase.main() 1.9 +18 -25 KoreanCodecs/test/test_euc_kr.py Index: test_euc_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_euc_kr.py,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- test_euc_kr.py 9 Jan 2003 23:23:31 -0000 1.8 +++ test_euc_kr.py 12 Jan 2003 22:54:13 -0000 1.9 @@ -16,37 +16,30 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_euc_kr.py,v 1.8 2003/01/09 23:23:31 perky Exp $ +# $Id: test_euc_kr.py,v 1.9 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase -class Shield: - class TestEUCKR_Base(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): - encoding = 'euc-kr' - textfile_chunk = ('texts/euc-kr', 'texts/euc-kr.utf-8') - errortests = ( - # invalid bytes - ("abc\x80\x80\xc1\xc4", "strict", None), - ("abc\xc8", "strict", None), - 
("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"), - ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"), - ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"), - ("\xc1\x64", "strict", None), # cp949 code H-AE-H - ) +class TestEUCKR(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): + encoding = 'korean.euc-kr' + textfile_chunk = ('texts/euc-kr', 'texts/euc-kr.utf-8') + errortests = ( + # invalid bytes + ("abc\x80\x80\xc1\xc4", "strict", None), + ("abc\xc8", "strict", None), + ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"), + ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"), + ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"), + ("\xc1\x64", "strict", None), # cp949 code H-AE-H + ) - def test_ksx1001_1998(self): - self.assertEqual(unicode('\xa2\xe6', self.encoding), u'\u20ac') - self.assertEqual(unicode('\xa2\xe7', self.encoding), u'\u00ae') - self.assertEqual(u'\u20ac'.encode(self.encoding), '\xa2\xe6') - self.assertEqual(u'\u00ae'.encode(self.encoding), '\xa2\xe7') + def test_ksx1001_1998(self): + self.assertEqual(unicode('\xa2\xe6', self.encoding), u'\u20ac') + self.assertEqual(unicode('\xa2\xe7', self.encoding), u'\u00ae') + self.assertEqual(u'\u20ac'.encode(self.encoding), '\xa2\xe6') + self.assertEqual(u'\u00ae'.encode(self.encoding), '\xa2\xe7') - -class TestEUCKR_CExtension(Shield.TestEUCKR_Base): - encoding = 'korean.c.euc-kr' - -class TestEUCKR_PurePython(Shield.TestEUCKR_Base): - encoding = 'korean.python.euc-kr' if __name__ == '__main__': CodecTestBase.main() 1.11 +38 -44 KoreanCodecs/test/test_hangul.py Index: test_hangul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_hangul.py,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- test_hangul.py 9 Jan 2003 21:31:44 -0000 1.10 +++ test_hangul.py 12 Jan 2003 22:54:13 -0000 1.11 @@ -16,94 +16,88 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_hangul.py,v 1.10 2003/01/09 21:31:44 perky Exp $ +# $Id: test_hangul.py,v 1.11 2003/01/12 22:54:13 perky Exp $ # import unittest +from korean import hangul -class Shield: - class TestHangul(unittest.TestCase): +class TestHangul(unittest.TestCase): def test_joinsplit(self): - self.assertEqual(self.h.join([self.h.J, self.h.WA, self.h.L]), u'\uc894') - self.assertEqual(self.h.join([self.h.JJ, self.h.Null, self.h.Null]), u'\u3149') - self.assertEqual(self.h.join((self.h.Null, self.h.YI, self.h.Null)), u'\u3162') - - self.assertEqual(self.h.split(u'\uc894'), (self.h.J, self.h.WA, self.h.L)) - self.assertEqual(self.h.split(u'\u3149'), (self.h.JJ, self.h.Null, self.h.Null)) - self.assertEqual(self.h.split(u'\u3162'), (self.h.Null, self.h.YI, self.h.Null)) + self.assertEqual(hangul.join([hangul.J, hangul.WA, hangul.L]), u'\uc894') + self.assertEqual(hangul.join([hangul.JJ, hangul.Null, hangul.Null]), u'\u3149') + self.assertEqual(hangul.join((hangul.Null, hangul.YI, hangul.Null)), u'\u3162') + + self.assertEqual(hangul.split(u'\uc894'), (hangul.J, hangul.WA, hangul.L)) + self.assertEqual(hangul.split(u'\u3149'), (hangul.JJ, hangul.Null, hangul.Null)) + self.assertEqual(hangul.split(u'\u3162'), (hangul.Null, hangul.YI, hangul.Null)) def test_basicspec(self): - self.assertEqual(self.h.isJaeum(self.h.J), 1) - self.assertEqual(self.h.isJaeum(self.h.E), 0) - self.assertEqual(self.h.isMoeum(self.h.L), 0) - self.assertEqual(self.h.isMoeum(self.h.O), 1) - 
self.assertEqual(self.h.ishangul(u'\uc870'), 1) - self.assertEqual(self.h.ishangul(u'\u382c'), 0) + self.assertEqual(hangul.isJaeum(hangul.J), 1) + self.assertEqual(hangul.isJaeum(hangul.E), 0) + self.assertEqual(hangul.isMoeum(hangul.L), 0) + self.assertEqual(hangul.isMoeum(hangul.O), 1) + self.assertEqual(hangul.ishangul(u'\uc870'), 1) + self.assertEqual(hangul.ishangul(u'\u382c'), 0) def test_testlong(self): - self.assertEqual(self.h.isJaeum(u'\u3131\u3134\u3137\u3139'), 1) - self.assertEqual(self.h.isJaeum(u'\u3131\u314f\u3134\u314f'), 0) - self.assertEqual(self.h.isJaeum(u''), 0) - - self.assertEqual(self.h.isMoeum(u'\u314f\u3151\u3153\u3155'), 1) - self.assertEqual(self.h.isMoeum(u'\u3131\u314f\u3134\u314f'), 0) - self.assertEqual(self.h.isMoeum(u''), 0) - - self.assertEqual(self.h.ishangul(u'\ud2f0\ud2f0\ub9c8\uc18c\uc774'), 1) - self.assertEqual(self.h.ishangul(u'\ud2f0\ud2f0\ub9c8 \uc18c\uc774'), 0) - self.assertEqual(self.h.ishangul(u''), 0) + self.assertEqual(hangul.isJaeum(u'\u3131\u3134\u3137\u3139'), 1) + self.assertEqual(hangul.isJaeum(u'\u3131\u314f\u3134\u314f'), 0) + self.assertEqual(hangul.isJaeum(u''), 0) + + self.assertEqual(hangul.isMoeum(u'\u314f\u3151\u3153\u3155'), 1) + self.assertEqual(hangul.isMoeum(u'\u3131\u314f\u3134\u314f'), 0) + self.assertEqual(hangul.isMoeum(u''), 0) + + self.assertEqual(hangul.ishangul(u'\ud2f0\ud2f0\ub9c8\uc18c\uc774'), 1) + self.assertEqual(hangul.ishangul(u'\ud2f0\ud2f0\ub9c8 \uc18c\uc774'), 0) + self.assertEqual(hangul.ishangul(u''), 0) def test_format_altsuffix(self): fmt = u'%s\ub294 %s\ub97c %s\ud55c\ub2e4.' obj1, obj2 = u'\ud61c\uc2dd', u'\uc544\ub77c' - self.assertEqual(self.h.format(fmt, obj1, obj2, u'\u2661'), + self.assertEqual(hangul.format(fmt, obj1, obj2, u'\u2661'), u'\ud61c\uc2dd\uc740 \uc544\ub77c\ub97c \u2661\ud55c\ub2e4.') - self.assertEqual(self.h.format(fmt, obj2, obj1, u'\uc2eb\uc5b4'), + self.assertEqual(hangul.format(fmt, obj2, obj1, u'\uc2eb\uc5b4'), u'\uc544\ub77c\ub294 \ud61c\uc2dd\uc744 \uc2eb\uc5b4\ud55c\ub2e4.') fmt = u'\ud0dc\ucd08\uc5d0 %s\uc640 %s\uac00 \uc788\uc5c8\ub2e4.' - self.assertEqual(self.h.format(fmt, obj1, obj2), + self.assertEqual(hangul.format(fmt, obj1, obj2), u'\ud0dc\ucd08\uc5d0 \ud61c\uc2dd\uacfc \uc544\ub77c\uac00' u' \uc788\uc5c8\ub2e4.') - self.assertEqual(self.h.format(fmt, obj2, obj1), + self.assertEqual(hangul.format(fmt, obj2, obj1), u'\ud0dc\ucd08\uc5d0 \uc544\ub77c\uc640 \ud61c\uc2dd\uc774' u' \uc788\uc5c8\ub2e4.') obj1, obj2 = u'Julian', u'Julie' - self.assertEqual(self.h.format(fmt, obj1, obj2), + self.assertEqual(hangul.format(fmt, obj1, obj2), u'\ud0dc\ucd08\uc5d0 Julian\uacfc Julie\uac00 \uc788\uc5c8\ub2e4.') - self.assertEqual(self.h.format(fmt, obj2, obj1), + self.assertEqual(hangul.format(fmt, obj2, obj1), u'\ud0dc\ucd08\uc5d0 Julie\uc640 Julian\uc774 \uc788\uc5c8\ub2e4.') def test_format_idasuffix(self): fmt = u'%s(\uc785)\ub2c8\ub2e4, %s(\uc778)\ub370, %s(\uc774)\ub2e4' - self.assertEqual(self.h.format(fmt, *(u'\uc18c\uc774',)*3), + self.assertEqual(hangul.format(fmt, *(u'\uc18c\uc774',)*3), u'\uc18c\uc785\ub2c8\ub2e4, \uc18c\uc778\ub370, \uc18c\uc774\ub2e4') - self.assertEqual(self.h.format(fmt, *(u'\ub2e4\ub155',)*3), + self.assertEqual(hangul.format(fmt, *(u'\ub2e4\ub155',)*3), u'\ub2e4\ub155\uc785\ub2c8\ub2e4, \ub2e4\ub155\uc778\ub370,' u' \ub2e4\ub155\uc774\ub2e4') def test_format_argtypes(self): fmt = u'%(int)d(\uc785)\ub2c8\ub2e4. 
%(str)s\uc740 %(str)s\uc5d0' \ u'%(float).2f\uc640' - self.assertEqual(self.h.format(fmt, int=1, str=u'hmm', float=3.14), + self.assertEqual(hangul.format(fmt, int=1, str=u'hmm', float=3.14), u'1\uc785\ub2c8\ub2e4. hmm\uc740 hmm\uc5d03.14\uc640') def test_conjoin(self): - self.assertEqual(self.h.conjoin(u'\u1112\u1161\u11ab\u1100\u1173\u11af\u110b\u1175' + self.assertEqual(hangul.conjoin(u'\u1112\u1161\u11ab\u1100\u1173\u11af\u110b\u1175' u' \u110c\u1169\u11c2\u110b\u1161\u110b\u116d.'), u'\ud55c\uae00\uc774 \uc88b\uc544\uc694.') def test_disjoint(self): - self.assertEqual(self.h.disjoint(u'\ub9c8\ub140\ubc30\ub2ec\ubd80 \ud0a4\ud0a4'), + self.assertEqual(hangul.disjoint(u'\ub9c8\ub140\ubc30\ub2ec\ubd80 \ud0a4\ud0a4'), u'\u1106\u1161\u1102\u1167\u1107\u1162\u1103\u1161\u11af\u1107\u116e' u' \u110f\u1175\u110f\u1175') - -class TestHangul_CExtension(Shield.TestHangul): - from korean.c import hangul as h - -class TestHangul_PurePython(Shield.TestHangul): - from korean.python import hangul as h if __name__ == '__main__': import sys 1.7 +6 -4 KoreanCodecs/test/test_iso_2022_kr.py Index: test_iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_iso_2022_kr.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- test_iso_2022_kr.py 10 Jan 2003 06:08:22 -0000 1.6 +++ test_iso_2022_kr.py 12 Jan 2003 22:54:13 -0000 1.7 @@ -16,15 +16,17 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_iso_2022_kr.py,v 1.6 2003/01/10 06:08:22 perky Exp $ +# $Id: test_iso_2022_kr.py,v 1.7 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase class TestISO_2022_KR(CodecTestBase.CodecTestBase): - encoding = 'iso-2022-kr' - textfile_chunk = ('texts/%s.roundrobin' % encoding, 'texts/%s.utf-8' % encoding) - textfile_stream = ('texts/%s.stream' % encoding, 'texts/%s.utf-8' % encoding) + encoding = 'korean.iso-2022-kr' + textfile_chunk = ('texts/iso-2022-kr.roundrobin', + 'texts/iso-2022-kr.utf-8') + textfile_stream = ('texts/iso-2022-kr.stream', + 'texts/iso-2022-kr.utf-8') roundtriptest = 0 errortests = ( 1.4 +2 -2 KoreanCodecs/test/test_johab.py Index: test_johab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_johab.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- test_johab.py 9 Jan 2003 21:31:44 -0000 1.3 +++ test_johab.py 12 Jan 2003 22:54:13 -0000 1.4 @@ -16,13 +16,13 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_johab.py,v 1.3 2003/01/09 21:31:44 perky Exp $ +# $Id: test_johab.py,v 1.4 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase class TestJOHAB(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): - encoding = 'johab' + encoding = 'korean.johab' errortests = ( # invalid bytes ("abc\x80\x80\xc1\xc4", "strict", None), 1.5 +43 -49 KoreanCodecs/test/test_mackorean.py Index: test_mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_mackorean.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_mackorean.py 11 Jan 2003 15:01:56 -0000 1.4 +++ test_mackorean.py 12 Jan 2003 22:54:13 -0000 1.5 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple 
Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_mackorean.py,v 1.4 2003/01/11 15:01:56 perky Exp $ +# $Id: test_mackorean.py,v 1.5 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase @@ -24,56 +24,50 @@ def unichrs(s): return u''.join(map(unichr, map(eval, s.split('+')))) -class Shield: - class TestMacKorean_Base(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): - encoding = 'mackorean' - textfile_chunk = ('texts/mackorean', 'texts/mackorean.utf-8') - errortests = ( - # invalid bytes - # \x90 is expected as one byte character in MacKorean. - ("abc\x90\x90\xc1\xc4", "strict", None), - ("abc\xc8", "strict", None), - ("abc\x90\x90\xc1\xc4", "replace", u"abc\ufffd\ufffd\uc894"), - ("abc\x90\x90\xc1\xc4\xc8", "replace", u"abc\ufffd\ufffd\uc894\ufffd"), - ("abc\x90\x90\xc1\xc4", "ignore", u"abc\uc894"), - ("\xc1\x64", "strict", None), # cp949 code H-AE-H - ) - - def test_mapping(self): - import os, sys - - if not os.access('KOREAN.TXT', os.R_OK): - sys.stdout.write('skipped -- KOREAN.TXT not found, download from' - ' http://www.unicode.org/Public/MAPPINGS' - '/VENDORS/APPLE/KOREAN.TXT ') - sys.stdout.flush() - return - - for line in open('KOREAN.TXT'): - if not line: - break - data = line.split('#')[0].strip().split() - if len(data) != 2: - continue - - macval = eval(data[0]) - - if macval <= 0x7F: - macch = chr(macval & 0xff) - elif macval >= 0x100: - macch = chr(macval >> 8) + chr(macval & 0xff) - else: - continue - unich = unichrs(data[1]) +class TestMacKorean(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase): + encoding = 'korean.mackorean' + textfile_chunk = ('texts/mackorean', 'texts/mackorean.utf-8') + errortests = ( + # invalid bytes + # \x90 is expected as one byte character in MacKorean. + ("abc\x90\x90\xc1\xc4", "strict", None), + ("abc\xc8", "strict", None), + ("abc\x90\x90\xc1\xc4", "replace", u"abc\ufffd\ufffd\uc894"), + ("abc\x90\x90\xc1\xc4\xc8", "replace", u"abc\ufffd\ufffd\uc894\ufffd"), + ("abc\x90\x90\xc1\xc4", "ignore", u"abc\uc894"), + ("\xc1\x64", "strict", None), # cp949 code H-AE-H + ) + + def test_mapping(self): + import os, sys + + if not os.access('KOREAN.TXT', os.R_OK): + sys.stdout.write('skipped -- KOREAN.TXT not found, download from' + ' http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/APPLE/KOREAN.TXT ') + sys.stdout.flush() + return + + for line in open('KOREAN.TXT'): + if not line: + break + data = line.split('#')[0].strip().split() + if len(data) != 2: + continue + + macval = eval(data[0]) + + if macval <= 0x7F: + macch = chr(macval & 0xff) + elif macval >= 0x100: + macch = chr(macval >> 8) + chr(macval & 0xff) + else: + continue + unich = unichrs(data[1]) - self.assertEqual(unich.encode(self.encoding), macch) - self.assertEqual(unicode(macch, self.encoding), unich) + self.assertEqual(unich.encode(self.encoding), macch) + self.assertEqual(unicode(macch, self.encoding), unich) -#class TestMacKorean_CExtension(Shield.TestMacKorean_Base): -# encoding = 'korean.c.mackorean' - -class TestMacKorean_PurePython(Shield.TestMacKorean_Base): - encoding = 'korean.python.mackorean' if __name__ == '__main__': CodecTestBase.main() 1.5 +2 -2 KoreanCodecs/test/test_qwerty2bul.py Index: test_qwerty2bul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_qwerty2bul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- test_qwerty2bul.py 9 Jan 2003 21:31:44 -0000 1.4 +++ test_qwerty2bul.py 12 Jan 2003 22:54:13 -0000 1.5 @@ -16,13 +16,13 @@ # along with 
KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_qwerty2bul.py,v 1.4 2003/01/09 21:31:44 perky Exp $ +# $Id: test_qwerty2bul.py,v 1.5 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase class TestQWERTY2BUL(CodecTestBase.CodecTestBase): - encoding = 'qwerty2bul' + encoding = 'korean.qwerty2bul' errortests = ( # invalid bytes ("123\x80\x80whkf", "strict", None), 1.3 +2 -2 KoreanCodecs/test/test_unijohab.py Index: test_unijohab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_unijohab.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- test_unijohab.py 9 Jan 2003 21:31:44 -0000 1.2 +++ test_unijohab.py 12 Jan 2003 22:54:13 -0000 1.3 @@ -16,13 +16,13 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_unijohab.py,v 1.2 2003/01/09 21:31:44 perky Exp $ +# $Id: test_unijohab.py,v 1.3 2003/01/12 22:54:13 perky Exp $ # import CodecTestBase class TestUNIJOHAB(CodecTestBase.CodecTestBase): - encoding = 'unijohab' + encoding = 'korean.unijohab' errortests = () # error handling is relying UTF-8 codec. if __name__ == '__main__': |
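After this refactor each codec gets exactly one test class keyed on its installed 'korean.*' name instead of separate C/Python variants. A hedged sketch of what such a class now looks like; the class name and the second error expectation are illustrative, not from the commit:

    import CodecTestBase

    class TestDemo(CodecTestBase.TestStreamReader, CodecTestBase.CodecTestBase):
        encoding = 'korean.euc-kr'          # single implementation, one name
        errortests = (
            ("abc\xc8", "strict", None),              # truncated multibyte raises
            ("abc\xc8", "replace", u"abc\ufffd"),     # illustrative expectation
        )

    if __name__ == '__main__':
        CodecTestBase.main()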
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:14
perky    03/01/12 14:54:13

  Removed:     korean/python __init__.py cp949.py euc_kr.py hangul.py
               iso_2022_kr.py johab.py mackorean.py qwerty2bul.py
               unijohab.py
  Log:
  Remove selective framework for two implementations, 'C' and 'Python'.
  We'll maintain only 1 implementation from now. Accordingly,
  --with[out]-extension options is removed, too.
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:13
perky    03/01/12 14:54:12

  Removed:     korean/c __init__.py cp949.py euc_kr.py
  Log:
  Remove selective framework for two implementations, 'C' and 'Python'.
  We'll maintain only 1 implementation from now. Accordingly,
  --with[out]-extension options is removed, too.
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:13
perky 03/01/12 14:54:12 Modified: korean cp949.py euc_kr.py iso_2022_kr.py johab.py mackorean.py qwerty2bul.py unijohab.py Removed: korean hangul.py Log: Remove selective framework for two implementations, 'C' and 'Python'. We'll maintain only 1 implementation from now. Accordingly, --with[out]-extension options is removed, too. Revision Changes Path 1.5 +18 -5 KoreanCodecs/korean/cp949.py Index: cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/cp949.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- cp949.py 9 Jan 2003 21:35:48 -0000 1.4 +++ cp949.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,23 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: cp949.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: cp949.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.cp949 import * -except ImportError: - from korean.python.cp949 import * +import codecs +import _koco + +class Codec(codecs.Codec): + encode = _koco.cp949_encode + decode = _koco.cp949_decode + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, _koco.StreamReader, codecs.StreamReader): + encoding = 'cp949' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) 1.5 +18 -5 KoreanCodecs/korean/euc_kr.py Index: euc_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/euc_kr.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- euc_kr.py 9 Jan 2003 21:35:48 -0000 1.4 +++ euc_kr.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,23 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: euc_kr.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: euc_kr.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.euc_kr import * -except ImportError: - from korean.python.euc_kr import * +import codecs +import _koco + +class Codec(codecs.Codec): + encode = _koco.euc_kr_encode + decode = _koco.euc_kr_decode + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, _koco.StreamReader, codecs.StreamReader): + encoding = 'euc-kr' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) 1.5 +139 -5 KoreanCodecs/korean/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/iso_2022_kr.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- iso_2022_kr.py 9 Jan 2003 21:35:48 -0000 1.4 +++ iso_2022_kr.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,144 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: iso_2022_kr.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: iso_2022_kr.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.iso_2022_kr import * -except ImportError: - from korean.python.iso_2022_kr import * +import codecs + +KSC5601_CODEC = 'korean.euc-kr' + +US_ASCII = 1 +KSC5601_1987 = 2 + +G0, G1 = 0, 1 # iso-2022-kr doesn't handle G2 and G3 area. 
+ +CHARSETS = { + "\033(B": (G0, US_ASCII), + "\033)B": (G1, US_ASCII), + "\033$(C": (G0, KSC5601_1987), + "\033$)C": (G1, KSC5601_1987), +} +SI = '\x0f' +SO = '\x0e' +ESC = '\033' + +DESIGNATION_MARK = {} +for k, v in CHARSETS.items(): + DESIGNATION_MARK[v] = k + +class Codec(codecs.Codec): + # Unicode to character buffer + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + designation = [US_ASCII, US_ASCII] + new_designation = designation[:] + new_shiftstate = shiftstate = 0 + for c in data: + if c in ('\n', '\r'): + new_shiftstate = 0 + + if c < u'\u0080': + new_shiftstate = 0 + new_designation[0] = US_ASCII + s = c.encode("ascii", errors) + else: + new_shiftstate = 1 + new_designation[1] = KSC5601_1987 + s = c.encode('korean.euc_kr', errors) + + if designation[0] != new_designation[0]: + buffer.append(DESIGNATION_MARK[(G0, new_designation[0])]) + designation[0] = new_designation[0] + if designation[1] != new_designation[1]: + buffer.append(DESIGNATION_MARK[(G1, new_designation[1])]) + designation[1] = new_designation[1] + if shiftstate != new_shiftstate: + buffer.append([SI, SO][new_shiftstate]) + shiftstate = new_shiftstate + + if shiftstate: + s = chr(ord(s[0])&0x7F) + chr(ord(s[1])&0x7F) + buffer.append(s) + + if shiftstate: + buffer.append(SI) + + return (''.join(buffer), len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap_ideo, decmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + + designation = [US_ASCII, KSC5601_1987] + shiftstate = 0 + escstart = -1 + p = 0 + + while p < size: + if data[p] in ('\n', '\r'): + shiftstate = 0 + + if escstart >= 0: + if data[p].isalpha(): + escstr = data[escstart:p+1] + if CHARSETS.has_key(escstr): + charset = CHARSETS[escstr] + designation[charset[0]] = charset[1] + elif errors == 'strict': + raise UnicodeError, "unsupported charset found: " \ + + repr(data[escstart:p+1]) + escstart = -1 + p += 1 + elif data[p] == SO: + shiftstate = 1 + p += 1 + elif data[p] == SI: + shiftstate = 0 + p += 1 + elif data[p] == ESC: + escstart = p + p += 1 + else: + if (ord(data[p]) | (shiftstate and 0x80 or 0x00)) >= 0x80: + codearea = G1 + else: + codearea = G0 + + if designation[codearea] == US_ASCII: + buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + elif ord(data[p]) & 0x7F >= 0x20: # KSC5601_1987 + c = data[p:p+2] + p += 2 + if len(c) == 2: + c = chr(ord(c[0])|0x80) + chr(ord(c[1])|0x80) + buffer.append(unicode(c, KSC5601_CODEC, errors)) + else: # control characters + buffer.append(unichr(ord(data[p]) & 0x7F)) + p += 1 + + return (u''.join(buffer), len(data)) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, codecs.StreamReader): + pass + # not implemented. + # (JapaneseCodecs's implementation is so different to adopt.) 
+ +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et 1.5 +183 -5 KoreanCodecs/korean/johab.py Index: johab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/johab.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- johab.py 9 Jan 2003 21:35:48 -0000 1.4 +++ johab.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,188 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: johab.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: johab.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.johab import * -except ImportError: - from korean.python.johab import * +import codecs + +from korean.hangul import Jaeum, Moeum, ishangul, split, join +encmap, decmap = {}, {} + +johab2uni_chosung = { + 1: u'', 2: Jaeum.G, 3: Jaeum.GG, 4: Jaeum.N, + 5: Jaeum.D, 6: Jaeum.DD, 7: Jaeum.L, 8: Jaeum.M, + 9: Jaeum.B, 10: Jaeum.BB, 11: Jaeum.S, 12: Jaeum.SS, + 13: Jaeum.NG, 14: Jaeum.J, 15: Jaeum.JJ, 16: Jaeum.C, + 17: Jaeum.K, 18: Jaeum.T, 19: Jaeum.P, 20: Jaeum.H, +} +johab2uni_jungsung = { + 2: u'', 3: Moeum.A, 4: Moeum.AE, 5: Moeum.YA, + 6: Moeum.YAE, 7: Moeum.EO, 10: Moeum.E, 11: Moeum.YEO, + 12: Moeum.YE, 13: Moeum.O, 14: Moeum.WA, 15: Moeum.WAE, + 18: Moeum.OE, 19: Moeum.YO, 20: Moeum.U, 21: Moeum.WEO, + 22: Moeum.WE, 23: Moeum.WI, 26: Moeum.YU, 27: Moeum.EU, + 28: Moeum.YI, 29: Moeum.I +} +johab2uni_jongsung = { + 1: u'', 2: Jaeum.G, 3: Jaeum.GG, 4: Jaeum.GS, + 5: Jaeum.N, 6: Jaeum.NJ, 7: Jaeum.NH, 8: Jaeum.D, + 9: Jaeum.L, 10: Jaeum.LG, 11: Jaeum.LM, 12: Jaeum.LB, + 13: Jaeum.LS, 14: Jaeum.LT, 15: Jaeum.LP, 16: Jaeum.LH, + 17: Jaeum.M, 19: Jaeum.B, 20: Jaeum.BS, 21: Jaeum.S, + 22: Jaeum.SS, 23: Jaeum.NG, 24: Jaeum.J, 25: Jaeum.C, + 26: Jaeum.K, 27: Jaeum.T, 28: Jaeum.P, 29: Jaeum.H +} + +uni2johab_chosung = {} +uni2johab_jungsung = {} +uni2johab_jongsung = {} +for k, v in johab2uni_chosung.items(): + uni2johab_chosung[v] = k +for k, v in johab2uni_jungsung.items(): + uni2johab_jungsung[v] = k +for k, v in johab2uni_jongsung.items(): + uni2johab_jongsung[v] = k + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + global encmap + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + + for c in data: + if c < u'\u0080': + buffer.append(c.encode("ascii", errors)) + elif ishangul(c): + cho, jung, jong = split(c) # all hangul can success + cho, jung, jong = ( + uni2johab_chosung[cho], + uni2johab_jungsung[jung], + uni2johab_jongsung[jong] + ) + code = 0x8000 | (cho<<10) | (jung<<5) | jong + buffer.append(chr(code>>8) + chr(code&0xFF)) + else: + if not encmap: + from korean.mappings import johab_ideograph + encmap = johab_ideograph.encoding_map + + if encmap.has_key(c): + buffer.append(encmap[c]) + elif errors == 'replace': + buffer.append('\x84\x41') + elif errors == 'strict': + raise UnicodeError, "cannot map \\u%04x to JOHAB" % ord(c) + + return (''.join(buffer), len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + p = 0 + while p < size: + if data[p] < '\x80': + 
buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + else: + c = data[p:p+2] + p += 2 + if len(c) == 2: + code = (ord(c[0])<<8) | ord(c[1]) + cho = (code >> 10) & 0x1f + jung = (code >> 5) & 0x1f + jong = (code) & 0x1f + if ( johab2uni_chosung.has_key(cho) and + johab2uni_jungsung.has_key(jung) and + johab2uni_jongsung.has_key(jong) ): + buffer.append( join([ + johab2uni_chosung[cho], + johab2uni_jungsung[jung], + johab2uni_jongsung[jong] + ]) ) + continue + + if not decmap: + from korean.mappings import johab_ideograph + decmap = johab_ideograph.decoding_map + + if decmap.has_key(c): + buffer.append(decmap[c]) + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte 0x%02x%02x found" % tuple(map(ord, c)) + + return (u''.join(buffer), size) + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + + def __init__(self, stream, errors='strict'): + codecs.StreamReader.__init__(self, stream, errors) + self.data = '' + + def _read(self, func, size): + if size == 0: + return u'' + if size is None or size < 0: + data = self.data + func() + self.data = '' + else: + data = self.data + func(max(size, 2) - len(self.data)) + size = len(data) + p = 0 + while p < size: + if data[p] < "\x80": + p = p + 1 + elif p + 2 <= size: + p = p + 2 + else: + break + data, self.data = data[:p], data[p:] + return self.decode(data)[0] + + def read(self, size=-1): + return self._read(self.stream.read, size) + + def readline(self, size=-1): + return self._read(self.stream.readline, size) + + def readlines(self, size=-1): + data = self._read(self.stream.read, size) + buffer = [] + end = 0 + while 1: + pos = data.find(u'\n', end) + if pos < 0: + if end < len(data): + buffer.append(data[end:]) + break + buffer.append(data[end:pos+1]) + end = pos+1 + return buffer + def reset(self): + self.data = '' + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + 1.2 +181 -5 KoreanCodecs/korean/mackorean.py Index: mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/mackorean.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- mackorean.py 9 Jan 2003 22:40:39 -0000 1.1 +++ mackorean.py 12 Jan 2003 22:54:12 -0000 1.2 @@ -17,10 +17,186 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: mackorean.py,v 1.1 2003/01/09 22:40:39 perky Exp $ +# $Id: mackorean.py,v 1.2 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.mackorean import * -except ImportError: - from korean.python.mackorean import * +import codecs +from korean.mappings import ksc5601_hangul, appleextension +encmap_hangul, decmap_hangul = ksc5601_hangul.encoding_map, ksc5601_hangul.decoding_map +encmap_apple, decmap_apple = appleextension.encoding_map, appleextension.decoding_map +encmap_ideo, decmap_ideo = {}, {} +encmap_misc, decmap_misc = {}, {} + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + global encmap_ideo, encmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + buffer = [] + p = 0 + size = len(data) + + while p < size: + aemap = appleextension.multilevel_encmap + relp = 0 + while p + relp < size and aemap.has_key(data[p + relp]): + 
aemap = aemap[data[p + relp]] + relp += 1 + if aemap.has_key(None): + buffer.append(aemap[None]) + p += relp + continue + + c = data[p] + p += 1 + + if c < u'\u0080': + buffer.append(c.encode("ascii", errors)) + elif encmap_hangul.has_key(c): + buffer.append(encmap_hangul[c]) + elif encmap_apple.has_key(c): + buffer.append(encmap_apple[c]) + else: + if not encmap_misc: + from korean.mappings import ksc5601_misc + encmap_misc = ksc5601_misc.encoding_map + if encmap_misc.has_key(c): + buffer.append(encmap_misc[c]) + continue + + if not encmap_ideo: + from korean.mappings import ksc5601_ideograph + encmap_ideo = ksc5601_ideograph.encoding_map + if encmap_ideo.has_key(c): + buffer.append(encmap_ideo[c]) + continue + + if errors == 'replace': + buffer.append('\xa1\xa1') + elif errors == 'strict': + raise UnicodeError, ("cannot map " + "\\u%04x to MacKorean") % ord(c) + + return (''.join(buffer), len(data)) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + global decmap_ideo, decmap_misc + + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + buffer = [] + data = str(data) # character buffer compatible object + size = len(data) + p = 0 + while p < size: + if data[p] < '\x80': + buffer.append(unicode(data[p], "ascii", errors)) + p += 1 + elif data[p] <= '\xa0' or data[p] == '\xff': + if decmap_apple.has_key(data[p]): + buffer.append(decmap_apple[data[p]]) + p += 1 + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte %s found" % ( + hex(ord(data[p]))) + p += 1 + else: + c = data[p:p+2] + p += 2 + if len(c) == 2: + if decmap_hangul.has_key(c): + buffer.append(decmap_hangul[c]) + continue + elif decmap_apple.has_key(c): + buffer.append(decmap_apple[c]) + continue + + if not decmap_misc: + from korean.mappings import ksc5601_misc + decmap_misc = ksc5601_misc.decoding_map + if decmap_misc.has_key(c): + buffer.append(decmap_misc[c]) + continue + + if not decmap_ideo: + from korean.mappings import ksc5601_ideograph + decmap_ideo = ksc5601_ideograph.decoding_map + if decmap_ideo.has_key(c): + buffer.append(decmap_ideo[c]) + continue + + if errors == 'replace': + buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER + elif errors == 'strict': + raise UnicodeError, "unexpected byte 0x%s found" % ( + ''.join(["%02x"%ord(x) for x in c]) ) + + return (u''.join(buffer), size) + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + + def __init__(self, stream, errors='strict'): + codecs.StreamReader.__init__(self, stream, errors) + self.data = '' + + def _read(self, func, size): + if size == 0: + return u'' + if size is None or size < 0: + data = self.data + func() + self.data = '' + else: + data = self.data + func(max(size, 2) - len(self.data)) + size = len(data) + p = 0 + while p < size: + if data[p] < "\xa1" or data[p] == "\xff": + p = p + 1 + elif p + 2 <= size: + p = p + 2 + else: + break + data, self.data = data[:p], data[p:] + return self.decode(data)[0] + + def read(self, size=-1): + return self._read(self.stream.read, size) + + def readline(self, size=-1): + return self._read(self.stream.readline, size) + + def readlines(self, size=-1): + data = self._read(self.stream.read, size) + buffer = [] + end = 0 + while 1: + pos = data.find(u'\n', end) + if pos < 0: + if end < len(data): + buffer.append(data[end:]) + break + buffer.append(data[end:pos+1]) + end = pos+1 + return 
buffer + def reset(self): + self.data = '' + + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et 1.5 +177 -5 KoreanCodecs/korean/qwerty2bul.py Index: qwerty2bul.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/qwerty2bul.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- qwerty2bul.py 9 Jan 2003 21:35:48 -0000 1.4 +++ qwerty2bul.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,182 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: qwerty2bul.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: qwerty2bul.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.qwerty2bul import * -except ImportError: - from korean.python.qwerty2bul import * +import codecs +from korean.hangul import Moeum, Jaeum, Chosung, Jungsung, Jongsung +from korean.hangul import ishangul, join, split, isJaeum, isMoeum + +codekeymap = { + Jaeum.G: 'r', Jaeum.GG: 'R', Jaeum.GS: 'rt', + Jaeum.N: 's', Jaeum.NJ:'sw', Jaeum.NH: 'sg', Jaeum.D: 'e', + Jaeum.DD:'E', Jaeum.L: 'f', Jaeum.LG: 'fr', Jaeum.LM: 'fa', + Jaeum.LB:'fq', Jaeum.LS:'ft', Jaeum.LT: 'fx', Jaeum.LP: 'fv', + Jaeum.LH:'fg', Jaeum.M: 'a', Jaeum.B: 'q', Jaeum.BB: 'Q', + Jaeum.BS:'qt', Jaeum.S: 't', Jaeum.SS: 'T', Jaeum.NG: 'd', + Jaeum.J: 'w', Jaeum.JJ:'W', Jaeum.C: 'c', Jaeum.K: 'z', + Jaeum.T: 'x', Jaeum.P: 'v', Jaeum.H: 'g', + + Moeum.A: 'k', Moeum.AE:'o', Moeum.YA: 'i', Moeum.YAE:'O', + Moeum.EO:'j', Moeum.E: 'p', Moeum.YEO:'u', Moeum.YE: 'P', + Moeum.O: 'h', Moeum.WA:'hk', Moeum.WAE:'ho', Moeum.OE: 'hl', + Moeum.YO:'y', Moeum.U: 'n', Moeum.WEO:'nj', Moeum.WE: 'np', + Moeum.WI:'nl', Moeum.YU:'b', Moeum.EU: 'm', Moeum.YI: 'ml', + Moeum.I: 'l', + + u'': '', +} + +keycodemap = {} +for k, v in codekeymap.items(): + keycodemap[v] = k + keycodemap.setdefault(v.upper(), k) +keycodes = ''.join(keycodemap.keys()) +del k, v + + +class Automata_Hangul2: + + # must Unicode in / Unicode out + + def __init__(self): + self.clear() + + def pushcomp(self): + if self.chosung and not self.jungsung: + self.word_valid = 0 + self.word_comp.append(join([self.chosung, self.jungsung, self.jongsung])) + self.clearcomp() + + def clearcomp(self): + self.chosung = u'' + self.jungsung = u'' + self.jongsung = u'' + + def clear(self): + self.buff = [''] + self.word_raw = [] + self.word_comp = [] + self.word_valid = 1 + self.clearcomp() + + def convert(self, s): + self.clear() + + map(self.feed, s) + self.finalize() + + return u''.join(self.buff) + + def finalize(self): + if self.chosung or self.jungsung or self.jongsung: + self.pushcomp() + if self.word_raw or self.word_comp: + if self.word_valid: + self.buff.append(u''.join(self.word_comp)) + else: + self.word_valid = 1 + self.buff.append(u''.join(self.word_raw)) + + self.word_raw, self.word_comp = [], [] + + def feed(self, c): + self.word_raw.append(c) + if c in keycodes: + code = keycodemap[c] + if isJaeum(code): + if not self.chosung: # chosung O + if self.jungsung or self.jongsung: + self.word_valid = 0 + else: + self.chosung = code + elif not self.jungsung: # chosung O jungsung X + if self.jongsung: + self.word_valid = 0 + else: + self.pushcomp() + self.chosung = code + elif not self.jongsung: # chosung O jungsung O jongsung X + if code not in Jongsung: + self.pushcomp() + self.chosung = code + else: + self.jongsung = code + else: # full + trymul = codekeymap[self.jongsung] + c + if 
keycodemap.has_key(trymul): # can be multi jongsung + self.jongsung = keycodemap[trymul] + else: + self.pushcomp() + self.chosung = code + else: # MOEUM... + if not self.jongsung: + if not self.jungsung: # jungsung X jongsung X + self.jungsung = code + else: # jungsung O jongsung X + trymul = codekeymap[self.jungsung] + c + if keycodemap.has_key(trymul): # can be multi jungsung + self.jungsung = keycodemap[trymul] + else: + self.pushcomp() + self.jungsung = code + else: # jongsung O + if len(codekeymap[self.jongsung]) > 1: + ojong = keycodemap[codekeymap[self.jongsung][:-1]] + ncho = keycodemap[codekeymap[self.jongsung][-1]] + self.jongsung = ojong + self.pushcomp() + self.chosung = ncho + self.jungsung = code + else: + njong = self.jongsung + self.jongsung = u'' + self.pushcomp() + self.chosung = njong + self.jungsung = code + else: # non key code + self.finalize() + self.buff.append(c) + + +class Codec(codecs.Codec): + + BASECODEC = 'korean.cp949' # fallback codec of decoder + + # Unicode to key stroke + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + r = [] + for c in data: + if c <= u'\u0080': + r.append(c.encode('ascii')) + elif not ishangul(c): + r.append(c.encode(self.BASECODEC, errors=errors)) + else: + for k in split(c): + r.append(codekeymap[k]) + + r = ''.join(r) + return (r, len(r)) + + # key stroke to Unicode + def decode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + s = unicode(data, self.BASECODEC, errors) + am = Automata_Hangul2() + r = am.convert(s) + return (r, len(r)) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, codecs.StreamReader): + pass + +def getregentry(): + return (Codec().encode, Codec().decode, StreamReader, StreamWriter) 1.5 +31 -5 KoreanCodecs/korean/unijohab.py Index: unijohab.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/unijohab.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- unijohab.py 9 Jan 2003 21:35:48 -0000 1.4 +++ unijohab.py 12 Jan 2003 22:54:12 -0000 1.5 @@ -17,10 +17,36 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: unijohab.py,v 1.4 2003/01/09 21:35:48 perky Exp $ +# $Id: unijohab.py,v 1.5 2003/01/12 22:54:12 perky Exp $ # -try: - from korean.c.unijohab import * -except ImportError: - from korean.python.unijohab import * +import codecs +from korean.hangul import ishangul, disjoint, conjoin + +class Codec(codecs.Codec): + + # Unicode to character buffer + def encode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + return disjoint(data).encode('utf-8', errors), len(data) + + # character buffer to Unicode + def decode(self, data, errors='strict'): + if errors not in ('strict', 'ignore', 'replace'): + raise ValueError, "unknown error handling" + + return conjoin(unicode(data, 'utf-8', errors)), len(data) + +class StreamWriter(Codec, codecs.StreamWriter): + pass + +class StreamReader(Codec, codecs.StreamReader): + pass + # XXX: Temporarily None. + +### encodings module API + +def getregentry(): + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) |
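The Johab encoder in the commit above packs every Hangul syllable into a single 16-bit code: the most significant bit is always set, and the chosung, jungsung and jongsung indices from the tables in johab.py take five bits each. A short worked example under those index values (G=2, A=3, and 1 for an empty jongsung); the last comment assumes the codec ends up registered under the korean.johab name listed in the READMEs:

    cho, jung, jong = 2, 3, 1                        # G + A + no jongsung, i.e. U+AC00
    code = 0x8000 | (cho << 10) | (jung << 5) | jong
    assert code == 0x8861                            # the usual Johab code for U+AC00
    johab_bytes = chr(code >> 8) + chr(code & 0xFF)  # '\x88a', what Codec.encode() emits
    # With the codec registered: u'\uac00'.encode('korean.johab') == '\x88a'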
From: Hye-Shik C. <pe...@us...> - 2003-01-12 22:54:13
|
perky 03/01/12 14:54:11 Modified: . README.en README.ko setup.py Log: Remove selective framework for two implementations, 'C' and 'Python'. We'll maintain only 1 implementation from now. Accordingly, --with[out]-extension options is removed, too. Revision Changes Path 1.26 +5 -5 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.25 retrieving revision 1.26 diff -u -r1.25 -r1.26 --- README.en 10 Jan 2003 06:36:25 -0000 1.25 +++ README.en 12 Jan 2003 22:54:11 -0000 1.26 @@ -2,7 +2,7 @@ ============================= Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.en,v 1.25 2003/01/10 06:36:25 perky Exp $ +$Id: README.en,v 1.26 2003/01/12 22:54:11 perky Exp $ Introduction @@ -65,10 +65,8 @@ corresponding codec names: o Wansung - - korean.python.euc-kr - - korean.python.cp949 - - korean.c.euc-kr - - korean.c.cp949 + - korean.euc-kr + - korean.cp949 - korean.mackorean o Johab @@ -106,6 +104,8 @@ - Added MacKorean codec which is used by MacOS 7 and above. - Reimplemented ISO-2022-KR codec and it can handle ksc5601 designated on G0 area, now. (MULE Compatible) + - Pure python implementation of euc-kr and cp949 codec as well as hangul + module is removed from distribution. o Version 2.0.5 - 24 July 2002 - Add two new characters which is introduced by KSX1001-1998 1.24 +9 -5 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.23 retrieving revision 1.24 diff -u -r1.23 -r1.24 --- README.ko 10 Jan 2003 06:36:25 -0000 1.23 +++ README.ko 12 Jan 2003 22:54:11 -0000 1.24 @@ -2,7 +2,7 @@ ====================== Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.ko,v 1.23 2003/01/10 06:36:25 perky Exp $ +$Id: README.ko,v 1.24 2003/01/12 22:54:11 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ @@ -67,17 +67,19 @@ ÀÌ ÆÐŰÁö¿¡¼´Â ´ÙÀ½ ÀÎÄÚµùµéÀ» Á¦°øÇÕ´Ï´Ù: o ¿Ï¼ºÇü - - korean.python.euc-kr : KS5601 ¿Ï¼ºÇü - - korean.python.cp949 : Microsoft È®Àå¿Ï¼ºÇü - - korean.c.euc-kr : C·Î ÀÛ¼ºµÈ EUC-KR ÄÚµ¦ - - korean.c.cp949 : C·Î ÀÛ¼ºµÈ CP949 ÄÚµ¦ + - korean.euc-kr : KS5601 ¿Ï¼ºÇü + - korean.cp949 : Microsoft È®Àå¿Ï¼ºÇü - korean.mackorean : Apple È®Àå¿Ï¼ºÇü + o Á¶ÇÕÇü - korean.johab : »ó¿ë Á¶ÇÕÇü (8ºñÆ®) + o ISO-2022-KR - korean.iso-2022-kr : RFC1557 ÇÑ±Û ÀÎÅÍ³Ý ¸Þ½ÃÁö ÀÎÄÚµù + o À¯´ÏÄÚµå Á¶ÇÕÇü - korean.unijohab : À¯´ÏÁ¶ÇÕ (MacOS Á¶ÇÕ) + o Qwerty ÀÚÆÇ ¸ÅÇÎ - korean.qwerty2bul : 2¹ú½Ä - ÄõƼÀÚÆÇ ¸ÅÇÎ @@ -106,6 +108,8 @@ - MacOS 7 À̻󿡼 »ç¿ëµÇ´Â MacKorean ÄÚµ¦ÀÌ Ãß°¡µÇ¾ú½À´Ï´Ù. - »õ·Î ±¸ÇöµÈ ISO-2022-KR ÄÚµ¦Àº G0 ¿µ¿ª¿¡ ÁöÁ¤µÈ KSC5601µµ ´Ù·ê ¼ö ÀÖ°Ô µÇ¾ú½À´Ï´Ù. (MULE ȣȯ) + - EUC-KR, CP949 ÄÚµ¦°ú hangul ¸ðµâÀÇ ¼ø¼ö ÆÄÀ̽㠱¸ÇöÀÌ ´õÀÌ»ó ¹èÆ÷µÇÁö + ¾Ê½À´Ï´Ù. o ¹öÁ¯ 2.0.5 2002³â 7¿ù 24ÀÏ - KSX1001-1998¿¡ Ãß°¡µÈ À¯·ÎÈ Ç¥±â¿Í µî·Ï»óÇ¥ ¸¶Å©¸¦ Ãß°¡ÇÏ¿´½À´Ï´Ù. 
1.32 +6 -9 KoreanCodecs/setup.py Index: setup.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/setup.py,v retrieving revision 1.31 retrieving revision 1.32 diff -u -r1.31 -r1.32 --- setup.py 9 Jan 2003 23:37:46 -0000 1.31 +++ setup.py 12 Jan 2003 22:54:11 -0000 1.32 @@ -18,7 +18,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: setup.py,v 1.31 2003/01/09 23:37:46 perky Exp $ +# $Id: setup.py,v 1.32 2003/01/12 22:54:11 perky Exp $ # import sys @@ -27,7 +27,6 @@ flavors = { 'aliases': 1, - 'extension': 1, } for flname in flavors.keys(): if '--without-'+flname in sys.argv: @@ -65,10 +64,8 @@ url = "http://sourceforge.net/projects/koco", cmdclass = {'install': Install}, packages = ['korean', - 'korean.mappings', - 'korean.c', - 'korean.python'], - ext_modules = flavors['extension'] and [ - Extension("korean.c._koco", ["src/_koco.c"]), - Extension("korean.c.hangul", ["src/hangul.c"]), - ] or []) + 'korean.mappings'], + ext_modules = [ + Extension("korean._koco", ["src/_koco.c"]), + Extension("korean.hangul", ["src/hangul.c"]), + ]) |
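For reference, the flavor switches that setup.py still understands are scanned out of sys.argv before distutils parses the command line; the diff context above cuts the loop body off. The sketch below fills it in under the assumption that the real script simply records the flag and strips the private option; it is not a quote of the actual code:

    import sys

    flavors = {'aliases': 1}                  # flavor name -> enabled by default

    for flname in flavors.keys():
        for prefix, value in (('--without-', 0), ('--with-', 1)):
            opt = prefix + flname
            if opt in sys.argv:
                flavors[flname] = value
                sys.argv.remove(opt)          # keep distutils from seeing the private option

    # e.g. "python setup.py install --without-aliases" turns the aliases flavor off.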
From: Hye-Shik C. <pe...@us...> - 2003-01-11 15:01:58
|
perky 03/01/11 07:01:57 Modified: test test_cp949.py test_mackorean.py Log: Don't retrieve mapping file automatically, give a chance to user. Revision Changes Path 1.11 +8 -7 KoreanCodecs/test/test_cp949.py Index: test_cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_cp949.py,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- test_cp949.py 10 Jan 2003 04:29:34 -0000 1.10 +++ test_cp949.py 11 Jan 2003 15:01:56 -0000 1.11 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_cp949.py,v 1.10 2003/01/10 04:29:34 perky Exp $ +# $Id: test_cp949.py,v 1.11 2003/01/11 15:01:56 perky Exp $ # import CodecTestBase @@ -38,15 +38,16 @@ ) def test_mapping(self): - import sys, os, urllib + import sys, os - if not os.access('map-cp949.txt', os.R_OK): - sys.stdout.write('(Trying to download cp949 mapping) ') + if not os.access('CP949.TXT', os.R_OK): + sys.stdout.write('skipped -- CP949.TXT not found, download from' + ' http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/MICSFT/WINDOWS/CP949.TXT ') sys.stdout.flush() - urllib.urlretrieve('http://www.unicode.org/Public/MAPPINGS' - '/VENDORS/MICSFT/WINDOWS/CP949.TXT', 'map-cp949.txt') + return - for line in open('map-cp949.txt'): + for line in open('CP949.TXT'): if not line: break data = line.split('#')[0].strip().split() 1.4 +8 -7 KoreanCodecs/test/test_mackorean.py Index: test_mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_mackorean.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- test_mackorean.py 10 Jan 2003 04:29:34 -0000 1.3 +++ test_mackorean.py 11 Jan 2003 15:01:56 -0000 1.4 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_mackorean.py,v 1.3 2003/01/10 04:29:34 perky Exp $ +# $Id: test_mackorean.py,v 1.4 2003/01/11 15:01:56 perky Exp $ # import CodecTestBase @@ -40,15 +40,16 @@ ) def test_mapping(self): - import os, urllib, sys + import os, sys - if not os.access('map-mackorean.txt', os.R_OK): - sys.stdout.write('(Trying to download MacKorean mapping) ') + if not os.access('KOREAN.TXT', os.R_OK): + sys.stdout.write('skipped -- KOREAN.TXT not found, download from' + ' http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/APPLE/KOREAN.TXT ') sys.stdout.flush() - urllib.urlretrieve('http://www.unicode.org/Public/MAPPINGS' - '/VENDORS/APPLE/KOREAN.TXT', 'map-mackorean.txt') + return - for line in open('map-mackorean.txt'): + for line in open('KOREAN.TXT'): if not line: break data = line.split('#')[0].strip().split() |
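The files these tests now expect locally are the plain-text tables published at unicode.org: every non-comment line pairs a native code with the Unicode code point(s) it maps to, separated by whitespace, and everything after '#' is commentary. A minimal sketch of loading such a table with the same split('#')[0] parsing the tests use; the '+' guard only exists because some Apple table entries map to a sequence of code points:

    def load_mapping(path):
        mapping = {}
        for line in open(path):
            data = line.split('#')[0].strip().split()
            if len(data) < 2 or '+' in data[1]:
                continue        # blank lines, undefined codes, multi-codepoint entries
            mapping[int(data[0], 16)] = unichr(int(data[1], 16))
        return mapping

    # mapping = load_mapping('CP949.TXT')      # e.g. 0xBEC8 -> u'\uc548'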
From: Hye-Shik C. <pe...@us...> - 2003-01-10 06:50:45
|
perky 03/01/09 22:50:44 Modified: korean aliases.py Log: Fix a dumb typo. macwansung -> mackorean Revision Changes Path 1.10 +2 -2 KoreanCodecs/korean/aliases.py Index: aliases.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/aliases.py,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- aliases.py 9 Jan 2003 22:40:39 -0000 1.9 +++ aliases.py 10 Jan 2003 06:50:44 -0000 1.10 @@ -17,7 +17,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: aliases.py,v 1.9 2003/01/09 22:40:39 perky Exp $ +# $Id: aliases.py,v 1.10 2003/01/10 06:50:44 perky Exp $ # import encodings.aliases @@ -42,5 +42,5 @@ 'unijohab': 'korean.unijohab', 'macjohab': 'korean.unijohab', 'mackorean': 'korean.mackorean', - 'macwansung': 'korean.macwansung', + 'macwansung': 'korean.mackorean', }) |
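With the typo fixed, both 'mackorean' and 'macwansung' point at the same codec module. How an alias actually reaches korean/mackorean.py at lookup time is up to the package's own registration code, which isn't shown in this commit; the search function below is a hypothetical stand-in that only illustrates the mechanism the alias table feeds:

    import codecs

    ALIASES = {'mackorean':  'korean.mackorean',
               'macwansung': 'korean.mackorean'}   # the corrected entry from this commit

    def _search(name):                             # hypothetical stand-in for the package's lookup
        name = ALIASES.get(name, name)
        if not name.startswith('korean.'):
            return None
        try:
            mod = __import__('korean.' + name[7:].replace('-', '_'),
                             None, None, ['getregentry'])
        except ImportError:
            return None
        return mod.getregentry()                   # (encode, decode, StreamReader, StreamWriter)

    codecs.register(_search)
    # unicode(data, 'macwansung') now resolves through korean.mackorean.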
From: Hye-Shik C. <pe...@us...> - 2003-01-10 06:36:26
|
perky 03/01/09 22:36:25 Modified: . README.en README.ko Log: Mention about new iso-2022-kr codec implementation Revision Changes Path 1.25 +3 -1 KoreanCodecs/README.en Index: README.en =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.en,v retrieving revision 1.24 retrieving revision 1.25 diff -u -r1.24 -r1.25 --- README.en 10 Jan 2003 02:31:56 -0000 1.24 +++ README.en 10 Jan 2003 06:36:25 -0000 1.25 @@ -2,7 +2,7 @@ ============================= Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.en,v 1.24 2003/01/10 02:31:56 perky Exp $ +$Id: README.en,v 1.25 2003/01/10 06:36:25 perky Exp $ Introduction @@ -104,6 +104,8 @@ - Fixed a bug that korean.python.cp949 codec taints korean.python.euc_kr codec's decoding and encoding mappings on its loading time. - Added MacKorean codec which is used by MacOS 7 and above. + - Reimplemented ISO-2022-KR codec and it can handle ksc5601 designated + on G0 area, now. (MULE Compatible) o Version 2.0.5 - 24 July 2002 - Add two new characters which is introduced by KSX1001-1998 1.23 +3 -1 KoreanCodecs/README.ko Index: README.ko =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/README.ko,v retrieving revision 1.22 retrieving revision 1.23 diff -u -r1.22 -r1.23 --- README.ko 10 Jan 2003 02:31:56 -0000 1.22 +++ README.ko 10 Jan 2003 06:36:25 -0000 1.23 @@ -2,7 +2,7 @@ ====================== Copyright(C) 2002-2003 Hye-Shik Chang. -$Id: README.ko,v 1.22 2003/01/10 02:31:56 perky Exp $ +$Id: README.ko,v 1.23 2003/01/10 06:36:25 perky Exp $ *Ä·ÆäÀÎ* ÀÎÅͳݿ¡¼ ÇÑ±Û ¸ÂÃã¹ýÀ» Áöŵ½Ã´Ù. ^-^/~ @@ -104,6 +104,8 @@ - korean.python.cp949 ÄÚµ¦ÀÌ ·ÎµåµÇ¸é¼ korean.python.euc_kr ÀÇ ¸ÅÇο¡ cp949¸¦ Ãß°¡Çعö¸®´Â ¹ö±×°¡ ¼öÁ¤µÇ¾ú½À´Ï´Ù. - MacOS 7 À̻󿡼 »ç¿ëµÇ´Â MacKorean ÄÚµ¦ÀÌ Ãß°¡µÇ¾ú½À´Ï´Ù. + - »õ·Î ±¸ÇöµÈ ISO-2022-KR ÄÚµ¦Àº G0 ¿µ¿ª¿¡ ÁöÁ¤µÈ KSC5601µµ ´Ù·ê ¼ö ÀÖ°Ô + µÇ¾ú½À´Ï´Ù. (MULE ȣȯ) o ¹öÁ¯ 2.0.5 2002³â 7¿ù 24ÀÏ - KSX1001-1998¿¡ Ãß°¡µÈ À¯·ÎÈ Ç¥±â¿Í µî·Ï»óÇ¥ ¸¶Å©¸¦ Ãß°¡ÇÏ¿´½À´Ï´Ù. |
From: Hye-Shik C. <pe...@us...> - 2003-01-10 06:09:31
|
perky 03/01/09 22:09:31 Modified: korean/python iso_2022_kr.py Log: Change to reimplemented code that can handle ksc5601 designated on G0 area. Revision Changes Path 1.13 +52 -82 KoreanCodecs/korean/python/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/iso_2022_kr.py,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- iso_2022_kr.py 9 Jan 2003 21:35:49 -0000 1.12 +++ iso_2022_kr.py 10 Jan 2003 06:09:31 -0000 1.13 @@ -17,42 +17,40 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: iso_2022_kr.py,v 1.12 2003/01/09 21:35:49 perky Exp $ +# $Id: iso_2022_kr.py,v 1.13 2003/01/10 06:09:31 perky Exp $ # import codecs -from korean.mappings import ksc5601_hangul -encmap_hangul = ksc5601_hangul.encoding_map -decmap_hangul = ksc5601_hangul.decoding_map -encmap_ideo, decmap_ideo = {}, {} -encmap_misc, decmap_misc = {}, {} -US_ASCII = 1 -KSC5601_1987 = 2 +KSC5601_CODEC = 'korean.euc-kr' + +US_ASCII = 1 +KSC5601_1987 = 2 + +G0, G1 = 0, 1 # iso-2022-kr doesn't handle G2 and G3 area. CHARSETS = { - "\033(B": US_ASCII, - "\033$)C": KSC5601_1987, + "\033(B": (G0, US_ASCII), + "\033)B": (G1, US_ASCII), + "\033$(C": (G0, KSC5601_1987), + "\033$)C": (G1, KSC5601_1987), } SI = '\x0f' SO = '\x0e' ESC = '\033' -DESIGNATIONS = {} +DESIGNATION_MARK = {} for k, v in CHARSETS.items(): - DESIGNATIONS[v] = k - -# StreamReader was adopted from Tamito KAJIYAMA's iso-2022-jp codec. + DESIGNATION_MARK[v] = k class Codec(codecs.Codec): # Unicode to character buffer def encode(self, data, errors='strict'): - global encmap_ideo, encmap_misc - if errors not in ('strict', 'ignore', 'replace'): raise ValueError, "unknown error handling" buffer = [] - new_charset = charset = US_ASCII + designation = [US_ASCII, US_ASCII] + new_designation = designation[:] new_shiftstate = shiftstate = 0 for c in data: if c in ('\n', '\r'): @@ -60,49 +58,30 @@ if c < u'\u0080': new_shiftstate = 0 + new_designation[0] = US_ASCII s = c.encode("ascii", errors) - elif encmap_hangul.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_hangul[c] else: - if not encmap_misc: - from korean.mappings import ksc5601_misc - encmap_misc = ksc5601_misc.encoding_map - if encmap_misc.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_misc[c] - else: - if not encmap_ideo: - from korean.mappings import ksc5601_ideograph - encmap_ideo = ksc5601_ideograph.encoding_map - if encmap_ideo.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_ideo[c] - elif errors == 'replace': - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = '\xa1\xa1' - elif errors == 'strict': - raise UnicodeError, "cannot map \\u%04x to ISO-2022-KR" % ord(c) - else: - continue - - if charset != new_charset: - charset = new_charset - buffer.append(DESIGNATIONS[charset]) - if new_shiftstate != shiftstate: + new_shiftstate = 1 + new_designation[1] = KSC5601_1987 + s = c.encode('korean.euc_kr', errors) + + if designation[0] != new_designation[0]: + buffer.append(DESIGNATION_MARK[(G0, new_designation[0])]) + designation[0] = new_designation[0] + if designation[1] != new_designation[1]: + buffer.append(DESIGNATION_MARK[(G1, new_designation[1])]) + designation[1] = new_designation[1] + if shiftstate != new_shiftstate: + buffer.append([SI, SO][new_shiftstate]) shiftstate = new_shiftstate - buffer.append([SI, 
SO][shiftstate]) if shiftstate: s = chr(ord(s[0])&0x7F) + chr(ord(s[1])&0x7F) buffer.append(s) + if shiftstate: buffer.append(SI) - #buffer.append(DESIGNATIONS[US_ASCII]) + return (''.join(buffer), len(data)) # character buffer to Unicode @@ -114,10 +93,12 @@ buffer = [] data = str(data) # character buffer compatible object size = len(data) - charset = US_ASCII - shiftstate = 0 - escstart = -1 - p = 0 + + designation = [US_ASCII, KSC5601_1987] + shiftstate = 0 + escstart = -1 + p = 0 + while p < size: if data[p] in ('\n', '\r'): shiftstate = 0 @@ -127,8 +108,10 @@ escstr = data[escstart:p+1] if CHARSETS.has_key(escstr): charset = CHARSETS[escstr] + designation[charset[0]] = charset[1] elif errors == 'strict': - raise UnicodeError, "unsupported charset found: %s" % repr(data[escstart:p+1]) + raise UnicodeError, "unsupported charset found: " \ + + repr(data[escstart:p+1]) escstart = -1 p += 1 elif data[p] == SO: @@ -141,39 +124,24 @@ escstart = p p += 1 else: - if not shiftstate and ( - charset == US_ASCII or data[p] < '\x80'): # ascii + if (ord(data[p]) | (shiftstate and 0x80 or 0x00)) >= 0x80: + codearea = G1 + else: + codearea = G0 + + if designation[codearea] == US_ASCII: buffer.append(unicode(data[p], "ascii", errors)) p += 1 - else: + elif ord(data[p]) & 0x7F >= 0x20: # KSC5601_1987 c = data[p:p+2] p += 2 if len(c) == 2: c = chr(ord(c[0])|0x80) + chr(ord(c[1])|0x80) - if decmap_hangul.has_key(c): - buffer.append(decmap_hangul[c]) - continue - - if not decmap_misc: - from korean.mappings import ksc5601_misc - decmap_misc = ksc5601_misc.decoding_map - if decmap_misc.has_key(c): - buffer.append(decmap_misc[c]) - continue - - if not decmap_ideo: - from korean.mappings import ksc5601_ideograph - decmap_ideo = ksc5601_ideograph.decoding_map - if decmap_ideo.has_key(c): - buffer.append(decmap_ideo[c]) - continue + buffer.append(unicode(c, KSC5601_CODEC, errors)) + else: # control characters + buffer.append(unichr(ord(data[p]) & 0x7F)) + p += 1 - if errors == 'replace': - buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER - elif errors == 'strict': - raise UnicodeError, "unexpected byte 0x%02x%02x found" % tuple(map(ord, c)) - # XXX: only 1byte? - return (u''.join(buffer), len(data)) class StreamWriter(Codec, codecs.StreamWriter): @@ -188,3 +156,5 @@ def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et |
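A worked example of the two byte forms the reimplemented codec accepts. 0xBE 0xC8 is a single Hangul syllable in EUC-KR; stripping the high bits gives the 7-bit pair '>' 'H' that actually travels inside ISO-2022-KR. The byte values follow the CHARSETS/SO/SI constants in the diff above; the codec names assume the package's registration:

    # RFC 1557 form: designate KSC5601 into G1 once, then select it with SO/SI.
    rfc_form = '\x1b$)C' + '\x0e' + '>H' + '\x0f'

    # Emacs/MULE form (newly handled): designate KSC5601 into G0, no shifts needed,
    # then designate ASCII back into G0.
    mule_form = '\x1b$(C' + '>H' + '\x1b(B'

    assert (unicode(rfc_form, 'korean.iso-2022-kr') ==
            unicode(mule_form, 'korean.iso-2022-kr') ==
            unicode('\xbe\xc8', 'korean.euc-kr'))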
From: Hye-Shik C. <pe...@us...> - 2003-01-10 06:08:22
|
perky 03/01/09 22:08:22 Modified: test CodecTestBase.py test_iso_2022_kr.py Log: Don't test roundtrip for iso-2022-kr Revision Changes Path 1.9 +5 -3 KoreanCodecs/test/CodecTestBase.py Index: CodecTestBase.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/CodecTestBase.py,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- CodecTestBase.py 9 Jan 2003 21:41:37 -0000 1.8 +++ CodecTestBase.py 10 Jan 2003 06:08:21 -0000 1.9 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: CodecTestBase.py,v 1.8 2003/01/09 21:41:37 perky Exp $ +# $Id: CodecTestBase.py,v 1.9 2003/01/10 06:08:21 perky Exp $ # import StringIO @@ -40,8 +40,9 @@ encoding = '' # codec name textfile_chunk = None # (native, utf-8) file name tuple textfile_stream = None # (native, utf-8) - + errortests = None # must set. error test tuple + roundtriptest = 1 # set if roundtrip is possible with unicode def setUp(self): if not self.textfile_chunk: @@ -54,7 +55,8 @@ for native, utf8 in zip(*[open(f).readlines() for f in self.textfile_chunk]): u = unicode(native, self.encoding) self.assertEqual(u, unicode(utf8, 'utf-8')) - self.assertEqual(native, u.encode(self.encoding)) + if self.roundtriptest: + self.assertEqual(native, u.encode(self.encoding)) def test_ErrorHandling(self): encode, decode, Reader, Writer = codecs.lookup(self.encoding) 1.6 +5 -4 KoreanCodecs/test/test_iso_2022_kr.py Index: test_iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_iso_2022_kr.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- test_iso_2022_kr.py 9 Jan 2003 21:41:37 -0000 1.5 +++ test_iso_2022_kr.py 10 Jan 2003 06:08:22 -0000 1.6 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_iso_2022_kr.py,v 1.5 2003/01/09 21:41:37 perky Exp $ +# $Id: test_iso_2022_kr.py,v 1.6 2003/01/10 06:08:22 perky Exp $ # import CodecTestBase @@ -25,12 +25,13 @@ encoding = 'iso-2022-kr' textfile_chunk = ('texts/%s.roundrobin' % encoding, 'texts/%s.utf-8' % encoding) textfile_stream = ('texts/%s.stream' % encoding, 'texts/%s.utf-8' % encoding) + roundtriptest = 0 errortests = ( # invalid bytes - ("abc\x1b$)C\x0e\x00\x00AD\x0f\x1b$(B", "strict", None), - ("abc\x1b$)C\x0e\x00\x00AD\x0f\x1b$(B", "replace", u"abc\ufffd\uc894"), - ("abc\x1b$)C\x0e\x00\x00AD\x0f\x1b$(B", "ignore", u"abc\uc894"), + ("abc\x1b$)C\x0eA\x12AD\x0f\x1b$(B", "strict", None), + ("abc\x1b$)C\x0eA\x12AD\x0f\x1b$(B", "replace", u"abc\ufffd\uc894"), + ("abc\x1b$)C\x0eA\x12AD\x0f\x1b$(B", "ignore", u"abc\uc894"), ) if __name__ == '__main__': |
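Byte-level roundtrip is switched off here because ISO-2022-KR admits more than one valid encoding of the same text (the G1/SO-SI form and the G0/MULE form shown above), so re-encoding a decoded fixture cannot be expected to reproduce its exact bytes. The errortests tuples, meanwhile, read as (encoded input, error mode, expected result), with None meaning the decode should raise; CodecTestBase obtains the codec through codecs.lookup(self.encoding) as the diff shows, and the loop below is only a sketch of a harness consuming the same tuples, not the base class itself:

    import codecs

    def run_errortests(encoding, errortests):
        encode, decode, Reader, Writer = codecs.lookup(encoding)
        for data, mode, expected in errortests:
            if expected is None:
                try:
                    decode(data, mode)
                except UnicodeError:
                    pass                        # the expected failure
                else:
                    raise AssertionError('decoding %r should have failed' % data)
            else:
                assert decode(data, mode)[0] == expected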
From: Hye-Shik C. <pe...@us...> - 2003-01-10 04:50:41
|
perky 03/01/09 20:50:40 Modified: test/texts iso-2022-kr.roundrobin iso-2022-kr.stream iso-2022-kr.utf-8 Log: Add an example that designates ksc5601 on G0 area. (Emacs way) Revision Changes Path 1.2 +1 -0 KoreanCodecs/test/texts/iso-2022-kr.roundrobin Index: iso-2022-kr.roundrobin =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/texts/iso-2022-kr.roundrobin,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- iso-2022-kr.roundrobin 9 Jan 2003 21:41:38 -0000 1.1 +++ iso-2022-kr.roundrobin 10 Jan 2003 04:50:40 -0000 1.2 @@ -2,3 +2,4 @@ $)C1]>W@: 9+GQ@L0m 4kCb@O@: >pA&3* @T4O4Y. *^^* $)C&354C (>$?7g5N (8$F$i(>$?7g (9$S(2$@(7$D?d ^$Q^*&5 +$(C0!3;(B $(C5N7g(B $(C8^7UGO=J4O1n(B. Hehe$(C@W@W(B. 1.2 +1 -0 KoreanCodecs/test/texts/iso-2022-kr.stream Index: iso-2022-kr.stream =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/texts/iso-2022-kr.stream,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- iso-2022-kr.stream 9 Jan 2003 21:41:38 -0000 1.1 +++ iso-2022-kr.stream 10 Jan 2003 04:50:40 -0000 1.2 @@ -2,3 +2,4 @@ 1]>W@: 9+GQ@L0m 4kCb@O@: >pA&3* @T4O4Y. *^^* &354C (>$?7g5N (8$F$i(>$?7g (9$S(2$@(7$D?d ^$Q^*&5 +$(C0!3;(B $(C5N7g(B $(C8^7UGO=J4O1n(B. Hehe$(C@W@W(B. 1.2 +1 -0 KoreanCodecs/test/texts/iso-2022-kr.utf-8 <<Binary file>> |
From: Hye-Shik C. <pe...@us...> - 2003-01-10 04:29:37
|
perky 03/01/09 20:29:35 Modified: test test_cp949.py test_mackorean.py Log: Use self.encoding instead of hardcoded encoding name. Revision Changes Path 1.10 +3 -3 KoreanCodecs/test/test_cp949.py Index: test_cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_cp949.py,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- test_cp949.py 10 Jan 2003 04:26:43 -0000 1.9 +++ test_cp949.py 10 Jan 2003 04:29:34 -0000 1.10 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_cp949.py,v 1.9 2003/01/10 04:26:43 perky Exp $ +# $Id: test_cp949.py,v 1.10 2003/01/10 04:29:34 perky Exp $ # import CodecTestBase @@ -62,8 +62,8 @@ continue unich = unichrs(data[1]) - self.assertEqual(unich.encode('cp949'), cp949ch) - self.assertEqual(unicode(cp949ch, 'cp949'), unich) + self.assertEqual(unich.encode(self.encoding), cp949ch) + self.assertEqual(unicode(cp949ch, self.encoding), unich) class TestCP949_CExtension(Shield.TestCP949Base): encoding = 'korean.c.cp949' 1.3 +3 -3 KoreanCodecs/test/test_mackorean.py Index: test_mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_mackorean.py,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- test_mackorean.py 10 Jan 2003 04:26:43 -0000 1.2 +++ test_mackorean.py 10 Jan 2003 04:29:34 -0000 1.3 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_mackorean.py,v 1.2 2003/01/10 04:26:43 perky Exp $ +# $Id: test_mackorean.py,v 1.3 2003/01/10 04:29:34 perky Exp $ # import CodecTestBase @@ -65,8 +65,8 @@ continue unich = unichrs(data[1]) - self.assertEqual(unich.encode('mackorean'), macch) - self.assertEqual(unicode(macch, 'mackorean'), unich) + self.assertEqual(unich.encode(self.encoding), macch) + self.assertEqual(unicode(macch, self.encoding), unich) #class TestMacKorean_CExtension(Shield.TestMacKorean_Base): # encoding = 'korean.c.mackorean' |
From: Hye-Shik C. <pe...@us...> - 2003-01-10 04:26:44
|
perky 03/01/09 20:26:43 Modified: test test_cp949.py test_mackorean.py Log: Fetch mapping files from unicode.org site. Revision Changes Path 1.9 +10 -2 KoreanCodecs/test/test_cp949.py Index: test_cp949.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_cp949.py,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- test_cp949.py 10 Jan 2003 03:15:25 -0000 1.8 +++ test_cp949.py 10 Jan 2003 04:26:43 -0000 1.9 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_cp949.py,v 1.8 2003/01/10 03:15:25 perky Exp $ +# $Id: test_cp949.py,v 1.9 2003/01/10 04:26:43 perky Exp $ # import CodecTestBase @@ -38,7 +38,15 @@ ) def test_mapping(self): - for line in open('mappings/cp949.txt'): + import sys, os, urllib + + if not os.access('map-cp949.txt', os.R_OK): + sys.stdout.write('(Trying to download cp949 mapping) ') + sys.stdout.flush() + urllib.urlretrieve('http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/MICSFT/WINDOWS/CP949.TXT', 'map-cp949.txt') + + for line in open('map-cp949.txt'): if not line: break data = line.split('#')[0].strip().split() 1.2 +10 -2 KoreanCodecs/test/test_mackorean.py Index: test_mackorean.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/test/test_mackorean.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- test_mackorean.py 10 Jan 2003 03:15:25 -0000 1.1 +++ test_mackorean.py 10 Jan 2003 04:26:43 -0000 1.2 @@ -16,7 +16,7 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: test_mackorean.py,v 1.1 2003/01/10 03:15:25 perky Exp $ +# $Id: test_mackorean.py,v 1.2 2003/01/10 04:26:43 perky Exp $ # import CodecTestBase @@ -40,7 +40,15 @@ ) def test_mapping(self): - for line in open('mappings/MacKorean.txt'): + import os, urllib, sys + + if not os.access('map-mackorean.txt', os.R_OK): + sys.stdout.write('(Trying to download MacKorean mapping) ') + sys.stdout.flush() + urllib.urlretrieve('http://www.unicode.org/Public/MAPPINGS' + '/VENDORS/APPLE/KOREAN.TXT', 'map-mackorean.txt') + + for line in open('map-mackorean.txt'): if not line: break data = line.split('#')[0].strip().split() |