[KoCo-CVS] [Commit] KoreanCodecs/korean/python iso_2022_kr.py
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-01-10 06:09:31
|
perky 03/01/09 22:09:31 Modified: korean/python iso_2022_kr.py Log: Change to reimplemented code that can handle ksc5601 designated on G0 area. Revision Changes Path 1.13 +52 -82 KoreanCodecs/korean/python/iso_2022_kr.py Index: iso_2022_kr.py =================================================================== RCS file: /cvsroot/koco/KoreanCodecs/korean/python/iso_2022_kr.py,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- iso_2022_kr.py 9 Jan 2003 21:35:49 -0000 1.12 +++ iso_2022_kr.py 10 Jan 2003 06:09:31 -0000 1.13 @@ -17,42 +17,40 @@ # along with KoreanCodecs; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# $Id: iso_2022_kr.py,v 1.12 2003/01/09 21:35:49 perky Exp $ +# $Id: iso_2022_kr.py,v 1.13 2003/01/10 06:09:31 perky Exp $ # import codecs -from korean.mappings import ksc5601_hangul -encmap_hangul = ksc5601_hangul.encoding_map -decmap_hangul = ksc5601_hangul.decoding_map -encmap_ideo, decmap_ideo = {}, {} -encmap_misc, decmap_misc = {}, {} -US_ASCII = 1 -KSC5601_1987 = 2 +KSC5601_CODEC = 'korean.euc-kr' + +US_ASCII = 1 +KSC5601_1987 = 2 + +G0, G1 = 0, 1 # iso-2022-kr doesn't handle G2 and G3 area. CHARSETS = { - "\033(B": US_ASCII, - "\033$)C": KSC5601_1987, + "\033(B": (G0, US_ASCII), + "\033)B": (G1, US_ASCII), + "\033$(C": (G0, KSC5601_1987), + "\033$)C": (G1, KSC5601_1987), } SI = '\x0f' SO = '\x0e' ESC = '\033' -DESIGNATIONS = {} +DESIGNATION_MARK = {} for k, v in CHARSETS.items(): - DESIGNATIONS[v] = k - -# StreamReader was adopted from Tamito KAJIYAMA's iso-2022-jp codec. + DESIGNATION_MARK[v] = k class Codec(codecs.Codec): # Unicode to character buffer def encode(self, data, errors='strict'): - global encmap_ideo, encmap_misc - if errors not in ('strict', 'ignore', 'replace'): raise ValueError, "unknown error handling" buffer = [] - new_charset = charset = US_ASCII + designation = [US_ASCII, US_ASCII] + new_designation = designation[:] new_shiftstate = shiftstate = 0 for c in data: if c in ('\n', '\r'): @@ -60,49 +58,30 @@ if c < u'\u0080': new_shiftstate = 0 + new_designation[0] = US_ASCII s = c.encode("ascii", errors) - elif encmap_hangul.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_hangul[c] else: - if not encmap_misc: - from korean.mappings import ksc5601_misc - encmap_misc = ksc5601_misc.encoding_map - if encmap_misc.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_misc[c] - else: - if not encmap_ideo: - from korean.mappings import ksc5601_ideograph - encmap_ideo = ksc5601_ideograph.encoding_map - if encmap_ideo.has_key(c): - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = encmap_ideo[c] - elif errors == 'replace': - new_charset = KSC5601_1987 - new_shiftstate = 1 - s = '\xa1\xa1' - elif errors == 'strict': - raise UnicodeError, "cannot map \\u%04x to ISO-2022-KR" % ord(c) - else: - continue - - if charset != new_charset: - charset = new_charset - buffer.append(DESIGNATIONS[charset]) - if new_shiftstate != shiftstate: + new_shiftstate = 1 + new_designation[1] = KSC5601_1987 + s = c.encode('korean.euc_kr', errors) + + if designation[0] != new_designation[0]: + buffer.append(DESIGNATION_MARK[(G0, new_designation[0])]) + designation[0] = new_designation[0] + if designation[1] != new_designation[1]: + buffer.append(DESIGNATION_MARK[(G1, new_designation[1])]) + designation[1] = new_designation[1] + if shiftstate != new_shiftstate: + buffer.append([SI, SO][new_shiftstate]) shiftstate = new_shiftstate - buffer.append([SI, SO][shiftstate]) if shiftstate: s = chr(ord(s[0])&0x7F) + chr(ord(s[1])&0x7F) buffer.append(s) + if shiftstate: buffer.append(SI) - #buffer.append(DESIGNATIONS[US_ASCII]) + return (''.join(buffer), len(data)) # character buffer to Unicode @@ -114,10 +93,12 @@ buffer = [] data = str(data) # character buffer compatible object size = len(data) - charset = US_ASCII - shiftstate = 0 - escstart = -1 - p = 0 + + designation = [US_ASCII, KSC5601_1987] + shiftstate = 0 + escstart = -1 + p = 0 + while p < size: if data[p] in ('\n', '\r'): shiftstate = 0 @@ -127,8 +108,10 @@ escstr = data[escstart:p+1] if CHARSETS.has_key(escstr): charset = CHARSETS[escstr] + designation[charset[0]] = charset[1] elif errors == 'strict': - raise UnicodeError, "unsupported charset found: %s" % repr(data[escstart:p+1]) + raise UnicodeError, "unsupported charset found: " \ + + repr(data[escstart:p+1]) escstart = -1 p += 1 elif data[p] == SO: @@ -141,39 +124,24 @@ escstart = p p += 1 else: - if not shiftstate and ( - charset == US_ASCII or data[p] < '\x80'): # ascii + if (ord(data[p]) | (shiftstate and 0x80 or 0x00)) >= 0x80: + codearea = G1 + else: + codearea = G0 + + if designation[codearea] == US_ASCII: buffer.append(unicode(data[p], "ascii", errors)) p += 1 - else: + elif ord(data[p]) & 0x7F >= 0x20: # KSC5601_1987 c = data[p:p+2] p += 2 if len(c) == 2: c = chr(ord(c[0])|0x80) + chr(ord(c[1])|0x80) - if decmap_hangul.has_key(c): - buffer.append(decmap_hangul[c]) - continue - - if not decmap_misc: - from korean.mappings import ksc5601_misc - decmap_misc = ksc5601_misc.decoding_map - if decmap_misc.has_key(c): - buffer.append(decmap_misc[c]) - continue - - if not decmap_ideo: - from korean.mappings import ksc5601_ideograph - decmap_ideo = ksc5601_ideograph.decoding_map - if decmap_ideo.has_key(c): - buffer.append(decmap_ideo[c]) - continue + buffer.append(unicode(c, KSC5601_CODEC, errors)) + else: # control characters + buffer.append(unichr(ord(data[p]) & 0x7F)) + p += 1 - if errors == 'replace': - buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER - elif errors == 'strict': - raise UnicodeError, "unexpected byte 0x%02x%02x found" % tuple(map(ord, c)) - # XXX: only 1byte? - return (u''.join(buffer), len(data)) class StreamWriter(Codec, codecs.StreamWriter): @@ -188,3 +156,5 @@ def getregentry(): return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +# ex: ts=8 sts=4 et |