[KoCo-CVS] [Commit] KoreanCodecs/korean/python iso_2022_kr.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

perky       03/01/09 22:09:31

  Modified:    korean/python iso_2022_kr.py
  Log:
  Switch to a reimplemented codec that can also handle KSC5601 designated to the G0 area.

  Revision  Changes    Path
  1.13      +52 -82    KoreanCodecs/korean/python/iso_2022_kr.py

  Index: iso_2022_kr.py
  ===================================================================
  RCS file: /cvsroot/koco/KoreanCodecs/korean/python/iso_2022_kr.py,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- iso_2022_kr.py	9 Jan 2003 21:35:49 -0000	1.12
  +++ iso_2022_kr.py	10 Jan 2003 06:09:31 -0000	1.13
  @@ -17,42 +17,40 @@
   # along with KoreanCodecs; if not, write to the Free Software
   # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   #
  -# $Id: iso_2022_kr.py,v 1.12 2003/01/09 21:35:49 perky Exp $
  +# $Id: iso_2022_kr.py,v 1.13 2003/01/10 06:09:31 perky Exp $
   #

   import codecs
  -from korean.mappings import ksc5601_hangul
  -encmap_hangul = ksc5601_hangul.encoding_map
  -decmap_hangul = ksc5601_hangul.decoding_map
  -encmap_ideo, decmap_ideo = {}, {}
  -encmap_misc, decmap_misc = {}, {}

  -US_ASCII      = 1
  -KSC5601_1987  = 2
  +KSC5601_CODEC   = 'korean.euc-kr'
  +
  +US_ASCII        = 1
  +KSC5601_1987    = 2
  +
  +G0, G1 = 0, 1 # iso-2022-kr doesn't handle G2 and G3 area.

   CHARSETS = {
  -    "\033(B": US_ASCII,
  -    "\033$)C": KSC5601_1987,
  +    "\033(B": (G0, US_ASCII),
  +    "\033)B": (G1, US_ASCII),
  +    "\033$(C": (G0, KSC5601_1987),
  +    "\033$)C": (G1, KSC5601_1987),
   }
   SI = '\x0f'
   SO = '\x0e'
   ESC = '\033'

  -DESIGNATIONS = {}
  +DESIGNATION_MARK = {}
   for k, v in CHARSETS.items():
  -    DESIGNATIONS[v] = k
  -
  -# StreamReader was adopted from Tamito KAJIYAMA's iso-2022-jp codec.
  +    DESIGNATION_MARK[v] = k

   class Codec(codecs.Codec):
       # Unicode to character buffer
       def encode(self, data, errors='strict'):
  -        global encmap_ideo, encmap_misc
  -
           if errors not in ('strict', 'ignore', 'replace'):
               raise ValueError, "unknown error handling"
           buffer = []
  -        new_charset = charset = US_ASCII
  +        designation = [US_ASCII, US_ASCII]
  +        new_designation = designation[:]
           new_shiftstate = shiftstate = 0
           for c in data:
               if c in ('\n', '\r'):
  @@ -60,49 +58,30 @@

               if c < u'\u0080':
                   new_shiftstate = 0
  +                new_designation[0] = US_ASCII
                   s = c.encode("ascii", errors)
  -            elif encmap_hangul.has_key(c):
  -                new_charset = KSC5601_1987
  -                new_shiftstate = 1
  -                s = encmap_hangul[c]
               else:
  -                if not encmap_misc:
  -                    from korean.mappings import ksc5601_misc
  -                    encmap_misc = ksc5601_misc.encoding_map
  -                if encmap_misc.has_key(c):
  -                    new_charset = KSC5601_1987
  -                    new_shiftstate = 1
  -                    s = encmap_misc[c]
  -                else:
  -                    if not encmap_ideo:
  -                        from korean.mappings import ksc5601_ideograph
  -                        encmap_ideo = ksc5601_ideograph.encoding_map
  -                    if encmap_ideo.has_key(c):
  -                        new_charset = KSC5601_1987
  -                        new_shiftstate = 1
  -                        s = encmap_ideo[c]
  -                    elif errors == 'replace':
  -                        new_charset = KSC5601_1987
  -                        new_shiftstate = 1
  -                        s = '\xa1\xa1'
  -                    elif errors == 'strict':
  -                        raise UnicodeError, "cannot map \\u%04x to ISO-2022-KR" % ord(c)
  -                    else:
  -                        continue
  -
  -            if charset != new_charset:
  -                charset = new_charset
  -                buffer.append(DESIGNATIONS[charset])
  -            if new_shiftstate != shiftstate:
  +                new_shiftstate = 1
  +                new_designation[1] = KSC5601_1987
  +                s = c.encode('korean.euc_kr', errors)
  +
  +            if designation[0] != new_designation[0]:
  +                buffer.append(DESIGNATION_MARK[(G0, new_designation[0])])
  +                designation[0] = new_designation[0]
  +            if designation[1] != new_designation[1]:
  +                buffer.append(DESIGNATION_MARK[(G1, new_designation[1])])
  +                designation[1] = new_designation[1]
  +            if shiftstate != new_shiftstate:
  +                buffer.append([SI, SO][new_shiftstate])
                   shiftstate = new_shiftstate
  -                buffer.append([SI, SO][shiftstate])

               if shiftstate:
                   s = chr(ord(s[0])&0x7F) + chr(ord(s[1])&0x7F)
               buffer.append(s)
  +
           if shiftstate:
               buffer.append(SI)
  -            #buffer.append(DESIGNATIONS[US_ASCII])
  +
           return (''.join(buffer), len(data))

       # character buffer to Unicode
  @@ -114,10 +93,12 @@
           buffer = []
           data = str(data) # character buffer compatible object
           size = len(data)
  -        charset = US_ASCII
  -        shiftstate = 0
  -        escstart = -1
  -        p = 0
  +
  +        designation = [US_ASCII, KSC5601_1987]
  +        shiftstate  = 0
  +        escstart    = -1
  +        p           = 0
  +
           while p < size:
               if data[p] in ('\n', '\r'):
                   shiftstate = 0
  @@ -127,8 +108,10 @@
                       escstr = data[escstart:p+1]
                       if CHARSETS.has_key(escstr):
                           charset = CHARSETS[escstr]
  +                        designation[charset[0]] = charset[1]
                       elif errors == 'strict':
  -                        raise UnicodeError, "unsupported charset found: %s" % repr(data[escstart:p+1])
  +                        raise UnicodeError, "unsupported charset found: " \
  +                                            + repr(data[escstart:p+1])
                       escstart = -1
                   p += 1
               elif data[p] == SO:
  @@ -141,39 +124,24 @@
                   escstart = p
                   p += 1
               else:
  -                if not shiftstate and (
  -                        charset == US_ASCII or data[p] < '\x80'): # ascii
  +                if (ord(data[p]) | (shiftstate and 0x80 or 0x00)) >= 0x80:
  +                    codearea = G1
  +                else:
  +                    codearea = G0
  +
  +                if designation[codearea] == US_ASCII:
                       buffer.append(unicode(data[p], "ascii", errors))
                       p += 1
  -                else:
  +                elif ord(data[p]) & 0x7F >= 0x20: # KSC5601_1987
                       c = data[p:p+2]
                       p += 2
                       if len(c) == 2:
                           c = chr(ord(c[0])|0x80) + chr(ord(c[1])|0x80)
  -                        if decmap_hangul.has_key(c):
  -                            buffer.append(decmap_hangul[c])
  -                            continue
  -
  -                        if not decmap_misc:
  -                            from korean.mappings import ksc5601_misc
  -                            decmap_misc = ksc5601_misc.decoding_map
  -                        if decmap_misc.has_key(c):
  -                            buffer.append(decmap_misc[c])
  -                            continue
  -
  -                        if not decmap_ideo:
  -                            from korean.mappings import ksc5601_ideograph
  -                            decmap_ideo = ksc5601_ideograph.decoding_map
  -                        if decmap_ideo.has_key(c):
  -                            buffer.append(decmap_ideo[c])
  -                            continue
  +                        buffer.append(unicode(c, KSC5601_CODEC, errors))
  +                else: # control characters
  +                    buffer.append(unichr(ord(data[p]) & 0x7F))
  +                    p += 1

  -                    if errors == 'replace':
  -                        buffer.append(u'\uFFFD') # REPLACEMENT CHARACTER
  -                    elif errors == 'strict':
  -                        raise UnicodeError, "unexpected byte 0x%02x%02x found" % tuple(map(ord, c))
  -                        # XXX: only 1byte?
  -        
           return (u''.join(buffer), len(data))

   class StreamWriter(Codec, codecs.StreamWriter):
  @@ -188,3 +156,5 @@

   def getregentry():
       return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
  +
  +# ex: ts=8 sts=4 et