[KoCo-CVS] [Commit] KoreanCodecs/korean/python hangul.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

perky       02/04/23 20:36:01

  Added:       korean/python hangul.py
  Log:
  - Move hangul python implementation into python/
  - Added hangul.format, the hangul adaptive formatter

  Revision  Changes    Path
  1.1                  KoreanCodecs/korean/python/hangul.py

  Index: hangul.py
  ===================================================================
  #!/usr/local/bin/python
  # ex:ts=4
  #
  # Unicode hangul abstractive controller
  #
  #   written by Hye-Shik Chang <pe...@fa...>
  #
  #
  # Unicode Hangul Code-Area Specifications:
  #  http://www.unicode.org/charts/PDF/UAC00.pdf
  #
  # Jamo Short Name Conventions:
  #  http://www.unicode.org/unicode/uni2book/ch04.pdf  (section 4.4)
  #
  # Conjoining Jamo Behavior:
  #  http://www.unicode.org/unicode/uni2book/ch03.pdf  (section 3.11)
  #
  # $Id: hangul.py,v 1.1 2002/04/24 03:36:01 perky Exp $
  #

  class UnicodeHangulError(Exception):
      """Error raised by this module for invalid hangul input.

      The message passed to the constructor is kept in ``msg`` and is what
      both ``str()`` and ``repr()`` of the exception return.
      """

      def __init__ (self, msg):
          # Initialise the base class too, so that the standard ``args``
          # attribute carries the message like any other exception.
          Exception.__init__(self, msg)
          self.msg = msg

      def __repr__ (self):
          return self.msg

      __str__ = __repr__

  # Empty unicode string standing in for an absent jamo; it is the first entry
  # of Jaeum.Jongsung (no final consonant) and fills the unused slots that
  # split() returns for a lone jamo.
  Null = u''

  # Namespace for the consonant jamo (code points U+3131..U+314E).
  #
  # NOTE: module-level code iterates Jaeum.__dict__ and re-exports every
  # attribute whose name is upper-case and at most three characters long, so
  # any such attribute added here also becomes a module-level name.
  class Jaeum: # XXX: 1100-1159 Old Jaeum need?

      # All consonant code points; the comment row under each line gives the
      # short name bound to the code point directly above it.
      Codes = (u'\u3131', u'\u3132', u'\u3133', u'\u3134', u'\u3135', u'\u3136',
              #    G         GG          GS         N          NJ         NH
               u'\u3137', u'\u3138', u'\u3139', u'\u313a', u'\u313b', u'\u313c',
              #    D         DD          L          LG         LM         LB
               u'\u313d', u'\u313e', u'\u313f', u'\u3140', u'\u3141', u'\u3142',
              #    LS        LT          LP         LH         M          B
               u'\u3143', u'\u3144', u'\u3145', u'\u3146', u'\u3147', u'\u3148',
              #    BB        BS          S          SS         NG         J
               u'\u3149', u'\u314a', u'\u314b', u'\u314c', u'\u314d', u'\u314e')
              #    JJ        C           K          T          P          H
      # Number of consonant code points.
      Width = len(Codes)
      # Bind each short name to its code point, in the same order as Codes.
      G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B, \
      BB, BS, S, SS, NG, J, JJ, C, K, T, P, H = Codes
      # Initial consonants; the position in this list is the index used by
      # join()/split() for the syllable arithmetic, so the order matters.
      Chosung = [G, GG, N, D, DD, L, M, B, BB, S, SS, NG, J, JJ, C, K, T, P, H]
      # Final consonants; index 0 (Null) means "no final consonant".  The
      # order matters for the same reason as Chosung.
      Jongsung = [Null, G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT, \
                  LP, LH, M, B, BS, S, SS, NG, J, C, K, T, P, H]
      # Compound consonants mapped to the simple consonants they consist of.
      MultiElement = {
          GG: (G, G),  GS: (G, S),  NJ: (N, J),  NH: (N, H),  DD: (D, D),
          LG: (L, G),  LM: (L, M),  LB: (L, B),  LS: (L, S),  LT: (L, T),
          LP: (L, P),  LH: (L, H),  BB: (B, B),  BS: (B, S),  SS: (S, S),
          JJ: (J, J)
      }

  # Namespace for the vowel jamo (code points U+314F..U+3163).
  #
  # NOTE: module-level code iterates Moeum.__dict__ and re-exports every
  # attribute whose name is upper-case and at most three characters long, so
  # any such attribute added here also becomes a module-level name.
  class Moeum: # XXX: 1161-117f Old Moeum need?

      # All vowel code points; the comment row under each line gives the
      # short name bound to the code point directly above it.
      Codes = (u'\u314f', u'\u3150', u'\u3151', u'\u3152', u'\u3153', u'\u3154',
              #    A          AE        YA         YAE         EO         E
               u'\u3155', u'\u3156', u'\u3157', u'\u3158', u'\u3159', u'\u315a',
              #    YEO        YE        O          WA          WAE        OE
               u'\u315b', u'\u315c', u'\u315d', u'\u315e', u'\u315f', u'\u3160',
              #    YO         U         WEO        WE          WI         YU
               u'\u3161', u'\u3162', u'\u3163')
              #    EU         YI        I
      # Number of vowel code points.
      Width = len(Codes)
      # Bind each short name to its code point, in the same order as Codes.
      A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO, \
      U, WEO, WE, WI, YU, EU, YI, I = Codes
      # Medial vowels; the position in this list is the index used by
      # join()/split() for the syllable arithmetic.
      Jungsung = list(Codes)
      # Compound vowels mapped to the vowels they consist of.
      MultiElement = {
          AE: (A, I),  YAE: (YA, I),  YE: (YEO, I), WA: (O, A),  WAE: (O, A, I),
          OE: (O, I),  WEO: (U, EO),  WE: (U, E),   WI: (U, I),  YI: (EU, I)
      }

  # Aliases for your convenience
  Chosung = Jaeum.Chosung
  Jungsung = Moeum.Jungsung
  Jongsung = Jaeum.Jongsung

  # Re-export every jamo short name (G, GG, ..., A, AE, ...) as a module-level
  # constant.  The short names are exactly the class attributes that are
  # upper-case and at most three characters long; the other attributes
  # (Codes, Width, Chosung, ...) do not match and are skipped.
  # The value is stored directly in the module namespace instead of building
  # and exec-ing an assignment statement from its repr().
  for name, code in Jaeum.__dict__.items() + Moeum.__dict__.items():
      if name.isupper() and len(name) <= 3:
          globals()[name] = code

  def isJaeum(c):
      """ Tell whether c is one of the consonant jamo in Jaeum.Codes """
      return c in Jaeum.Codes

  def isMoeum(c):
      """ Tell whether c is one of the vowel jamo in Moeum.Codes """
      return c in Moeum.Codes

  # Unicode Hangul Syllables Characteristics
  # First and last code point of the precomposed syllable area.
  zone = (u'\uAC00', u'\uD7A3')
  # (weight, jamo table) pairs, most significant first: a syllable's offset
  # from zone[0] is the sum of weight * index-in-table over the three tables.
  splitters = [
      (len(Jungsung) * len(Jongsung), Chosung),
      (len(Jongsung), Jungsung),
      (1, Jongsung),
  ]

  def ishangul(code):
      """ Tell whether code is a precomposed syllable or a single jamo """
      in_syllable_area = zone[0] <= code <= zone[1]
      return in_syllable_area or isJaeum(code) or isMoeum(code)

  # Alternative Suffixes
  #
  # Either spelling of a particle maps to the same pair:
  #   [0] the form used when the preceding character has no final consonant,
  #   [1] the form used when it has one.
  # The trailing comments romanize the pair in that order.
  ALT_SUFFIXES = {
      u'\uc744': (u'\ub97c', u'\uc744'), # reul, eul
      u'\ub97c': (u'\ub97c', u'\uc744'), # reul, eul
      u'\uc740': (u'\ub294', u'\uc740'), # neun, eun
      u'\ub294': (u'\ub294', u'\uc740'), # neun, eun
      u'\uc774': (u'\uac00', u'\uc774'), # ga, yi
      u'\uac00': (u'\uac00', u'\uc774'), # ga, yi
      u'\uc640': (u'\uc640', u'\uacfc'), # wa, gwa
      u'\uacfc': (u'\uc640', u'\uacfc'), # wa, gwa
  }

  # Ida-Variation Suffixes
  #
  # Keyed by the three-character marker as written in a format string.
  #   [1] is the syllable inserted when the preceding character has a final
  #       consonant.
  #   [0] is used when it has none: a non-zero number is added to the code
  #       point of a preceding hangul syllable (17 and 4 are the positions of
  #       B and N in Jaeum.Jongsung), while u'' means nothing is inserted.
  IDA_SUFFIXES = {
      u'(\uc774)': (u'', u'\uc774'),     # (yi)da
      u'(\uc785)': (17, u'\uc785'),      # (ip)nida
      u'(\uc778)': (4, u'\uc778'),       # (in)-
  }

  def join(codes):
      """ Join function which makes hangul syllable from jamos

      codes is a 3-element sequence (tuple or list) holding the chosung,
      jungsung and jongsung; use Null for an absent jongsung.  The argument
      is never modified.

      Returns the precomposed syllable as a one-character unicode string.
      When the chosung or the jungsung is empty no syllable can be built and
      whichever of the two is present is returned unchanged (Null when both
      are empty).

      Raises UnicodeHangulError when codes does not have exactly three
      elements, and ValueError (from list.index) when an element is not a
      valid jamo for its position.
      """
      # Compare by value; identity of small integers is an implementation
      # detail.
      if len(codes) != 3:
          raise UnicodeHangulError("needs 3-element tuple")
      if not codes[0] or not codes[1]: # single jamo
          return codes[0] or codes[1]

      r = ord(zone[0])
      # Walk the positions and their tables side by side instead of popping
      # from a copy, so that tuples are accepted as well as lists.
      for (multiplier, codeset), jamo in zip(splitters, codes):
          r = r + multiplier*codeset.index(jamo)

      return unichr(r)

  def split(code):
      """ Split function which splits hangul syllable into jamos

      code must be a single character accepted by ishangul().

      Returns a 3-element list [chosung, jungsung, jongsung].  A lone
      consonant comes back as [code, Null, Null], a lone vowel as
      [Null, code, Null], and a syllable without final consonant has Null
      as its last element.

      Raises UnicodeHangulError when code is not exactly one hangul letter.
      """
      if len(code) != 1 or not ishangul(code):
          raise UnicodeHangulError("needs 1 hangul letter")
      if code in Jaeum.Codes:
          return [code, Null, Null]
      if code in Moeum.Codes:
          return [Null, code, Null]

      offset = ord(code) - ord(zone[0])
      r = []
      for divider, codeset in splitters:
          # divmod always yields an integer quotient, so the table index
          # stays an int even where "/" means true division.
          value, offset = divmod(offset, divider)
          r.append(codeset[value])
      return r

  def dividestring(str, intoelements=0):
      """ Replace every hangul letter of a unicode string by its jamos

      Characters that are not hangul are copied unchanged.  When
      intoelements is true, compound jamos are expanded further into the
      elements listed in the MultiElement tables.

      Raises UnicodeHangulError when str is not a unicode string.
      """
      if type(str) is not type(u''):
          raise UnicodeHangulError("needs unicode string")

      pieces = []
      for char in str:
          if not ishangul(char):
              pieces.append(char)
              continue

          for jamo in split(char):
              expansion = None
              if intoelements:
                  # Consonant table first, then the vowel table.
                  for table in (Jaeum.MultiElement, Moeum.MultiElement):
                      if jamo in table:
                          expansion = table[jamo]
                          break
              if expansion is None:
                  pieces.append(jamo)
              else:
                  pieces.extend(expansion)

      return u''.join(pieces)

  def _has_final(c):
      # for internal use only
      if u'\uac00' <= c <= u'\ud7a3': # hangul
          return 1, (ord(c) - 0xac00) % 28 > 0
      else:
          return 0, c in u'013678.bklmnptMN'

  def format(fmtstr, args):
      """ Hangul adaptive formatter

      Works like ``fmtstr % args`` but adapts the text right after each
      conversion to the value that was substituted:

      - a particle listed in ALT_SUFFIXES that directly follows a conversion
        is replaced by the variant matching whether the substituted text
        ends with a final consonant;
      - a three-character marker listed in IDA_SUFFIXES that directly
        follows a conversion is expanded, merged into the preceding
        syllable, or dropped, depending on the last substituted character.

      fmtstr is a unicode format string.  A backslash copies the next
      character literally, and "%%" produces a single "%".

      args is either a dict, which is handed to every "%(key)s" style
      conversion, or an iterable from which one item is taken per
      conversion (StopIteration propagates when it runs out).

      Returns the formatted unicode string.
      """
      if not isinstance(args, dict):
          argget = iter(args).next
      else:
          argget = lambda:args

      obuff = []                      # output pieces, joined at the end
      ncur = escape = fmtinpth = 0    # cursor, after-backslash, inside %(...)
      ofmt = fmt = u''                # last substituted text, pending spec

      while ncur < len(fmtstr):
          c = fmtstr[ncur]

          if escape:
              obuff.append(c)
              escape = 0
              ofmt   = u''
          elif c == u'\\':
              escape = 1
          elif fmt:
              # Inside a conversion specification: collect characters until
              # the conversion type, the first letter outside "(...)".
              fmt += c
              if not fmtinpth and c.isalpha():
                  ofmt = fmt % argget()
                  obuff.append(ofmt)
                  fmt = u''
              elif fmtinpth and c == u')':
                  fmtinpth = 0
              elif c == u'(':
                  fmtinpth = 1
              elif c == u'%' and not fmtinpth:
                  # "%%": emit one literal percent sign and leave the
                  # specification.  Without the reset the text that follows
                  # would be swallowed into a bogus specification.  A "%"
                  # inside a "%(key)" name belongs to the key and emits
                  # nothing.
                  obuff.append(u'%')
                  fmt = u''
          elif c == u'%':
              fmt  += c
              ofmt = u''
          else:
              # Plain text.  ofmt is only non-empty for the character that
              # immediately follows a conversion.
              if ofmt and c in ALT_SUFFIXES:
                  nofinal, withfinal = ALT_SUFFIXES[c]
                  if _has_final(ofmt[-1])[1]:
                      obuff.append(withfinal)
                  else:
                      obuff.append(nofinal)
              elif ofmt and fmtstr[ncur:ncur+3] in IDA_SUFFIXES:
                  sel = IDA_SUFFIXES[fmtstr[ncur:ncur+3]]
                  ishan, hasfinal = _has_final(ofmt[-1])

                  if hasfinal:
                      obuff.append(sel[1])
                  elif ishan:
                      # Open syllable: fold the final consonant into it by
                      # adding the offset to its code point, or emit
                      # nothing when there is no offset.
                      if sel[0]:
                          obuff[-1] = obuff[-1][:-1] + unichr(ord(ofmt[-1]) + sel[0])
                  else:
                      # Not hangul: the syllable cannot be merged, so it is
                      # written out, unless the marker has no offset.
                      obuff.append(sel[0] and sel[1])
                  ncur += 2   # skip the rest of the 3-character marker
              else:
                  obuff.append(c)

              ofmt = u''

          ncur += 1

      return u''.join(obuff)

  if __name__ == '__main__':

      print ( join([Jaeum.P, Moeum.EO, Null]) + \
              join([Jaeum.K, Moeum.I, Null]) + \
              join([Jaeum.JJ, Moeum.A, Jaeum.NG]) ).encode("utf-8")

      while 1:
          code = raw_input(">>> ")
          print dividestring(unicode(code, "utf-8"), 1).encode("utf-8")