Thread: [KoCo-CVS] [Commit] cjkcodecs/tools genmap_schinese.py genmap_tchinese.py genmap_zh_TW_codecs.py
Brought to you by:
perky
From: Hye-Shik C. <pe...@us...> - 2003-05-17 20:33:07
|
perky 03/05/17 13:33:06 Added: tools genmap_schinese.py genmap_tchinese.py Removed: tools genmap_zh_TW_codecs.py Log: Add map generators for chinese. Revision Changes Path 1.1 cjkcodecs/tools/genmap_schinese.py Index: genmap_schinese.py =================================================================== # # genmap_schinese.py: Simplified Chinese Codecs Map Generator # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
#
# $Id: genmap_schinese.py,v 1.1 2003/05/17 20:33:06 perky Exp $
#

# Imported explicitly so the regex below does not depend on `re` leaking
# out of the star import.
import re

from genmap_support import *

# Pairs of byte values for the first (C1) and second (C2) byte of each table;
# they are passed to genmap_decode() when the maps are generated.
# NOTE(review): presumably inclusive (low, high) bounds -- confirm against
# genmap_support.genmap_decode.
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)


def _open_mapping_table(filename, url):
    """Open the mapping table *filename* for reading and return the file.

    If the file cannot be opened, print a hint telling the user to download
    it from *url* and terminate the script by raising SystemExit.
    """
    try:
        return open(filename)
    except IOError:
        print("=>> Please download mapping table from " + url)
        raise SystemExit


gb2312map = _open_mapping_table(
    'GB2312.TXT',
    "http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/GB/GB2312.TXT")
cp936map = _open_mapping_table(
    'CP936.TXT',
    "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT")
gb18030map = _open_mapping_table(
    'gb-18030-2000.xml',
    "http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/"
    "gb-18030-2000.xml")

# Matches one assignment element of the XML table: group 1 is the 4-digit
# hex code point, group 2 the space-separated hex bytes of the native form.
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping table read from the file object *fo*.

    Returns a ``(decmap, unmapped)`` tuple:

    * ``decmap[b1][b2]`` is the code point assigned to the two-byte
      sequence ``b1 b2`` for every two-byte assignment in the table;
    * ``unmapped`` is the sorted list of code points in 0x0000-0xFFFF
      (surrogates 0xD800-0xDFFF excluded) that the table does not assign
      to a one- or two-byte sequence.

    Raises KeyError if the table assigns a short sequence to a surrogate or
    assigns the same code point twice (same as the historical behaviour).
    """
    decmap = {}
    # Start from every BMP code point outside the surrogate area and strike
    # out each one the table covers with a one- or two-byte sequence.
    unmapped = set(range(0xd800))
    unmapped.update(range(0xe000, 0x10000))

    for uni, native in re_gb18030ass.findall(fo.read()):
        # int(..., 16) instead of eval('0x' + ...): same value, but no code
        # evaluation of text taken from an external file.
        uni = int(uni, 16)
        native = [int(byte, 16) for byte in native.split()]
        if len(native) <= 2:
            unmapped.remove(uni)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            decmap.setdefault(native[0], {})[native[1]] = uni

    return decmap, sorted(unmapped)


print("Loading Mapping File...")
gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
datever, gbkdecmap = loadmap(cp936map)
gb2312_datever, gb2312decmap = loadmap(gb2312map)
difmap = {}  # not used anywhere in this script; kept as in the original

# Remove from the GB18030 table every (c1, c2) pair that the CP936 table
# already defines, dropping lead bytes whose sub-table becomes empty.
for c1, m in gbkdecmap.items():
    for c2, code in m.items():
        del gb18030decmap[c1][c2]
        if not gb18030decmap[c1]:
            del gb18030decmap[c1]

# Remove from the CP936 table every entry that equals the GB2312 entry with
# bit 7 set on both bytes, again dropping lead bytes that become empty.
for c1, m in gb2312decmap.items():
    for c2, code in m.items():
        gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
        if gbkdecmap[gbkc1][gbkc2] == code:
            del gbkdecmap[gbkc1][gbkc2]
            if not gbkdecmap[gbkc1]:
                del gbkdecmap[gbkc1]

# Encode maps are indexed [high byte of code point][low byte of code point]
# and hold the two native bytes packed as (c1 << 8) | c2.  The GB2312 pass
# runs after the GBK pass, so it overwrites on a shared code point.
gb2312_gbkencmap, gb18030encmap = {}, {}
for c1, m in gbkdecmap.items():
    for c2, code in m.items():
        # MSB set
        gb2312_gbkencmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2
for c1, m in gb2312decmap.items():
    for c2, code in m.items():
        # MSB unset
        gb2312_gbkencmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2
for c1, m in gb18030decmap.items():
    for c2, code in m.items():
        gb18030encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2

# Each header is written inside a `with` block so the file is flushed and
# closed as soon as it is complete rather than whenever `omap` is rebound.
with open('map_gb2312.h', 'w') as omap:
    printcopyright(omap)

    print("Generating GB2312 decode map...")
    codebunch = []
    genmap_decode(codebunch, "gb2312", GB2312_C1, GB2312_C2, gb2312decmap)
    print_decmap(omap, codebunch, "gb2312", gb2312decmap)

with open('map_gbkext.h', 'w') as omap:
    printcopyright(omap)

    print("Generating GBK decode map...")
    codebunch = []
    genmap_decode(codebunch, "gbkext", GBKL1_C1, GBKL1_C2, gbkdecmap)
    genmap_decode(codebunch, "gbkext", GBKL2_C1, GBKL2_C2, gbkdecmap)
    print_decmap(omap, codebunch, "gbkext", gbkdecmap)

with open('map_gbcommon.h', 'w') as omap:
    printcopyright(omap)

    print("Generating GB2312 && GBK encode map...")
    codebunch = []
    genmap_encode(codebunch, "gbcommon", gb2312_gbkencmap)
    print_encmap(omap, codebunch, "gbcommon", gb2312_gbkencmap)

# Deliberately left open: the statements that follow keep writing the
# GB18030 extension maps to this same file through `omap`.
omap = open('map_gb18030ext.h', 'w')
printcopyright(omap)

print("Generating GB18030 extension decode map...")
codebunch = [] for i in range(1, 6): genmap_decode(codebunch, "gb18030ext", eval("GB18030EXTP%d_C1" % i), eval("GB18030EXTP%d_C2" % i), gb18030decmap) print_decmap(omap, codebunch, "gb18030ext", gb18030decmap) print "Generating GB18030 extension encode map..." codebunch =[] genmap_encode(codebunch, "gb18030ext", gb18030encmap) print_encmap(omap, codebunch, "gb18030ext", gb18030encmap) omap = open('map_gb18030uni.h', 'w') printcopyright(omap) print "Generating GB18030 Unicode BMP Mapping Ranges..." ranges = [[-1, -1, -1]] gblinnum = 0 print >> omap, """ static const struct _gb18030_to_unibmp_ranges { Py_UNICODE first, last; DBCHAR base; } gb18030_to_unibmp_ranges[] = {""" for uni in gb18030unilinear: if uni == ranges[-1][1] + 1: ranges[-1][1] = uni else: ranges.append([uni, uni, gblinnum]) gblinnum += 1 for first, last, base in ranges[1:]: print >> omap, " { 0x%04x, 0x%04x, 0x%04x }," % (first, last, base) print >> omap, """\ { 0x0000, 0x0000, 0x%04x }, };""" % (ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1) print "\nDone!" # ex: ts=8 sts=4 et 1.1 cjkcodecs/tools/genmap_tchinese.py Index: genmap_tchinese.py =================================================================== # # genmap_tchinese.py: Traditional Chinese Codecs Map Generator # # Copyright (C) 2003 Hye-Shik Chang <pe...@Fr...>. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# $Id: genmap_tchinese.py,v 1.1 2003/05/17 20:33:06 perky Exp $
#

from genmap_support import *

# Pairs of byte values for the first (C1) and second (C2) byte of a BIG5
# code; they are passed to genmap_decode() when the maps are generated.
# NOTE(review): presumably inclusive (low, high) bounds -- confirm against
# genmap_support.genmap_decode.
BIG5_C1 = (0xa1, 0xfe)
BIG5_C2 = (0x40, 0xfe)

# big5 map doesn't have 0xA3E1 (EURO SIGN), but we ignore
# that for forward compatibility. "Hey! we have the euro-big5!" :)
CP950_C1 = BIG5_C1
CP950_C2 = BIG5_C2


def _open_mapping_table(filename, url):
    """Open the mapping table *filename* for reading and return the file.

    If the file cannot be opened, print a hint telling the user to download
    it from *url* and terminate the script by raising SystemExit.
    """
    try:
        return open(filename)
    except IOError:
        print("=>> Please download mapping table from " + url)
        raise SystemExit


big5map = _open_mapping_table(
    'BIG5.TXT',
    "http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT")
cp950map = _open_mapping_table(
    'CP950.TXT',
    "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT")

print("Loading Mapping File...")
datever, cp950decmap = loadmap(cp950map) big5_datever, big5decmap = loadmap(big5map) big5encmap, cp950encmap = {}, {} for c1, m in cp950decmap.items(): for c2, code in m.items(): if not (not big5decmap.has_key(c1) or not big5decmap[c1].has_key(c2) or big5decmap[c1][c2] != code): del cp950decmap[c1][c2] else: cp950encmap.setdefault(code >> 8, {}) cp950encmap[code >> 8][code & 0xff] = c1 << 8 | c2 for c1, m in big5decmap.items(): for c2, code in m.items(): big5encmap.setdefault(code >> 8, {}) big5encmap[code >> 8][code & 0xff] = c1 << 8 | c2 omap = open('map_big5.h', 'w') printcopyright(omap) print "Generating BIG5 decode map..." codebunch = [] genmap_decode(codebunch, "big5", BIG5_C1, BIG5_C2, big5decmap) print_decmap(omap, codebunch, "big5", big5decmap) print "Generating BIG5 encode map..." codebunch = [] genmap_encode(codebunch, "big5", big5encmap) print_encmap(omap, codebunch, "big5", big5encmap) omap = open('map_cp950ext.h', 'w') printcopyright(omap) print "Generating CP950 extension decode map..." codebunch = [] genmap_decode(codebunch, "cp950ext", BIG5_C1, BIG5_C2, cp950decmap) print_decmap(omap, codebunch, "cp950ext", cp950decmap) print "Generating CP950 extension encode map..." codebunch = [] genmap_encode(codebunch, "cp950ext", cp950encmap) print_encmap(omap, codebunch, "cp950ext", cp950encmap) print "\nDone!" # ex: ts=8 sts=4 et |