|
From: T.Meyarivan <mey...@us...> - 2003-04-26 07:53:11
|
Update of /cvsroot/indlinux/scripts
In directory sc8-pr-cvs1:/tmp/cvs-serv1172
Modified Files:
ChangeLog SCRIPTS.list
Added Files:
iscii2utf8.py
Log Message:
Initial checkin of iscii2utf8.py - converter for iscii -> utf8
--- NEW FILE: iscii2utf8.py ---
# public domain script written by
# mary <ma...@sa...> aka meyarivan <se...@me...>
# inspired by ICU
# code still in alpha stage.. lots of redundant code.. and probably incorrect
# if ya find errors, pls submit bug reports at indlinux
# for usage, either run the script or scroll down to end of the script
import sys
# Generic Constants

# ISCII control bytes (values as they appear in the input stream)
ISCII_BEGIN = 0x00A0
ISCII_ATR = 0x00EF        # the byte after ATR selects the script
ISCII_EXT = 0x00F0        # the byte after EXT is an extended code
ISCII_INV = 0x00D9
ISCII_HALANT = 0x00E8
ISCII_NUKTA = 0x00E9
ISCII_DANDA = 0x00EA
ATR_MASK = 0x004F
EXT_RANGE_BEGIN = 0x00A1  # inclusive bounds accepted after ISCII_EXT
EXT_RANGE_END = 0x00EE

# Unicode code points produced by the converter
DANDA = 0x0964
DOUBLE_DANDA = 0x0965
HALANT = 0x094d
NUKTA = 0x093c
DEV_ANUDATTA = 0x0952
# DEVANAGARI ABBREVIATION SIGN; Parser.handle_ext refers to this name, which
# previously had no definition anywhere in the module.
DEV_ABBR_SIGN = 0x0970
ZWJ = 0x200d
ZWNJ = 0x200c
LF = 0x000A

# Unicode block layout: the mapping table targets UNI_BEGIN..UNI_END and
# other scripts are reached by adding a multiple of DELTA.
UNI_BEGIN = 0x0900
UNI_END = 0x097F
DELTA = 0x0080
INDIC_BLOCK_BEGIN = 0x0900
INDIC_BLOCK_END = 0x0D7F

# Sentinels
INVALID_CHAR = 0xFFFF     # table entry for ISCII bytes with no mapping
NO_CHAR = 0xFFFE          # "nothing pending" marker used by the parser
# Map from the script code that follows an ISCII ATR byte to the index of
# the matching Unicode script block; the parser multiplies the index by
# DELTA to get that block's offset.  -1 stands for the default script.
# Bengali and Assamese have the same script except for two characters
# (according to ISCII-91), so both share one index.
ISCII_SCRIPTS = {
    0x40: -1,  # DEFAULT
    0x42: 0,   # DEVNAG
    0x43: 1,   # BENGALI
    0x44: 5,   # TAMIL
    0x45: 6,   # TELUGU
    0x46: 1,   # ASSAMESE
    0x47: 4,   # ORIYA
    0x48: 7,   # KANNADA
    0x49: 8,   # MALAYALAM
    0x4A: 3,   # GUJARATI
    0x4B: 2,   # PUNJABI
}
# invalid iscii values (0xEB-0xEE and 0xFB-0xFF) -> IGNORE
# list() keeps the concatenation working where range() is not a list.
INVALID_ISCII = list(range(0xEB, 0xEF)) + list(range(0xFB, 0x100))
# iscii to unicode map: a 256-entry tuple indexed by ISCII byte value.
# Bytes 0x00-0x9f map to themselves; 0xa0-0xff are listed explicitly below,
# eight per row.  0xFFFF marks bytes that have no direct mapping.
iscii_to_unicode = tuple(range(0xA0)) + (
    # 0xa0 - 0xa7
    0x0900, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908,
    # 0xa8 - 0xaf
    0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912,
    # 0xb0 - 0xb7
    0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919,
    # 0xb8 - 0xbf
    0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921,
    # 0xc0 - 0xc7
    0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929,
    # 0xc8 - 0xcf
    0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930,
    # 0xd0 - 0xd7
    0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938,
    # 0xd8 - 0xdf
    0x0939, 0x200D, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943,
    # 0xe0 - 0xe7
    0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949,
    # 0xe8 - 0xef
    0x094d, 0x093c, 0x0964, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    # 0xf0 - 0xf7
    0xFFFF, 0x0966, 0x0967, 0x0968, 0x0969, 0x096a, 0x096b, 0x096c,
    # 0xf8 - 0xff
    0x096d, 0x096e, 0x096f, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
)
"""
# code to generate the validation_table
( ya ya .. agreed that it is kludgy)
# need python 2.3
import unicodedata
UNI_BEGIN = 0x0900
UNI_END = 0x097F
DELTA = 0x80
SCRS = 9
table = []
for char in range(UNI_END - UNI_BEGIN + 1):
res = [0] * SCRS
for scr in range(0, SCRS):
val = unichr(UNI_BEGIN + (scr * DELTA) + char)
res[scr] = int(unicodedata.name(val, None) is not None)
table.append(res)
"""
# Per-script validity flags.  Following the generator snippet kept in the
# string literal just above: there is one row per offset (0x00-0x7f) inside
# a 0x80-wide script block and one column per script index 0-8, the block
# for index n starting at 0x0900 + n * 0x80.  An entry is 1 when
# unicodedata reported a name for that code point at generation time, else 0.
# The trailing comment on each row gives the offset in hex and decimal.
validation_table = [
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x0 0
[1, 1, 0, 1, 1, 0, 1, 0, 0], # 0x1 1
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2 2
[1, 1, 0, 1, 1, 1, 1, 1, 1], # 0x3 3
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4 4
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x5 5
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6 6
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x7 7
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x8 8
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x9 9
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0xa 10
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0xb 11
[1, 1, 0, 0, 1, 0, 1, 1, 1], # 0xc 12
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0xd 13
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0xe 14
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0xf 15
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x10 16
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x11 17
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x12 18
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x13 19
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x14 20
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x15 21
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x16 22
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x17 23
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x18 24
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x19 25
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1a 26
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x1b 27
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1c 28
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x1d 29
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1e 30
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1f 31
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x20 32
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x21 33
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x22 34
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x23 35
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x24 36
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x25 37
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x26 38
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x27 39
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x28 40
[1, 0, 0, 0, 0, 1, 0, 0, 0], # 0x29 41
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2a 42
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2b 43
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2c 44
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2d 45
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2e 46
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2f 47
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x30 48
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x31 49
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x32 50
[1, 0, 1, 1, 1, 1, 1, 1, 1], # 0x33 51
[1, 0, 0, 0, 0, 1, 0, 0, 1], # 0x34 52
[1, 0, 1, 1, 0, 1, 1, 1, 1], # 0x35 53
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x36 54
[1, 1, 0, 1, 1, 1, 1, 1, 1], # 0x37 55
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x38 56
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x39 57
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x3a 58
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x3b 59
[1, 1, 1, 1, 1, 0, 0, 0, 0], # 0x3c 60
[1, 0, 0, 1, 1, 0, 0, 0, 0], # 0x3d 61
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x3e 62
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x3f 63
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x40 64
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x41 65
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x42 66
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0x43 67
[1, 1, 0, 1, 0, 0, 1, 1, 0], # 0x44 68
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x45 69
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x46 70
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x47 71
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x48 72
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x49 73
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x4a 74
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4b 75
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4c 76
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4d 77
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4e 78
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4f 79
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x50 80
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x51 81
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x52 82
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x53 83
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x54 84
[0, 0, 0, 0, 0, 0, 1, 1, 0], # 0x55 85
[0, 0, 0, 0, 1, 0, 1, 1, 0], # 0x56 86
[0, 1, 0, 0, 1, 1, 0, 0, 1], # 0x57 87
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x58 88
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x59 89
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x5a 90
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x5b 91
[1, 1, 1, 0, 1, 0, 0, 0, 0], # 0x5c 92
[1, 1, 0, 0, 1, 0, 0, 0, 0], # 0x5d 93
[1, 0, 1, 0, 0, 0, 0, 1, 0], # 0x5e 94
[1, 1, 0, 0, 1, 0, 0, 0, 0], # 0x5f 95
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0x60 96
[1, 1, 0, 0, 1, 0, 1, 1, 1], # 0x61 97
[1, 1, 0, 0, 0, 0, 0, 0, 0], # 0x62 98
[1, 1, 0, 0, 0, 0, 0, 0, 0], # 0x63 99
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x64 100
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x65 101
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x66 102
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x67 103
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x68 104
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x69 105
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6a 106
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6b 107
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6c 108
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6d 109
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6e 110
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6f 111
[1, 1, 1, 0, 1, 1, 0, 0, 0], # 0x70 112
[0, 1, 1, 0, 0, 1, 0, 0, 0], # 0x71 113
[0, 1, 1, 0, 0, 1, 0, 0, 0], # 0x72 114
[0, 1, 1, 0, 0, 0, 0, 0, 0], # 0x73 115
[0, 1, 1, 0, 0, 0, 0, 0, 0], # 0x74 116
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x75 117
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x76 118
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x77 119
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x78 120
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x79 121
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x7a 122
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7b 123
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7c 124
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7d 125
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7e 126
[0, 0, 0, 0, 0, 0, 0, 0, 0] # 0x7f 127
]
# Characters formed by an ISCII byte followed by the ISCII nukta byte.
# Key: the ISCII byte that precedes the nukta; value: the single Unicode
# code point that replaces the pair.  Entries are listed in key order.
nukta_specials = {
    0xA1: 0x0950,
    0xA6: 0x090C,
    0xA7: 0x0961,
    0xAA: 0x0960,
    0xB3: 0x0958,
    0xB4: 0x0959,
    0xB5: 0x095A,
    0xBA: 0x095B,
    0xBF: 0x095C,
    0xC0: 0x095D,
    0xC9: 0x095E,
    0xDB: 0x0962,
    0xDC: 0x0963,
    0xDF: 0x0944,
    0xEA: 0x093D,
}
# Script-specific overrides, keyed by (script index, ISCII byte) and giving
# an absolute Unicode code point: the two points which are different between
# assamese and bengali (according to the charts in ISCII-91 documentation).
# NOTE(review): ISCII_SCRIPTS assigns index 5 to TAMIL and index 1 to both
# BENGALI and ASSAMESE, so these keys look mis-indexed -- confirm which
# script they are meant to select before relying on them.
special_maps = {
    (5, 0xCF): 0x09F0,
    (5, 0xD4): 0x09F1,
}
def to_utf8(y):
    """
    converts an array of integers (Unicode code points) to a utf8 string

    y -- iterable of non-negative integer code points

    Returns a string holding one character per encoded byte, built with
    chr(); on Python 2 that is an ordinary byte string.  An empty input
    gives an empty string.
    """
    out = []
    for x in y:
        if x < 0x80:
            # 1 byte: 0xxxxxxx
            out.append(x)
        elif x < 0x800:
            # 2 bytes: 110xxxxx 10xxxxxx
            out.append((x >> 6) | 0xC0)
            out.append((x & 0x3F) | 0x80)
        elif x < 0x10000:
            # 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            out.append((x >> 12) | 0xE0)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
        else:
            # 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            out.append((x >> 18) | 0xF0)
            # every continuation byte needs the 0x80 marker; the second
            # byte used to be emitted without it
            out.append(((x >> 12) & 0x3F) | 0x80)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
    return ''.join(map(chr, out))
class IllegalInput(Exception):
    """
    Error raised for input the converter refuses; the value given to the
    constructor is kept on the `exception` attribute.
    """

    def __init__(self, e):
        self.exception = e

    def __str__(self):
        # same text as repr() of the stored value
        return '%r' % (self.exception,)
class Parser:
    """
    Incremental converter from ISCII input to Unicode code points.

    Input is fed through iscii2utf8(); converted code points collect in
    self.dest until write_output() encodes them with to_utf8() and writes
    them to stdout.  The active script is chosen with set_script() or by
    an ATR sequence in the input, and is applied by adding self.delta to
    code points that come from the Devanagari-based mapping table.
    """

    # DEVANAGARI ABBREVIATION SIGN, the target of EXT + 0xBF.  Kept on the
    # class so handle_ext does not rely on a module-level DEV_ABBR_SIGN.
    _DEV_ABBR_SIGN = 0x0970

    def __init__(self):
        self.delta = 0
        # script index (validation_table column); 0 pairs with delta == 0
        # so isvalid()/get_mapping() work before set_script() is called
        self.curr_script = 0
        self.curr_mask = 0 # current mask to unicode
        self.prev_char = self.src_char = self.dest_char = NO_CHAR
        self.dest = []
        # code points that are never shifted by the script delta
        ign = self.addchar_ign = {}
        ign[ZWJ] = ign[ZWNJ] = ign[DANDA] = ign[DOUBLE_DANDA] = True
        # marker values that must never reach the output
        inv = self.addchar_invalid = {}
        inv[NO_CHAR] = inv[None] = inv[0xFFFF] = False
        self.pos = 0

    def add_char(self, i):
        """
        Append code point i to the pending output and count one consumed
        input item.

        The script delta is applied only to code points inside
        UNI_BEGIN..UNI_END that are not listed in addchar_ign, so ASCII
        and other pass-through values are left alone.

        Raises ValueError when i is one of the marker values (NO_CHAR,
        None, 0xFFFF), which is how unmapped ISCII bytes are rejected.
        """
        if i in self.addchar_invalid:
            if i is None:
                shown = "None"  # hex() cannot format None
            else:
                shown = hex(i)
            raise ValueError("Invalid Input %s" % shown)
        if UNI_BEGIN <= i <= UNI_END and i not in self.addchar_ign:
            j = self.delta + i
        else:
            j = i
        self.pos += 1
        self.dest.append(j)

    def write_output(self):
        """Encode the pending code points as UTF-8, write them to stdout
        and clear the pending list."""
        out = to_utf8(self.dest)
        sys.stdout.write(out)
        self.dest = []

    def set_script(self, i):
        """
        set the value of delta to reflect the current codepage

        i is either an ATR script code (a key of ISCII_SCRIPTS) or a small
        number 0-9, where 1-9 are the command-line script numbers and 0,
        like the DEFAULT ATR code, leaves the current script unchanged.

        Raises IllegalInput for any other value.
        """
        try:
            n = ISCII_SCRIPTS[i]
        except KeyError:
            # range(10) so that 9 (malayalam) is accepted as well
            if i in range(10):
                n = i - 1
            else:
                raise IllegalInput("Invalid Value for ATR %s" % (hex(i)))
        if n > -1: # n = -1 is the default script ..
            self.curr_script = n
            self.delta = n * DELTA
        return

    def isvalid(self, i):
        """Return True when code point i, given relative to the Devanagari
        block, is flagged valid for the current script."""
        # validation_table has one row per offset inside a 0x80-wide block
        return bool(validation_table[i & 0x7F][self.curr_script])

    def isvalid_iscii(self, x):
        """Return True unless x is one of the ignored ISCII byte values."""
        return x not in INVALID_ISCII

    def get_mapping(self, i):
        """
        Map ISCII byte i to a code point.

        Values above 0xFF are taken to be code points already and are
        returned unchanged; bytes are looked up in iscii_to_unicode unless
        special_maps has an override for (current script, byte).
        """
        t = special_maps.get((self.curr_script, i), None)
        if t is not None:
            # special_maps stores absolute code points; only honour an
            # override that lands inside the current script's own block.
            # NOTE(review): the shipped entries are keyed by script index 5
            # (TAMIL in ISCII_SCRIPTS) but target 0x09Fx, so this check
            # currently filters them out -- confirm the intended keys.
            block_start = UNI_BEGIN + self.curr_script * DELTA
            if not (block_start <= t < block_start + DELTA):
                t = None
        if t is not None:
            m = t
        elif (i <= 0xFF):
            m = iscii_to_unicode[i]
        else:
            m = i
        return m

    def is_nukta_special(self, i):
        """Return the code point that replaces byte i followed by a nukta,
        or None when there is no such combination."""
        return nukta_specials.get(i, None)

    def handle_ext(self, curr_char):
        """
        Handle the byte that follows ISCII_EXT.

        Only 0xBF (abbreviation sign) and 0xB8 (anudatta) are supported,
        and only when the result is valid for the current script; anything
        else raises ValueError.
        """
        self.pos += 1 # for EXT
        if EXT_RANGE_BEGIN <= curr_char <= EXT_RANGE_END:
            if curr_char == 0xBF:
                dest_char = self._DEV_ABBR_SIGN
            elif curr_char == 0xB8:
                dest_char = DEV_ANUDATTA
            else:
                dest_char = None
            if dest_char is not None and self.isvalid(dest_char):
                self.add_char(dest_char)
                return
        raise ValueError("Invalid Input after EXT %s" % (hex(curr_char)))

    def handle_atr(self, i):
        """Handle the script code that follows ISCII_ATR; unknown codes
        are skipped without changing the script."""
        if i in ISCII_SCRIPTS:
            # set_script() does the ISCII_SCRIPTS lookup itself, so it must
            # be given the raw ATR code rather than the mapped index
            self.set_script(i)
            sys.stderr.write("setting script to %s\n" % (i,))
        self.pos += 2 # for ATR and the following char
        return

    def handle_inv(self, i):
        """Handle the byte that follows ISCII_INV: a halant yields a
        space, anything else yields ZWJ."""
        # NOTE(review): when i is not a halant it is not emitted itself --
        # confirm that dropping it is intended.
        if i == ISCII_HALANT:
            ret = 0x0020
        else:
            ret = ZWJ
        self.add_char(ret)
        self.pos += 1 # for INV

    def post_analysis(self, prev_char, src_char):
        """Dispatch src_char to the handler for the control byte
        prev_char (ATR, EXT or INV)."""
        if prev_char == ISCII_ATR:
            self.handle_atr(src_char)
        elif prev_char == ISCII_EXT:
            self.handle_ext(src_char)
        elif prev_char == ISCII_INV:
            self.handle_inv(src_char)
        return

    def iscii2utf8(self, src, flush = 0):
        """
        Convert a chunk of ISCII input, appending code points to self.dest.

        src   -- string of ISCII characters (integer byte values are
                 accepted as well)
        flush -- true for the final chunk: the item still held back for
                 look-ahead is emitted instead of being left pending

        Returns the number of input items consumed.  Without flush the
        last item is normally held back because its meaning can depend on
        the next one; callers feed src[returned:] again in front of the
        next chunk.

        Raises ValueError (via add_char / handle_ext) on unmappable input.
        """
        codes = []
        for ch in src:
            if not isinstance(ch, int):
                ch = ord(ch)
            codes.append(ch)
        prev_char = NO_CHAR
        self.pos = 0
        for curr_char in codes:
            dest_char = NO_CHAR
            if curr_char in INVALID_ISCII:
                # just ignore the invalid iscii characters
                # NOTE(review): pos is a running count, so if a chunk ends
                # right after ignored bytes while a char is still pending,
                # the returned offset no longer points at that pending
                # char -- worth confirming and tracking by index instead.
                sys.stderr.write('ignoring invalid iscii char %s\n'
                                 % hex(curr_char))
                self.pos += 1
                continue
            if prev_char == NO_CHAR:
                # nothing pending: hold this one back for look-ahead
                prev_char = curr_char
                continue
            if prev_char in (ISCII_ATR, ISCII_EXT, ISCII_INV):
                # control byte + argument are consumed as a pair
                self.post_analysis(prev_char, curr_char)
                prev_char = NO_CHAR
                continue
            if curr_char in (ISCII_INV, ISCII_EXT, ISCII_ATR):
                self.add_char(self.get_mapping(prev_char))
                prev_char = curr_char
                continue
            if curr_char == ISCII_DANDA:
                if prev_char == ISCII_DANDA:
                    # danda danda -> double danda
                    dest_char = DOUBLE_DANDA
                    prev_char = NO_CHAR
                    self.pos += 1
            elif curr_char == ISCII_HALANT:
                if prev_char == ISCII_HALANT:
                    # halant halant -> halant + ZWNJ; the halant must go
                    # through the mapping table like every other byte
                    self.add_char(self.get_mapping(ISCII_HALANT))
                    dest_char = ZWNJ
                    prev_char = NO_CHAR
            elif curr_char == ISCII_NUKTA:
                if prev_char == ISCII_HALANT:
                    # halant nukta -> halant + ZWJ
                    self.add_char(self.get_mapping(ISCII_HALANT))
                    dest_char = ZWJ
                    prev_char = NO_CHAR
                else:
                    tmp = self.is_nukta_special(prev_char)
                    if tmp: # nukta special
                        dest_char = tmp
                        prev_char = NO_CHAR
                        self.pos += 1
            if dest_char != NO_CHAR:
                self.add_char(self.get_mapping(dest_char))
            else:
                self.add_char(self.get_mapping(prev_char))
                prev_char = curr_char
        if flush and prev_char != NO_CHAR:
            # end of input: nothing more can combine with the pending item
            self.add_char(self.get_mapping(prev_char))
        return self.pos
def show_usage(name):
    """
    Print the usage text for program `name` on stderr and exit with
    status 1 (raises SystemExit).
    """
    usage = """
Usage:
%s script
where script is a number between 1-9
1 - devnag
2 - bengali / assamese
3 - punjabi
4 - gujarati
5 - oriya
6 - tamil
7 - telugu
8 - kannada
9 - malayalam
the program reads from stdin and writes to stdout
any msgs to the user (error msgs etc) are printed on stderr
""" % (name,)
    # write() plus an explicit newline gives the same output as the old
    # print-to-stderr statement without depending on that syntax
    sys.stderr.write(usage + "\n")
    sys.exit(1)
# Script entry point: pick the script from argv[1], then convert stdin to
# stdout in 4096-character chunks.
if __name__ == '__main__':
    try:
        i = int(sys.argv[1])
        if not (1 <= i <= 9):
            raise ValueError
    except (ValueError, IndexError):
        show_usage(sys.argv[0])
    mypar = Parser()
    mypar.set_script(i)
    carry = ''     # input the parser has not consumed yet
    at_eof = 0
    while not at_eof:
        chunk = sys.stdin.read(4096)
        if not chunk:
            # empty read: run one last pass so the parser flushes
            at_eof = 1
        chunk = carry + chunk
        used = mypar.iscii2utf8(chunk, at_eof)
        carry = chunk[used:]
        mypar.write_output()
Index: ChangeLog
===================================================================
RCS file: /cvsroot/indlinux/scripts/ChangeLog,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** ChangeLog 12 Apr 2003 13:15:49 -0000 1.1
--- ChangeLog 26 Apr 2003 07:53:07 -0000 1.2
***************
*** 1,4 ****
! 2003-04-12 Guntupalli Karunakar <kar...@fr...>
! * Initial checkin of scripts module
--- 1,7 ----
! 2003-04-26 mary <ma...@sa...>
! * Initial checkin of iscii2utf8.py
+ 2003-04-12 Guntupalli Karunakar <kar...@fr...>
+
+ * Initial checkin of scripts module
\ No newline at end of file
Index: SCRIPTS.list
===================================================================
RCS file: /cvsroot/indlinux/scripts/SCRIPTS.list,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** SCRIPTS.list 12 Apr 2003 13:15:51 -0000 1.2
--- SCRIPTS.list 26 Apr 2003 07:53:07 -0000 1.3
***************
*** 1,3 ****
! SCRIPT NAME: ISCII to UTF-8 convertor
FILENAME: iscii2utf8.pl-new
PURPOSE: A perl script to convert ISCII to UTF-8 for Devanagari.
--- 1,3 ----
! SCRIPT NAME: ISCII to UTF-8 converter
FILENAME: iscii2utf8.pl-new
PURPOSE: A perl script to convert ISCII to UTF-8 for Devanagari.
***************
*** 8,12 ****
LICENSE: GNU GPL
! SCRIPT NAME: UTF-8 to ISCII convertor
FILENAME: utf82iscii.pl-new
PURPOSE: A perl script to convert UTF-8 to ISCII for Devanagari.
--- 8,12 ----
LICENSE: GNU GPL
! SCRIPT NAME: UTF-8 to ISCII converter
FILENAME: utf82iscii.pl-new
PURPOSE: A perl script to convert UTF-8 to ISCII for Devanagari.
***************
*** 17,21 ****
LICENSE: GNU GPL
! SCRIPT NAME: Itrans to ISCII convertor
FILENAME: itrans-to-iscii.c
PURPOSE: Convert Itrans input to ISCII text
--- 17,21 ----
LICENSE: GNU GPL
! SCRIPT NAME: Itrans to ISCII converter
FILENAME: itrans-to-iscii.c
PURPOSE: Convert Itrans input to ISCII text
***************
*** 24,25 ****
--- 24,34 ----
DATE: 12-04-2003
LICENSE: GNU GPL
+
+ SCRIPT NAME: ISCII to UTF-8 converter
+ FILENAME: iscii2utf8.py
+ PURPOSE: A python script to convert ISCII to UTF-8 for all ISCII scripts.
+ AUTHOR: mary [ meyarivan ]
+ MAINTAINER:
+ EMAIL: ma...@sa... [ se...@me... ]
+ DATE: 26-04-2003
+ LICENSE:
|