Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo

Close

[1c7f3e]: scripts / GenerateCaseConvert.py Maximize Restore History

Download this file

GenerateCaseConvert.py    123 lines (95 with data), 4.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Script to generate CaseConvert.cxx from Python's Unicode data
# Should be run rarely when a Python with a new version of Unicode data is available.
# Should not be run with old versions of Python.
# Current best approach divides case conversions into two cases:
# simple symmetric and complex.
# Simple symmetric is where a lower and upper case pair convert to each
# other and the folded form is the same as the lower case.
# There are 1006 symmetric pairs.
# These are further divided into ranges (stored as lower, upper, range length,
# range pitch and singletons (stored as lower, upper).
# Complex is for cases that don't fit the above: where there are multiple
# characters in one of the forms or fold is different to lower or
# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8
# strings with original, folded, upper, and lower separated by '|'.
# There are 126 complex cases.
import codecs, itertools, os, string, sys, unicodedata
from FileGenerator import Regenerate
def contiguousRanges(l, diff):
# l is s list of lists
# group into lists where first element of each element differs by diff
out = [[l[0]]]
for s in l[1:]:
if s[0] != out[-1][-1][0] + diff:
out.append([])
out[-1].append(s)
return out
def flatten(listOfLists):
"Flatten one level of nesting"
return itertools.chain.from_iterable(listOfLists)
def conversionSets():
# For all Unicode characters, see whether they have case conversions
# Return 2 sets: one of simple symmetric conversion cases and another
# with complex cases.
complexes = []
symmetrics = []
for ch in range(sys.maxunicode):
if ch >= 0xd800 and ch <= 0xDBFF:
continue
if ch >= 0xdc00 and ch <= 0xDFFF:
continue
uch = chr(ch)
fold = uch.casefold()
upper = uch.upper()
lower = uch.lower()
symmetric = False
if uch != upper and len(upper) == 1 and uch == lower and uch == fold:
lowerUpper = upper.lower()
foldUpper = upper.casefold()
if lowerUpper == foldUpper and lowerUpper == uch:
symmetric = True
symmetrics.append((ch, ord(upper), ch - ord(upper)))
if uch != lower and len(lower) == 1 and uch == upper and lower == fold:
upperLower = lower.upper()
if upperLower == uch:
symmetric = True
if fold == uch:
fold = ""
if upper == uch:
upper = ""
if lower == uch:
lower = ""
if (fold or upper or lower) and not symmetric:
complexes.append((uch, fold, upper, lower))
return symmetrics, complexes
def groupRanges(symmetrics):
# Group the symmetrics into groups where possible, returning a list
# of ranges and a list of symmetrics that didn't fit into a range
def distance(s):
return s[2]
groups = []
uniquekeys = []
for k, g in itertools.groupby(symmetrics, distance):
groups.append(list(g)) # Store group iterator as a list
uniquekeys.append(k)
contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups])
longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4]
oneDiffs = [s for s in symmetrics if s[2] == 1]
contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]])
longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4]
rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0])
rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups]))
nonRanges = [(l, u) for l, u, d in symmetrics if l not in rangeCoverage]
return rangeGroups, nonRanges
def updateCaseConvert():
symmetrics, complexes = conversionSets()
rangeGroups, nonRanges = groupRanges(symmetrics)
print(len(rangeGroups), "ranges")
rangeLines = ["%d,%d,%d,%d, " % x for x in rangeGroups]
print(len(nonRanges), "non ranges")
nonRangeLines = ["%d,%d, " % x for x in nonRanges]
print(len(symmetrics), "symmetric")
complexLines = ['"%s|%s|%s|%s|"' % x for x in complexes]
print(len(complexLines), "complex")
Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines)
updateCaseConvert()