From: Adam T. <aat...@ou...> - 2022-11-16 20:39:40
|
Since Python 3.3 (PEP 393), all Python builds have support for the full range of Unicode, and hence there are no longer "wide" and "narrow" builds. As such, I propose the patch below to remove the "wide"-build handling from ``generate_punctuation_chars.py``. Note that this is API-preserving, as the resulting "delimiters" string remains the same (the difference is that it is now a constant rather than the concatenation of two strings). ----- Index: docutils/tools/dev/generate_punctuation_chars.py =================================================================== --- docutils/tools/dev/generate_punctuation_chars.py (revision 9254) +++ docutils/tools/dev/generate_punctuation_chars.py (working copy) @@ -55,8 +55,6 @@ # ``docutils/tools/dev/generate_punctuation_chars.py``. # :: -import sys - """Docutils character category patterns. Patterns for the implementation of the `inline markup recognition rules`_ @@ -86,8 +84,6 @@ %(openers)s %(closers)s %(delimiters)s -if sys.maxunicode >= 0x10FFFF: # "wide" build -%(delimiters_wide)s closing_delimiters = r'\\.,;!?' 
@@ -245,14 +241,6 @@ closing_delimiters)] -def separate_wide_chars(s): - """Return (s1,s2) with characters above 0xFFFF in s2""" - maxunicode_narrow = 0xFFFF - l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow] - l2 = [ch for ch in s if ord(ch) > maxunicode_narrow] - return ''.join(l1), ''.join(l2) - - def mark_intervals(s): """Return s with shortcut notation for runs of consecutive characters @@ -281,6 +269,7 @@ def wrap_string(s, startstring="(", endstring=" )", wrap=71): """Line-wrap a unicode string literal definition.""" + s = s.encode('unicode-escape').decode() c = len(startstring) left_indent = ' '*(c - len(startstring.lstrip(' '))) line_start_string = f"\n {left_indent}'" @@ -332,18 +321,11 @@ (o, c, d, cd) = character_category_patterns() -# Characters in the upper plane require a "wide" build:: - - o, o_wide = separate_wide_chars(o) - c, c_wide = separate_wide_chars(c) - d, d_wide = separate_wide_chars(d) - # delimiters: sort and use shortcut for intervals (saves ~150 characters) # (`openers` and `closers` must be verbose and keep order # because they are also used in `match_chars()`):: d = d[:5] + mark_intervals(d[5:]) - d_wide = mark_intervals(d_wide) # Test: compare module content with re-generated definitions @@ -364,13 +346,8 @@ % unicodedata.unidata_version) print_differences(openers, o, 'openers') - if o_wide: - print('+ openers-wide = r"""%s"""' % o_wide.encode('utf-8')) print_differences(closers, c, 'closers') - if c_wide: - print('+ closers-wide = r"""%s"""' % c_wide.encode('utf-8')) - - print_differences(delimiters, d + d_wide, 'delimiters') + print_differences(delimiters, d, 'delimiters') print_differences(closing_delimiters, cd, 'closing_delimiters') sys.exit() @@ -386,15 +363,9 @@ substitutions = { 'python_version': sys.version.split()[0], 'unidata_version': unicodedata.unidata_version, - 'openers': wrap_string(o.encode('unicode-escape').decode(), - startstring="openers = ("), - 'closers': wrap_string(c.encode('unicode-escape').decode(), - 
startstring="closers = ("), - 'delimiters': wrap_string(d.encode('unicode-escape').decode(), - startstring="delimiters = ("), - 'delimiters_wide': wrap_string( - d_wide.encode('unicode-escape').decode(), - startstring=" delimiters += (") + 'openers': wrap_string(o, startstring="openers = ("), + 'closers': wrap_string(c, startstring="closers = ("), + 'delimiters': wrap_string(d, startstring="delimiters = ("), } print(module_template % substitutions, end='') ----- Thanks, Adam |