Thread: [Docutils-develop] Remove "wide" Unicode distinction in "generate_punctuation_chars"

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Since Python 3.3 (PEP 393), all Python builds have support for the full range of Unicode, and hence there are no longer "wide" and "narrow" builds.

As such, I propose the patch below to remove the special handling of "wide" builds from ``generate_punctuation_chars.py``. Note that this is API-preserving, as the resulting "delimiters" string remains the same (the only difference is that it is now defined as a single constant rather than as the concatenation of two strings).

-----

Index: docutils/tools/dev/generate_punctuation_chars.py
===================================================================

--- docutils/tools/dev/generate_punctuation_chars.py	(revision 9254)
+++ docutils/tools/dev/generate_punctuation_chars.py	(working copy)
@@ -55,8 +55,6 @@
 # ``docutils/tools/dev/generate_punctuation_chars.py``.
 # ::
 
-import sys
-
 """Docutils character category patterns.
 
    Patterns for the implementation of the `inline markup recognition rules`_
@@ -86,8 +84,6 @@
 %(openers)s
 %(closers)s
 %(delimiters)s
-if sys.maxunicode >= 0x10FFFF:  # "wide" build
-%(delimiters_wide)s
 closing_delimiters = r'\\.,;!?'
 
 
@@ -245,14 +241,6 @@
                                          closing_delimiters)]
 
 
-def separate_wide_chars(s):
-    """Return (s1,s2) with characters above 0xFFFF in s2"""
-    maxunicode_narrow = 0xFFFF
-    l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow]
-    l2 = [ch for ch in s if ord(ch) > maxunicode_narrow]
-    return ''.join(l1), ''.join(l2)
-
-
 def mark_intervals(s):
     """Return s with shortcut notation for runs of consecutive characters
 
@@ -281,6 +269,7 @@
 
 def wrap_string(s, startstring="(", endstring="    )", wrap=71):
     """Line-wrap a unicode string literal definition."""
+    s = s.encode('unicode-escape').decode()
     c = len(startstring)
     left_indent = ' '*(c - len(startstring.lstrip(' ')))
     line_start_string = f"\n    {left_indent}'"
@@ -332,18 +321,11 @@
 
     (o, c, d, cd) = character_category_patterns()
 
-# Characters in the upper plane require a "wide" build::
-
-    o, o_wide = separate_wide_chars(o)
-    c, c_wide = separate_wide_chars(c)
-    d, d_wide = separate_wide_chars(d)
-
 # delimiters: sort and use shortcut for intervals (saves ~150 characters)
 # (`openers` and `closers` must be verbose and keep order
 # because they are also used in `match_chars()`)::
 
     d = d[:5] + mark_intervals(d[5:])
-    d_wide = mark_intervals(d_wide)
 
 
 # Test: compare module content with re-generated definitions
@@ -364,13 +346,8 @@
               % unicodedata.unidata_version)
 
         print_differences(openers, o, 'openers')
-        if o_wide:
-            print('+ openers-wide = r"""%s"""' % o_wide.encode('utf-8'))
         print_differences(closers, c, 'closers')
-        if c_wide:
-            print('+ closers-wide = r"""%s"""' % c_wide.encode('utf-8'))
-
-        print_differences(delimiters, d + d_wide, 'delimiters')
+        print_differences(delimiters, d, 'delimiters')
         print_differences(closing_delimiters, cd, 'closing_delimiters')
 
         sys.exit()
@@ -386,15 +363,9 @@
     substitutions = {
         'python_version': sys.version.split()[0],
         'unidata_version': unicodedata.unidata_version,
-        'openers': wrap_string(o.encode('unicode-escape').decode(),
-                               startstring="openers = ("),
-        'closers': wrap_string(c.encode('unicode-escape').decode(),
-                               startstring="closers = ("),
-        'delimiters': wrap_string(d.encode('unicode-escape').decode(),
-                                  startstring="delimiters = ("),
-        'delimiters_wide': wrap_string(
-                            d_wide.encode('unicode-escape').decode(),
-                            startstring="    delimiters += (")
+        'openers': wrap_string(o, startstring="openers = ("),
+        'closers': wrap_string(c, startstring="closers = ("),
+        'delimiters': wrap_string(d, startstring="delimiters = ("),
         }
 
     print(module_template % substitutions, end='')


-----

Thanks,
Adam



Thread: [Docutils-develop] Remove "wide" Unicode distinction in "generate_punctuation_chars"

docutils-develop