From: Adam T. <aat...@ou...> - 2022-11-16 20:39:40
|
Since Python 3.3 (PEP 393), all Python builds have support for the full range of Unicode, and hence there are no longer "wide" and "narrow" builds. As such, I propose the patch below to remove "wide" from ``generate_punctuation_chars.py``. Note this is API preserving as the resulting "delimiters" string remains the same (the difference is that it is now a constant rather than the concatenation of two strings). ----- Index: docutils/tools/dev/generate_punctuation_chars.py =================================================================== --- docutils/tools/dev/generate_punctuation_chars.py (revision 9254) +++ docutils/tools/dev/generate_punctuation_chars.py (working copy) @@ -55,8 +55,6 @@ # ``docutils/tools/dev/generate_punctuation_chars.py``. # :: -import sys - """Docutils character category patterns. Patterns for the implementation of the `inline markup recognition rules`_ @@ -86,8 +84,6 @@ %(openers)s %(closers)s %(delimiters)s -if sys.maxunicode >= 0x10FFFF: # "wide" build -%(delimiters_wide)s closing_delimiters = r'\\.,;!?' @@ -245,14 +241,6 @@ closing_delimiters)] -def separate_wide_chars(s): - """Return (s1,s2) with characters above 0xFFFF in s2""" - maxunicode_narrow = 0xFFFF - l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow] - l2 = [ch for ch in s if ord(ch) > maxunicode_narrow] - return ''.join(l1), ''.join(l2) - - def mark_intervals(s): """Return s with shortcut notation for runs of consecutive characters @@ -281,6 +269,7 @@ def wrap_string(s, startstring="(", endstring=" )", wrap=71): """Line-wrap a unicode string literal definition.""" + s = s.encode('unicode-escape').decode() c = len(startstring) left_indent = ' '*(c - len(startstring.lstrip(' '))) line_start_string = f"\n {left_indent}'" @@ -332,18 +321,11 @@ (o, c, d, cd) = character_category_patterns() -# Characters in the upper plane require a "wide" build:: - - o, o_wide = separate_wide_chars(o) - c, c_wide = separate_wide_chars(c) - d, d_wide = separate_wide_chars(d) - # delimiters: sort and use shortcut for intervals (saves ~150 characters) # (`openers` and `closers` must be verbose and keep order # because they are also used in `match_chars()`):: d = d[:5] + mark_intervals(d[5:]) - d_wide = mark_intervals(d_wide) # Test: compare module content with re-generated definitions @@ -364,13 +346,8 @@ % unicodedata.unidata_version) print_differences(openers, o, 'openers') - if o_wide: - print('+ openers-wide = r"""%s"""' % o_wide.encode('utf-8')) print_differences(closers, c, 'closers') - if c_wide: - print('+ closers-wide = r"""%s"""' % c_wide.encode('utf-8')) - - print_differences(delimiters, d + d_wide, 'delimiters') + print_differences(delimiters, d, 'delimiters') print_differences(closing_delimiters, cd, 'closing_delimiters') sys.exit() @@ -386,15 +363,9 @@ substitutions = { 'python_version': sys.version.split()[0], 'unidata_version': unicodedata.unidata_version, - 'openers': wrap_string(o.encode('unicode-escape').decode(), - startstring="openers = ("), - 'closers': wrap_string(c.encode('unicode-escape').decode(), - startstring="closers = ("), - 'delimiters': wrap_string(d.encode('unicode-escape').decode(), - startstring="delimiters = ("), - 'delimiters_wide': wrap_string( - d_wide.encode('unicode-escape').decode(), - startstring=" delimiters += (") + 'openers': wrap_string(o, startstring="openers = ("), + 'closers': wrap_string(c, startstring="closers = ("), + 'delimiters': wrap_string(d, startstring="delimiters = ("), } print(module_template % substitutions, end='') ----- Thanks, Adam |
From: Guenter M. <mi...@us...> - 2022-11-17 22:57:57
|
Dear Adam, On 2022-11-16, Adam Turner wrote: > Since Python 3.3 (PEP 393), all Python builds have support for the full > range of Unicode, and hence there are no longer "wide" and "narrow" > builds. > As such, I propose the patch below to remove "wide" from > ``generate_punctuation_chars.py``. Thank you for the patch. > Note this is API preserving as the > resulting "delimiters" string remains the same (the difference is that > it is now a constant rather than the concatenation of two strings). I updated "punctuation_chars.py" accordingly. I also fixed r9253, as DOUBLE LOW-REVERSED-9 QUOTATION MARK is used as quote in Old Hungarian together with ‟. (I found out after studying your reference https://www.unicode.org/L2/L2012/12168r-n4268r-oldhungarian.pdf#page=26 Realizing that Old Hungarian is written right to left (with lines top to bottom), the pairing quotes in the sample text became clear. Once at it, and looking for a solution to the non-matching quote characters, I did a complete revision of "generate_punctuation_chars.py" (see below). Günter ------ docutils/docutils/utils/punctuation_chars.py | 29 ++-- docutils/tools/dev/generate_punctuation_chars.py | 169 ++++++++++++----------- 2 files changed, 97 insertions(+), 101 deletions(-) diff --git a/docutils/docutils/utils/punctuation_chars.py b/docutils/docutils/utils/punctuation_chars.py index 161dbd60f..dfbc3dc3e 100644 --- a/docutils/docutils/utils/punctuation_chars.py +++ b/docutils/docutils/utils/punctuation_chars.py @@ -13,8 +13,6 @@ # ``docutils/tools/dev/generate_punctuation_chars.py``. # :: -import sys - """Docutils character category patterns. Patterns for the implementation of the `inline markup recognition rules`_ @@ -86,22 +84,24 @@ delimiters = ( '\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a' '\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a' '\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65' + '\U00010100\U00010101\U0001039f\U000103d0\U00010857' + '\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f' + '\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-' + '\U000110c1\U00012470-\U00012473' ) -if sys.maxunicode >= 0x10FFFF: # "wide" build - delimiters += ( - '\U00010100\U00010101\U0001039f\U000103d0\U00010857' - '\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f' - '\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-' - '\U000110c1\U00012470-\U00012473' - ) closing_delimiters = r'\\.,;!?' # Matching open/close quotes # -------------------------- +# Matching open/close pairs are at the same position in +# `punctuation_chars.openers` and `punctuation_chars.closers`. +# Additional matches (due to different typographic conventions +# in different languages) are stored in `quote_pairs`. + quote_pairs = { - # open char: matching closing characters # usage example + # open char: matching closing characters # use case '\xbb': '\xbb', # » » Swedish '\u2018': '\u201a', # ‘ ‚ Albanian/Greek/Turkish '\u2019': '\u2019', # ’ ’ Swedish @@ -115,14 +115,7 @@ quote_pairs = { def match_chars(c1, c2): - """Test whether `c1` and `c2` are a matching open/close character pair. - - Matching open/close pairs are at the same position in - `punctuation_chars.openers` and `punctuation_chars.closers`. - The pairing of open/close quotes is ambiguous due to different - typographic conventions in different languages, - so we test for additional matches stored in `quote_pairs`. - """ + """Test whether `c1` and `c2` are a matching open/close character pair.""" try: i = openers.index(c1) except ValueError: # c1 not in openers diff --git a/docutils/tools/dev/generate_punctuation_chars.py b/docutils/tools/dev/generate_punctuation_chars.py index da5fd9e87..13de40f03 100755 --- a/docutils/tools/dev/generate_punctuation_chars.py +++ b/docutils/tools/dev/generate_punctuation_chars.py @@ -55,8 +55,6 @@ module_template = r'''# :Id: $Id$ # ``docutils/tools/dev/generate_punctuation_chars.py``. # :: -import sys - """Docutils character category patterns. Patterns for the implementation of the `inline markup recognition rules`_ @@ -86,16 +84,19 @@ import sys %(openers)s %(closers)s %(delimiters)s -if sys.maxunicode >= 0x10FFFF: # "wide" build -%(delimiters_wide)s closing_delimiters = r'\\.,;!?' # Matching open/close quotes # -------------------------- +# Matching open/close pairs are at the same position in +# `punctuation_chars.openers` and `punctuation_chars.closers`. +# Additional matches (due to different typographic conventions +# in different languages) are stored in `quote_pairs`. + quote_pairs = { - # open char: matching closing characters # usage example + # open char: matching closing characters # use case '\xbb': '\xbb', # » » Swedish '\u2018': '\u201a', # ‘ ‚ Albanian/Greek/Turkish '\u2019': '\u2019', # ’ ’ Swedish @@ -104,19 +105,14 @@ quote_pairs = { '\u201e': '\u201c\u201d', # „ “ German, „ ” Polish '\u201d': '\u201d', # ” ” Swedish '\u203a': '\u203a', # › › Swedish + '\u301d': '\u301f' # 〝 〟 CJK punctuation + '\u2e42': '\u201F', # ⹂ ‟ Old Hungarian (right to left) } """Additional open/close quote pairs.""" def match_chars(c1, c2): - """Test whether `c1` and `c2` are a matching open/close character pair. - - Matching open/close pairs are at the same position in - `punctuation_chars.openers` and `punctuation_chars.closers`. - The pairing of open/close quotes is ambiguous due to different - typographic conventions in different languages, - so we test for additional matches stored in `quote_pairs`. - """ + """Test whether `c1` and `c2` are a matching open/close character pair.""" try: i = openers.index(c1) except ValueError: # c1 not in openers @@ -136,13 +132,12 @@ def match_chars(c1, c2): # :: unicode_punctuation_categories = { - # 'Pc': 'Connector', # not used in Docutils inline markup recognition - 'Pd': 'Dash', - 'Ps': 'Open', - 'Pe': 'Close', - 'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage - 'Pf': 'Final quote', # may behave like Ps or Pe depending on usage - 'Po': 'Other' + 'Pd': 'dash', + 'Ps': 'open', + 'Pe': 'close', + 'Pi': 'initial quote', # may behave like Ps or Pe depending on language + 'Pf': 'final quote', # may behave like Ps or Pe depending on language + 'Po': 'other' } """Unicode character categories for punctuation""" @@ -191,30 +186,42 @@ def character_category_patterns(): # Rearange the lists to ensure matching characters at the same # index position. - # low quotation marks are also used as closers (e.g. in Greek) - # move them to category Pi: + # LOW-9 QUOTATION MARKs are categorized as Ps (open) without matching Pe. + # They are used as initial quotes in German and final quotes in Greek. + # Remove them to get balanced Ps/Pe pairs. ucharlists['Ps'].remove('‚') # 201A SINGLE LOW-9 QUOTATION MARK ucharlists['Ps'].remove('„') # 201E DOUBLE LOW-9 QUOTATION MARK - ucharlists['Pi'] += ['‚', '„'] - - ucharlists['Pi'].remove('‛') # 201B … HIGH-REVERSED-9 QUOTATION MARK - ucharlists['Pi'].remove('‟') # 201F … HIGH-REVERSED-9 QUOTATION MARK - ucharlists['Pf'] += ['‛', '‟'] - - # 301F LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant: - ucharlists['Ps'].insert(ucharlists['Pe'].index('\u301f'), '\u301d') - - # 2E42 DOUBLE LOW-REVERSED-9 QUOTATION MARK has no pair, and the only - # usages identified thus far are in old hungarian, where it doesn't seem to - # be used as a quoting character. Remove from openers (Ps) for now, for - # simplicity. + # + # HIGH-REVERSED-9 QUOTATION MARKs are categorized as Pi (initial quote) + # without matching Pf (final quote). + # Insert the LOW-9 QUOTATION MARKs at the "empty slots" in Pf. + ucharlists['Pf'].insert(ucharlists['Pi'].index('‛'), '‚') + ucharlists['Pf'].insert(ucharlists['Pi'].index('‟'), '„') + + # '⹂' 2E42 DOUBLE LOW-REVERSED-9 QUOTATION MARK + # is categorized as Ps (open) without matching Pe (close). + # It is used in Old Hungarian (written right to left) as quoting character + # matching DOUBLE HIGH-REVERSED-9 QUOTATION MARK. # https://www.unicode.org/L2/L2012/12168r-n4268r-oldhungarian.pdf#page=26 - ucharlists['Ps'].remove('⹂') + # + # '⹂' 301F LOW DOUBLE PRIME QUOTATION MARK + # is categorized as Pe (close) without matching Ps (open). + # Move to the place matching 2E42: + ucharlists['Pe'].remove('\u301f') + ucharlists['Pe'].insert(ucharlists['Ps'].index('⹂'), '\u301f') + + # check for balanced lists: + if len(ucharlists['Ps']) != len(ucharlists['Pe']): + print('Missmatch between "Open" and "Close" categories') + print(''.join(ucharlists['Ps'])) + print(''.join(ucharlists['Pe'])) + raise AssertionError + if len(ucharlists['Pi']) != len(ucharlists['Pf']): + print('Missmatch between "initial quote" and "final quote" categories') + print(''.join(ucharlists['Pi'])) + print(''.join(ucharlists['Pf'])) + raise AssertionError - # print(''.join(ucharlists['Ps']).encode('utf-8') - # print(''.join(ucharlists['Pe']).encode('utf-8') - # print(''.join(ucharlists['Pi']).encode('utf-8') - # print(''.join(ucharlists['Pf']).encode('utf-8') # The Docutils character categories # --------------------------------- @@ -245,14 +252,6 @@ def character_category_patterns(): closing_delimiters)] -def separate_wide_chars(s): - """Return (s1,s2) with characters above 0xFFFF in s2""" - maxunicode_narrow = 0xFFFF - l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow] - l2 = [ch for ch in s if ord(ch) > maxunicode_narrow] - return ''.join(l1), ''.join(l2) - - def mark_intervals(s): """Return s with shortcut notation for runs of consecutive characters @@ -281,6 +280,7 @@ def mark_intervals(s): def wrap_string(s, startstring="(", endstring=" )", wrap=71): """Line-wrap a unicode string literal definition.""" + s = s.encode('unicode-escape').decode() c = len(startstring) left_indent = ' '*(c - len(startstring.lstrip(' '))) line_start_string = f"\n {left_indent}'" @@ -299,17 +299,21 @@ def wrap_string(s, startstring="(", endstring=" )", wrap=71): def print_differences(old, new, name): """List characters missing in old/new.""" if old != new: - print('new %s:' % name) - for c in new: - if c not in old: - print(' %04x'%ord(c), c, unicodedata.name(c)) - print('removed %s:' % name) - for c in old: - if c not in new: - print(' %04x'%ord(c), unicodedata.name(c)) + print(f'"{name}" changed') + if '-' in old or '-' in new: + print('-', old) + print('+', new) + else: + for c in new: + if c not in old: + print('+ %04x'%ord(c), c, unicodedata.name(c)) + for c in old: + if c not in new: + print('- %04x'%ord(c), c, unicodedata.name(c)) + return True else: - print('%s unchanged' % name) - + print(f'"{name}" unchanged') + return False # Output # ------ @@ -322,7 +326,8 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('-t', '--test', action="store_true", help='test for changed character categories') - parser.add_argument('-o', '--out') + parser.add_argument('--pairs', action="store_true", + help='show openers/closers in human readable form') args = parser.parse_args() # (Re)create character patterns @@ -332,18 +337,11 @@ if __name__ == '__main__': (o, c, d, cd) = character_category_patterns() -# Characters in the upper plane require a "wide" build:: - - o, o_wide = separate_wide_chars(o) - c, c_wide = separate_wide_chars(c) - d, d_wide = separate_wide_chars(d) - # delimiters: sort and use shortcut for intervals (saves ~150 characters) # (`openers` and `closers` must be verbose and keep order # because they are also used in `match_chars()`):: d = d[:5] + mark_intervals(d[5:]) - d_wide = mark_intervals(d_wide) # Test: compare module content with re-generated definitions @@ -363,16 +361,27 @@ if __name__ == '__main__': ' module\n and a regeneration based on Unicode version %s:' % unicodedata.unidata_version) - print_differences(openers, o, 'openers') - if o_wide: - print('+ openers-wide = r"""%s"""' % o_wide.encode('utf-8')) - print_differences(closers, c, 'closers') - if c_wide: - print('+ closers-wide = r"""%s"""' % c_wide.encode('utf-8')) - - print_differences(delimiters, d + d_wide, 'delimiters') + delta_o = print_differences(openers, o, 'openers') + delta_c = print_differences(closers, c, 'closers') + print_differences(delimiters, d, 'delimiters') print_differences(closing_delimiters, cd, 'closing_delimiters') + if delta_o or delta_c: + print('\nChanges in "openers" and/or "closers",' + '\nCheck open/close pairs with option "--pairs"!') + sys.exit() + + +# Print debugging output +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# Print comparison of `openers` and `closers` in human readable form +# to allow checking for matching pairs. + + if args.pairs: + for o_i, c_i in zip(o, c): + print(o_i, c_i, + unicodedata.name(o_i), '\t', unicodedata.name(c_i)) sys.exit() # Print re-generation of the punctuation_chars module @@ -386,15 +395,9 @@ if __name__ == '__main__': substitutions = { 'python_version': sys.version.split()[0], 'unidata_version': unicodedata.unidata_version, - 'openers': wrap_string(o.encode('unicode-escape').decode(), - startstring="openers = ("), - 'closers': wrap_string(c.encode('unicode-escape').decode(), - startstring="closers = ("), - 'delimiters': wrap_string(d.encode('unicode-escape').decode(), - startstring="delimiters = ("), - 'delimiters_wide': wrap_string( - d_wide.encode('unicode-escape').decode(), - startstring=" delimiters += (") + 'openers': wrap_string(o, startstring="openers = ("), + 'closers': wrap_string(c, startstring="closers = ("), + 'delimiters': wrap_string(d, startstring="delimiters = ("), } print(module_template % substitutions, end='') |