|
From: <mi...@us...> - 2017-01-17 15:06:20
|
Revision: 8016
http://sourceforge.net/p/docutils/code/8016
Author: milde
Date: 2017-01-17 15:06:17 +0000 (Tue, 17 Jan 2017)
Log Message:
-----------
Generate the complete punctuation_chars module with the corresponding tool.
Modified Paths:
--------------
trunk/docutils/docutils/utils/punctuation_chars.py
trunk/docutils/tools/dev/generate_punctuation_chars.py
Modified: trunk/docutils/docutils/utils/punctuation_chars.py
===================================================================
--- trunk/docutils/docutils/utils/punctuation_chars.py 2017-01-08 09:54:35 UTC (rev 8015)
+++ trunk/docutils/docutils/utils/punctuation_chars.py 2017-01-17 15:06:17 UTC (rev 8016)
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# :Copyright: © 2011 Günter Milde.
+# :Id: $Id$
+# :Copyright: © 2011, 2017 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
@@ -9,46 +10,39 @@
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
-
-# :Id: $Id$
#
+# This file is generated by
+# ``docutils/tools/dev/generate_punctuation_chars.py``.
# ::
import sys, re
import unicodedata
-# Docutils character category patterns
-# ------------------------------------
-#
-# This module provides patterns for the implementation of the
-# `inline markup recognition rules`_ in the reStructuredText parser
-# `<../parsers/rst/states.py>`__ based on Unicode character categories.
-# The patterns are used inside ``[ ]`` in regular expressions.
-#
-# Rule (5) requires determination of matching open/close pairs. However,
-# the pairing of open/close quotes is ambigue due to different typographic
-# conventions in different languages. The ``quote_pairs`` function tests
-# whether two characters form an open/close pair.
-#
-# The patterns are generated by
-# ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependance
-# on the Python version and avoid the time-consuming generation with every
-# Docutils run. See there for motives and implementation details.
-#
-# The category of some characters changed with the development of the Unicode
-# standard. The current lists are generated with the help of the "unicodedata"
-# module of Python 2.7 (based on the UnicodeData.txt file version 5.2.0).
-#
-# Updating the patterns with a new Unicode standard version is an API
-# change (may render valid rST documents invalid). It should only be done for
-# "feature releases" and requires also updating the specification of `inline
-# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
-#
-# .. _inline markup recognition rules:
-# ../../docs/ref/rst/restructuredtext.html#inline-markup
-#
-# ::
+"""Docutils character category patterns.
+ Patterns for the implementation of the `inline markup recognition rules`_
+ in the reStructuredText parser `docutils.parsers.rst.states.py` based
+ on Unicode character categories.
+ The patterns are used inside ``[ ]`` in regular expressions.
+
+ Rule (5) requires determination of matching open/close pairs. However, the
+ pairing of open/close quotes is ambiguous due to different typographic
+ conventions in different languages. The ``quote_pairs`` function tests
+ whether two characters form an open/close pair.
+
+ The patterns are generated by
+ ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
+ on the Python version and avoid the time-consuming generation with every
+ Docutils run. See there for motives and implementation details.
+
+ The category of some characters changed with the development of the
+ Unicode standard. The current lists are generated with the help of the
+ "unicodedata" module of Python 2.7.13 (based on Unicode version 5.2.0).
+
+ .. _inline markup recognition rules:
+ http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
+"""
+
openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
@@ -99,23 +93,28 @@
# Matching open/close quotes
# --------------------------
-#
-# The pairing of open/close quotes is ambigue due to different typographic
-# conventions in different languages. Specify additional valid matches::
-quote_pairs = {# open char: matching closing characters
- u'\xbb': u'\xbb', # Swedish
- u'\u2018': u'\u201a', # Greek
- u'\u2019': u'\u2019', # Swedish
- u'\u201a': u'\u2018\u2019', # German, Polish
- u'\u201c': u'\u201e', # German
- u'\u201e': u'\u201c\u201d',
- u'\u201d': u'\u201d', # Swedish
- u'\u203a': u'\u203a', # Swedish
- }
+quote_pairs = {# open char: matching closing characters # usage example
+ u'\xbb': u'\xbb', # » » Swedish
+ u'\u2018': u'\u201a', # ‘ ‚ Albanian/Greek/Turkish
+ u'\u2019': u'\u2019', # ’ ’ Swedish
+ u'\u201a': u'\u2018\u2019', # ‚ ‘ German ‚ ’ Polish
+ u'\u201c': u'\u201e', # “ „ Albanian/Greek/Turkish
+ u'\u201e': u'\u201c\u201d', # „ “ German „ ” Polish
+ u'\u201d': u'\u201d', # ” ” Swedish
+ u'\u203a': u'\u203a', # › › Swedish
+ }
+"""Additional open/close quote pairs."""
def match_chars(c1, c2):
- """Test whether `c1` and `c2` are a matching open/close character pair."""
+ """Test whether `c1` and `c2` are a matching open/close character pair.
+
+ Matching open/close pairs are at the same position in
+ `punctuation_chars.openers` and `punctuation_chars.closers`.
+ The pairing of open/close quotes is ambiguous due to different
+ typographic conventions in different languages,
+ so we test for additional matches stored in `quote_pairs`.
+ """
try:
i = openers.index(c1)
except ValueError: # c1 not in openers
Modified: trunk/docutils/tools/dev/generate_punctuation_chars.py
===================================================================
--- trunk/docutils/tools/dev/generate_punctuation_chars.py 2017-01-08 09:54:35 UTC (rev 8015)
+++ trunk/docutils/tools/dev/generate_punctuation_chars.py 2017-01-17 15:06:17 UTC (rev 8016)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# :Copyright: © 2011, 2016 Günter Milde.
+# :Copyright: © 2011, 2017 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
@@ -14,37 +14,131 @@
#
# ::
-import sys, re
-import unicodedata
+"""(Re)generate the utils.punctuation_chars module."""
-# import the punctuation_chars module from the source or Py3k build
-# path for local Python modules
-if sys.version_info < (3,):
- sys.path.insert(0, '../../docutils')
-else:
- sys.path.insert(0, '../../build/lib')
- unichr = chr
-
-from docutils.utils.punctuation_chars import (openers, closers, delimiters,
- closing_delimiters)
-
# (re)generate the utils.punctuation_chars module
# ===============================================
#
-# The category of some characters may change with the development of the
+# The category of some characters can change with the development of the
# Unicode standard. This tool checks the patterns in `utils.punctuation_chars`
# against a re-calculation based on the "unicodedata" stdlib module
# which may give different results for different Python versions.
#
-# Updating the patterns with a new (Python|Unicode standard) version is an API
-# change (may render valid rST documents invalid). It should only be done for
-# "feature releases" and requires also updating the specification of `inline
-# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
+# Updating the module with changed `unicode_punctuation_categories` (due to
+# a new Python or Unicode standard version) is an API change (may render valid
+# rST documents invalid). It should only be done for "feature releases" and
+# requires also updating the specification of `inline markup recognition
+# rules`_ in ../../docs/ref/rst/restructuredtext.txt.
#
+# .. _inline markup recognition rules:
+# ../../docs/ref/rst/restructuredtext.html#inline-markup
+
+
+# Setup::
+
+import sys, re
+import unicodedata
+
+if sys.version_info >= (3,):
+ unichr = chr # unichr not available in Py3k
+else:
+ import codecs
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+
+# Template for utils.punctuation_chars
+# ------------------------------------
+#
+# Problem: ``ur`` prefix fails with Py 3.5 ::
+
+module_template = u'''#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# :Id: $Id$
+# :Copyright: © 2011, 2017 Günter Milde.
+# :License: Released under the terms of the `2-Clause BSD license`_, in short:
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+# This file is offered as-is, without any warranty.
+#
+# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
+#
+# This file is generated by
+# ``docutils/tools/dev/generate_punctuation_chars.py``.
+# ::
+
+import sys, re
+import unicodedata
+
+"""Docutils character category patterns.
+
+ Patterns for the implementation of the `inline markup recognition rules`_
+ in the reStructuredText parser `docutils.parsers.rst.states.py` based
+ on Unicode character categories.
+ The patterns are used inside ``[ ]`` in regular expressions.
+
+ Rule (5) requires determination of matching open/close pairs. However, the
+ pairing of open/close quotes is ambiguous due to different typographic
+ conventions in different languages. The ``quote_pairs`` function tests
+ whether two characters form an open/close pair.
+
+ The patterns are generated by
+ ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
+ on the Python version and avoid the time-consuming generation with every
+ Docutils run. See there for motives and implementation details.
+
+ The category of some characters changed with the development of the
+ Unicode standard. The current lists are generated with the help of the
+ "unicodedata" module of Python %(python_version)s (based on Unicode version %(unidata_version)s).
+
+ .. _inline markup recognition rules:
+ http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
+"""
+
+%(openers)s
+%(closers)s
+%(delimiters)s
+if sys.maxunicode >= 0x10FFFF: # "wide" build
+%(delimiters_wide)s
+closing_delimiters = u'\\\\\\\\.,;!?'
+
+
+# Matching open/close quotes
+# --------------------------
+
+quote_pairs = {# open char: matching closing characters # usage example
+ u'\\xbb': u'\\xbb', # » » Swedish
+ u'\\u2018': u'\\u201a', # ‘ ‚ Albanian/Greek/Turkish
+ u'\\u2019': u'\\u2019', # ’ ’ Swedish
+ u'\\u201a': u'\\u2018\\u2019', # ‚ ‘ German ‚ ’ Polish
+ u'\\u201c': u'\\u201e', # “ „ Albanian/Greek/Turkish
+ u'\\u201e': u'\\u201c\\u201d', # „ “ German „ ” Polish
+ u'\\u201d': u'\\u201d', # ” ” Swedish
+ u'\\u203a': u'\\u203a', # › › Swedish
+ }
+"""Additional open/close quote pairs."""
+
+def match_chars(c1, c2):
+ """Test whether `c1` and `c2` are a matching open/close character pair.
+
+ Matching open/close pairs are at the same position in
+ `punctuation_chars.openers` and `punctuation_chars.closers`.
+ The pairing of open/close quotes is ambiguous due to different
+ typographic conventions in different languages,
+ so we test for additional matches stored in `quote_pairs`.
+ """
+ try:
+ i = openers.index(c1)
+ except ValueError: # c1 not in openers
+ return False
+ return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')\
+'''
+
+
# Generation of the character category patterns
# ----------------------------------------------
#
-#
# Unicode punctuation character categories
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
@@ -160,11 +254,6 @@
# non-matching, after markup
closing_delimiters = [r'\\.,;!?']
- # # Test open/close matching:
- # for i in range(min(len(openers),len(closers))):
- # print('%4d %s %s' % (i, openers[i].encode('utf8'),
- # closers[i].encode('utf8'))
-
return [u''.join(chars) for chars in (openers, closers, delimiters,
closing_delimiters)]
@@ -202,7 +291,7 @@
return ''.join(l2)
def wrap_string(s, startstring= "(u'",
- endstring = "')", wrap=65):
+ endstring = "')", wrap=67):
"""Line-wrap a unicode string literal definition."""
c = len(startstring)
contstring = "'\n" + ' ' * (len(startstring)-2) + "u'"
@@ -228,8 +317,20 @@
for c in old:
if c not in new:
print(' %04x'%ord(c), unicodedata.name(c))
+ else:
+ print('%s unchanged' % name)
+def print_quote_pairs():
+ pairs = [(o,c) for o,c in quote_pairs.items()]
+ for o,c in sorted(pairs):
+ print((u'%s %s' % (o,c)).encode('utf8'))
+ # # Test open/close matching:
+ # for i in range(min(len(openers),len(closers))):
+ # print('%4d %s %s' % (i, openers[i].encode('utf8'),
+    #                          closers[i].encode('utf8')))
+
+
# Output
# ------
#
@@ -237,62 +338,91 @@
if __name__ == '__main__':
-# (Re)create and compare character patterns
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('-t', '--test', action="store_true",
+ help='test for changed character categories')
+ args = parser.parse_args()
+
+# (Re)create character patterns
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# ::
(o, c, d, cd) = character_category_patterns()
+
+# Characters in the upper plane require a "wide" build::
+
o, o_wide = separate_wide_chars(o)
c, c_wide = separate_wide_chars(c)
d, d_wide = separate_wide_chars(d)
+
+# delimiters: sort and use shortcut for intervals (saves ~150 characters)
+# (`openers` and `closers` must be verbose and keep order
+# because they are also used in `match_chars()`)::
+
d = d[:5] + mark_intervals(d[5:])
d_wide = mark_intervals(d_wide)
- print_differences(openers, o, 'openers')
- if o_wide:
- print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8'))
- print_differences(closers, c, 'closers')
- if c_wide:
- print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8'))
- print_differences(delimiters, d + d_wide, 'delimiters')
- print_differences(closing_delimiters, cd, 'closing_delimiters')
+# Test: compare module content with re-generated definitions
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ::
-# Print literal code to define the character sets
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# This code can be copied to punctuation_chars.py if an update is wanted.
+ if args.test:
-# Unicode version::
+# Import the punctuation_chars module from the source
+# or Py3k build path for local Python modules::
- print('# based on Unicode version %s' % unicodedata.unidata_version)
+ if sys.version_info < (3,):
+ sys.path.insert(0, '../../docutils')
+ else:
+ sys.path.insert(0, '../../build/lib')
-# `openers` and `closers` must be verbose and keep order because they are
-# also used in `match_chars()`::
+ from docutils.utils.punctuation_chars import (openers, closers,
+ delimiters, closing_delimiters)
- print(wrap_string(o.encode('unicode-escape').decode(),
- startstring="openers = (u'"))
- print(wrap_string(c.encode('unicode-escape').decode(),
- startstring="closers = (u'"))
+ print('Check for differences between the current `punctuation_chars`'
+ ' module\n and a regeneration based on Unicode version %s:'
+ % unicodedata.unidata_version)
-# delimiters: sort and use shortcut for intervals (saves ~150 characters)::
+ print_differences(openers, o, 'openers')
+ if o_wide:
+ print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8'))
+ print_differences(closers, c, 'closers')
+ if c_wide:
+ print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8'))
- print(wrap_string(d.encode('unicode-escape').decode(),
- startstring="delimiters = (u'"))
+ print_differences(delimiters, d + d_wide, 'delimiters')
+ print_differences(closing_delimiters, cd, 'closing_delimiters')
-# add characters in the upper plane only in a "wide" build::
+ sys.exit()
- print('if sys.maxunicode >= 0x10FFFF: # "wide" build')
- print(wrap_string(d_wide.encode('unicode-escape').decode(),
- startstring=" delimiters += (u'"))
+# Print re-generation of the punctuation_chars module
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The output can be copied to docutils/utils if an update is wanted
+# (API change, see Intro).
-# additional closing delimiters::
+# Replacements::
- print(wrap_string(cd.encode('unicode-escape').decode(),
- startstring="closing_delimiters = (u'"))
+ substitutions = {
+ 'python_version': '.'.join(str(s) for s in sys.version_info[:3]),
+ 'unidata_version': unicodedata.unidata_version,
+ 'openers': wrap_string(o.encode('unicode-escape').decode(),
+ startstring="openers = (u'"),
+ 'closers': wrap_string(c.encode('unicode-escape').decode(),
+ startstring="closers = (u'"),
+ 'delimiters': wrap_string(d.encode('unicode-escape').decode(),
+ startstring="delimiters = (u'"),
+ 'delimiters_wide': wrap_string(
+ d_wide.encode('unicode-escape').decode(),
+ startstring=" delimiters += (u'")
+ }
+ print(module_template % substitutions)
+
# test prints
# ~~~~~~~~~~~
#
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|