|
From: <mi...@us...> - 2017-01-17 15:06:20
|
Revision: 8016
http://sourceforge.net/p/docutils/code/8016
Author: milde
Date: 2017-01-17 15:06:17 +0000 (Tue, 17 Jan 2017)
Log Message:
-----------
Generate the complete punctuation_chars module with the corresponding tool.
Modified Paths:
--------------
trunk/docutils/docutils/utils/punctuation_chars.py
trunk/docutils/tools/dev/generate_punctuation_chars.py
Modified: trunk/docutils/docutils/utils/punctuation_chars.py
===================================================================
--- trunk/docutils/docutils/utils/punctuation_chars.py 2017-01-08 09:54:35 UTC (rev 8015)
+++ trunk/docutils/docutils/utils/punctuation_chars.py 2017-01-17 15:06:17 UTC (rev 8016)
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# :Copyright: © 2011 Günter Milde.
+# :Id: $Id$
+# :Copyright: © 2011, 2017 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
@@ -9,46 +10,39 @@
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
-
-# :Id: $Id$
#
+# This file is generated by
+# ``docutils/tools/dev/generate_punctuation_chars.py``.
# ::
import sys, re
import unicodedata
-# Docutils character category patterns
-# ------------------------------------
-#
-# This module provides patterns for the implementation of the
-# `inline markup recognition rules`_ in the reStructuredText parser
-# `<../parsers/rst/states.py>`__ based on Unicode character categories.
-# The patterns are used inside ``[ ]`` in regular expressions.
-#
-# Rule (5) requires determination of matching open/close pairs. However,
-# the pairing of open/close quotes is ambigue due to different typographic
-# conventions in different languages. The ``quote_pairs`` function tests
-# whether two characters form an open/close pair.
-#
-# The patterns are generated by
-# ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependance
-# on the Python version and avoid the time-consuming generation with every
-# Docutils run. See there for motives and implementation details.
-#
-# The category of some characters changed with the development of the Unicode
-# standard. The current lists are generated with the help of the "unicodedata"
-# module of Python 2.7 (based on the UnicodeData.txt file version 5.2.0).
-#
-# Updating the patterns with a new Unicode standard version is an API
-# change (may render valid rST documents invalid). It should only be done for
-# "feature releases" and requires also updating the specification of `inline
-# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
-#
-# .. _inline markup recognition rules:
-# ../../docs/ref/rst/restructuredtext.html#inline-markup
-#
-# ::
+"""Docutils character category patterns.
+ Patterns for the implementation of the `inline markup recognition rules`_
+ in the reStructuredText parser `docutils.parsers.rst.states.py` based
+ on Unicode character categories.
+ The patterns are used inside ``[ ]`` in regular expressions.
+
+ Rule (5) requires determination of matching open/close pairs. However, the
+ pairing of open/close quotes is ambiguous due to different typographic
+ conventions in different languages. The ``quote_pairs`` function tests
+ whether two characters form an open/close pair.
+
+ The patterns are generated by
+ ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
+ on the Python version and avoid the time-consuming generation with every
+ Docutils run. See there for motives and implementation details.
+
+ The category of some characters changed with the development of the
+ Unicode standard. The current lists are generated with the help of the
+ "unicodedata" module of Python 2.7.13 (based on Unicode version 5.2.0).
+
+ .. _inline markup recognition rules:
+ http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
+"""
+
openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
@@ -99,23 +93,28 @@
# Matching open/close quotes
# --------------------------
-#
-# The pairing of open/close quotes is ambigue due to different typographic
-# conventions in different languages. Specify additional valid matches::
-quote_pairs = {# open char: matching closing characters
- u'\xbb': u'\xbb', # Swedish
- u'\u2018': u'\u201a', # Greek
- u'\u2019': u'\u2019', # Swedish
- u'\u201a': u'\u2018\u2019', # German, Polish
- u'\u201c': u'\u201e', # German
- u'\u201e': u'\u201c\u201d',
- u'\u201d': u'\u201d', # Swedish
- u'\u203a': u'\u203a', # Swedish
- }
+quote_pairs = {# open char: matching closing characters # usage example
+ u'\xbb': u'\xbb', # » » Swedish
+ u'\u2018': u'\u201a', # ‘ ‚ Albanian/Greek/Turkish
+ u'\u2019': u'\u2019', # ’ ’ Swedish
+ u'\u201a': u'\u2018\u2019', # ‚ ‘ German ‚ ’ Polish
+ u'\u201c': u'\u201e', # “ „ Albanian/Greek/Turkish
+ u'\u201e': u'\u201c\u201d', # „ “ German „ ” Polish
+ u'\u201d': u'\u201d', # ” ” Swedish
+ u'\u203a': u'\u203a', # › › Swedish
+ }
+"""Additional open/close quote pairs."""
def match_chars(c1, c2):
- """Test whether `c1` and `c2` are a matching open/close character pair."""
+ """Test whether `c1` and `c2` are a matching open/close character pair.
+
+ Matching open/close pairs are at the same position in
+ `punctuation_chars.openers` and `punctuation_chars.closers`.
+ The pairing of open/close quotes is ambiguous due to different
+ typographic conventions in different languages,
+ so we test for additional matches stored in `quote_pairs`.
+ """
try:
i = openers.index(c1)
except ValueError: # c1 not in openers
Modified: trunk/docutils/tools/dev/generate_punctuation_chars.py
===================================================================
--- trunk/docutils/tools/dev/generate_punctuation_chars.py 2017-01-08 09:54:35 UTC (rev 8015)
+++ trunk/docutils/tools/dev/generate_punctuation_chars.py 2017-01-17 15:06:17 UTC (rev 8016)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# :Copyright: © 2011, 2016 Günter Milde.
+# :Copyright: © 2011, 2017 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
@@ -14,37 +14,131 @@
#
# ::
-import sys, re
-import unicodedata
+"""(Re)generate the utils.punctuation_chars module."""
-# import the punctuation_chars module from the source or Py3k build
-# path for local Python modules
-if sys.version_info < (3,):
- sys.path.insert(0, '../../docutils')
-else:
- sys.path.insert(0, '../../build/lib')
- unichr = chr
-
-from docutils.utils.punctuation_chars import (openers, closers, delimiters,
- closing_delimiters)
-
# (re)generate the utils.punctuation_chars module
# ===============================================
#
-# The category of some characters may change with the development of the
+# The category of some characters can change with the development of the
# Unicode standard. This tool checks the patterns in `utils.punctuation_chars`
# against a re-calculation based on the "unicodedata" stdlib module
# which may give different results for different Python versions.
#
-# Updating the patterns with a new (Python|Unicode standard) version is an API
-# change (may render valid rST documents invalid). It should only be done for
-# "feature releases" and requires also updating the specification of `inline
-# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
+# Updating the module with changed `unicode_punctuation_categories` (due to
+# a new Python or Unicode standard version) is an API change (may render valid
+# rST documents invalid). It should only be done for "feature releases" and
+# requires also updating the specification of `inline markup recognition
+# rules`_ in ../../docs/ref/rst/restructuredtext.txt.
#
+# .. _inline markup recognition rules:
+# ../../docs/ref/rst/restructuredtext.html#inline-markup
+
+
+# Setup::
+
+import sys, re
+import unicodedata
+
+if sys.version_info >= (3,):
+ unichr = chr # unichr not available in Py3k
+else:
+ import codecs
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+
+# Template for utils.punctuation_chars
+# ------------------------------------
+#
+# Problem: ``ur`` prefix fails with Py 3.5 ::
+
+module_template = u'''#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# :Id: $Id$
+# :Copyright: © 2011, 2017 Günter Milde.
+# :License: Released under the terms of the `2-Clause BSD license`_, in short:
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+# This file is offered as-is, without any warranty.
+#
+# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
+#
+# This file is generated by
+# ``docutils/tools/dev/generate_punctuation_chars.py``.
+# ::
+
+import sys, re
+import unicodedata
+
+"""Docutils character category patterns.
+
+ Patterns for the implementation of the `inline markup recognition rules`_
+ in the reStructuredText parser `docutils.parsers.rst.states.py` based
+ on Unicode character categories.
+ The patterns are used inside ``[ ]`` in regular expressions.
+
+ Rule (5) requires determination of matching open/close pairs. However, the
+ pairing of open/close quotes is ambiguous due to different typographic
+ conventions in different languages. The ``quote_pairs`` function tests
+ whether two characters form an open/close pair.
+
+ The patterns are generated by
+ ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
+ on the Python version and avoid the time-consuming generation with every
+ Docutils run. See there for motives and implementation details.
+
+ The category of some characters changed with the development of the
+ Unicode standard. The current lists are generated with the help of the
+ "unicodedata" module of Python %(python_version)s (based on Unicode version %(unidata_version)s).
+
+ .. _inline markup recognition rules:
+ http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
+"""
+
+%(openers)s
+%(closers)s
+%(delimiters)s
+if sys.maxunicode >= 0x10FFFF: # "wide" build
+%(delimiters_wide)s
+closing_delimiters = u'\\\\\\\\.,;!?'
+
+
+# Matching open/close quotes
+# --------------------------
+
+quote_pairs = {# open char: matching closing characters # usage example
+ u'\\xbb': u'\\xbb', # » » Swedish
+ u'\\u2018': u'\\u201a', # ‘ ‚ Albanian/Greek/Turkish
+ u'\\u2019': u'\\u2019', # ’ ’ Swedish
+ u'\\u201a': u'\\u2018\\u2019', # ‚ ‘ German ‚ ’ Polish
+ u'\\u201c': u'\\u201e', # “ „ Albanian/Greek/Turkish
+ u'\\u201e': u'\\u201c\\u201d', # „ “ German „ ” Polish
+ u'\\u201d': u'\\u201d', # ” ” Swedish
+ u'\\u203a': u'\\u203a', # › › Swedish
+ }
+"""Additional open/close quote pairs."""
+
+def match_chars(c1, c2):
+ """Test whether `c1` and `c2` are a matching open/close character pair.
+
+ Matching open/close pairs are at the same position in
+ `punctuation_chars.openers` and `punctuation_chars.closers`.
+ The pairing of open/close quotes is ambiguous due to different
+ typographic conventions in different languages,
+ so we test for additional matches stored in `quote_pairs`.
+ """
+ try:
+ i = openers.index(c1)
+ except ValueError: # c1 not in openers
+ return False
+ return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')\
+'''
+
+
# Generation of the character category patterns
# ----------------------------------------------
#
-#
# Unicode punctuation character categories
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
@@ -160,11 +254,6 @@
# non-matching, after markup
closing_delimiters = [r'\\.,;!?']
- # # Test open/close matching:
- # for i in range(min(len(openers),len(closers))):
- # print('%4d %s %s' % (i, openers[i].encode('utf8'),
- # closers[i].encode('utf8'))
-
return [u''.join(chars) for chars in (openers, closers, delimiters,
closing_delimiters)]
@@ -202,7 +291,7 @@
return ''.join(l2)
def wrap_string(s, startstring= "(u'",
- endstring = "')", wrap=65):
+ endstring = "')", wrap=67):
"""Line-wrap a unicode string literal definition."""
c = len(startstring)
contstring = "'\n" + ' ' * (len(startstring)-2) + "u'"
@@ -228,8 +317,20 @@
for c in old:
if c not in new:
print(' %04x'%ord(c), unicodedata.name(c))
+ else:
+ print('%s unchanged' % name)
+def print_quote_pairs():
+ pairs = [(o,c) for o,c in quote_pairs.items()]
+ for o,c in sorted(pairs):
+ print((u'%s %s' % (o,c)).encode('utf8'))
+ # # Test open/close matching:
+ # for i in range(min(len(openers),len(closers))):
+ # print('%4d %s %s' % (i, openers[i].encode('utf8'),
+    #                          closers[i].encode('utf8')))
+
+
# Output
# ------
#
@@ -237,62 +338,91 @@
if __name__ == '__main__':
-# (Re)create and compare character patterns
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('-t', '--test', action="store_true",
+ help='test for changed character categories')
+ args = parser.parse_args()
+
+# (Re)create character patterns
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# ::
(o, c, d, cd) = character_category_patterns()
+
+# Characters in the upper plane require a "wide" build::
+
o, o_wide = separate_wide_chars(o)
c, c_wide = separate_wide_chars(c)
d, d_wide = separate_wide_chars(d)
+
+# delimiters: sort and use shortcut for intervals (saves ~150 characters)
+# (`openers` and `closers` must be verbose and keep order
+# because they are also used in `match_chars()`)::
+
d = d[:5] + mark_intervals(d[5:])
d_wide = mark_intervals(d_wide)
- print_differences(openers, o, 'openers')
- if o_wide:
- print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8'))
- print_differences(closers, c, 'closers')
- if c_wide:
- print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8'))
- print_differences(delimiters, d + d_wide, 'delimiters')
- print_differences(closing_delimiters, cd, 'closing_delimiters')
+# Test: compare module content with re-generated definitions
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ::
-# Print literal code to define the character sets
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# This code can be copied to punctuation_chars.py if an update is wanted.
+ if args.test:
-# Unicode version::
+# Import the punctuation_chars module from the source
+# or Py3k build path for local Python modules::
- print('# based on Unicode version %s' % unicodedata.unidata_version)
+ if sys.version_info < (3,):
+ sys.path.insert(0, '../../docutils')
+ else:
+ sys.path.insert(0, '../../build/lib')
-# `openers` and `closers` must be verbose and keep order because they are
-# also used in `match_chars()`::
+ from docutils.utils.punctuation_chars import (openers, closers,
+ delimiters, closing_delimiters)
- print(wrap_string(o.encode('unicode-escape').decode(),
- startstring="openers = (u'"))
- print(wrap_string(c.encode('unicode-escape').decode(),
- startstring="closers = (u'"))
+ print('Check for differences between the current `punctuation_chars`'
+ ' module\n and a regeneration based on Unicode version %s:'
+ % unicodedata.unidata_version)
-# delimiters: sort and use shortcut for intervals (saves ~150 characters)::
+ print_differences(openers, o, 'openers')
+ if o_wide:
+ print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8'))
+ print_differences(closers, c, 'closers')
+ if c_wide:
+ print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8'))
- print(wrap_string(d.encode('unicode-escape').decode(),
- startstring="delimiters = (u'"))
+ print_differences(delimiters, d + d_wide, 'delimiters')
+ print_differences(closing_delimiters, cd, 'closing_delimiters')
-# add characters in the upper plane only in a "wide" build::
+ sys.exit()
- print('if sys.maxunicode >= 0x10FFFF: # "wide" build')
- print(wrap_string(d_wide.encode('unicode-escape').decode(),
- startstring=" delimiters += (u'"))
+# Print re-generation of the punctuation_chars module
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The output can be copied to docutils/utils if an update is wanted
+# (API change, see Intro).
-# additional closing delimiters::
+# Replacements::
- print(wrap_string(cd.encode('unicode-escape').decode(),
- startstring="closing_delimiters = (u'"))
+ substitutions = {
+ 'python_version': '.'.join(str(s) for s in sys.version_info[:3]),
+ 'unidata_version': unicodedata.unidata_version,
+ 'openers': wrap_string(o.encode('unicode-escape').decode(),
+ startstring="openers = (u'"),
+ 'closers': wrap_string(c.encode('unicode-escape').decode(),
+ startstring="closers = (u'"),
+ 'delimiters': wrap_string(d.encode('unicode-escape').decode(),
+ startstring="delimiters = (u'"),
+ 'delimiters_wide': wrap_string(
+ d_wide.encode('unicode-escape').decode(),
+ startstring=" delimiters += (u'")
+ }
+ print(module_template % substitutions)
+
# test prints
# ~~~~~~~~~~~
#
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|