[Docutils-checkins] SF.net SVN: docutils:[8014] trunk/docutils

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 8014
          http://sourceforge.net/p/docutils/code/8014
Author:   milde
Date:     2017-01-05 09:49:26 +0000 (Thu, 05 Jan 2017)
Log Message:
-----------
"Outsourcing" of development code from utils.punctuation_chars.

Modified Paths:
--------------
    trunk/docutils/docutils/utils/punctuation_chars.py

Added Paths:
-----------
    trunk/docutils/tools/dev/generate_punctuation_chars.py

Modified: trunk/docutils/docutils/utils/punctuation_chars.py
===================================================================

--- trunk/docutils/docutils/utils/punctuation_chars.py	2017-01-04 22:58:22 UTC (rev 8013)
+++ trunk/docutils/docutils/utils/punctuation_chars.py	2017-01-05 09:49:26 UTC (rev 8014)
@@ -17,38 +17,37 @@
 import sys, re
 import unicodedata
 
-# punctuation characters around inline markup
-# ===========================================
+# Docutils character category patterns
+# ------------------------------------
 #
-# This module provides the lists of characters for the implementation of
-# the `inline markup recognition rules`_ in the reStructuredText parser
-# `<../parsers/rst/states.py>`__.
-# For efficiency and stability, the lists are pre-generated and stored in
-# module-level variables.
+# This module provides patterns for the implementation of the
+# `inline markup recognition rules`_ in the reStructuredText parser
+# `<../parsers/rst/states.py>`__ based on Unicode character categories.
+# The patterns are used inside ``[ ]`` in regular expressions.
 #
+# Rule (5) requires determination of matching open/close pairs. However,
+# the pairing of open/close quotes is ambiguous due to different typographic
+# conventions in different languages. The ``quote_pairs`` function tests
+# whether two characters form an open/close pair.
+#
+# The patterns are generated by
+# ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
+# on the Python version and avoid the time-consuming generation with every
+# Docutils run. See there for motives and implementation details.
+#
 # The category of some characters changed with the development of the Unicode
 # standard. The current lists are generated with the help of the "unicodedata"
 # module of Python 2.7 (based on the UnicodeData.txt file version 5.2.0).
 #
-# Running this file as a standalone module (``__main__``) checks the
-# definitions against a re-calculation. Updating the lists with a new
-# Unicode standard version is an API change (may render valid rST documents
-# invalid). It should only be done for "feature releases" and requires
-# also updating the specification of `inline markup recognition rules`_ in
-# ../../docs/ref/rst/restructuredtext.txt.
+# Updating the patterns with a new Unicode standard version is an API
+# change (may render valid rST documents invalid). It should only be done for
+# "feature releases" and requires also updating the specification of `inline
+# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
 #
 # .. _inline markup recognition rules:
 #     ../../docs/ref/rst/restructuredtext.html#inline-markup
 #
-#
-# Docutils punctuation category sample strings
-# --------------------------------------------
-#
-# The sample strings are generated by punctuation_samples() and put here
-# literal to avoid the time-consuming generation with every Docutils run
-# and prevent dependance on the Python version.
-# As the samples are used inside ``[ ]`` in regular expressions, hyphen and
-# square brackets are escaped. ::
+# ::
 
 openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
            u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
@@ -101,10 +100,8 @@
 # Matching open/close quotes
 # --------------------------
 #
-# Rule (5) requires determination of matching open/close pairs. However,
-# the pairing of open/close quotes is ambigue due to  different typographic
-# conventions in different languages. The following dictionary specifies
-# additional valid matches::
+# The pairing of open/close quotes is ambiguous due to different typographic
+# conventions in different languages. Specify additional valid matches::
 
 quote_pairs = {# open char: matching closing characters
             u'\xbb': u'\xbb', # Swedish
@@ -124,281 +121,3 @@
     except ValueError:  # c1 not in openers
         return False
     return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')
-
-
-# Running this file as a standalone module checks the definitions against a
-# re-calculation::
-
-if __name__ == '__main__':
-
-
-# Unicode punctuation character categories
-# ----------------------------------------
-#
-# For details about Unicode categories, see
-# http://www.unicode.org/Public/5.1.0/ucd/UCD.html#General_Category_Values
-# ::
-
-    unicode_punctuation_categories = {
-        # 'Pc': 'Connector', # not used in Docutils inline markup recognition
-        'Pd': 'Dash',
-        'Ps': 'Open',
-        'Pe': 'Close',
-        'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
-        'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
-        'Po': 'Other'
-        }
-    """Unicode character categories for punctuation"""
-
-
-# generate character pattern strings
-# ==================================
-#
-# ::
-
-    def unicode_charlists(categories, cp_min=0, cp_max=None):
-        """Return dictionary of Unicode character lists.
-
-        For each of the `catagories`, an item contains a list with all Unicode
-        characters with `cp_min` <= code-point <= `cp_max` that belong to
-        the category.
-
-        The default values check every code-point supported by Python
-        (`sys.maxint` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
-        build, i.e. ucs4 and ucs2 respectively).
-        """
-        # Determine highest code point with one of the given categories
-        # (may shorten the search time considerably if there are many
-        # categories with not too high characters):
-        if cp_max is None:
-            cp_max = max(x for x in xrange(sys.maxunicode+1)
-                        if unicodedata.category(unichr(x)) in categories)
-            # print cp_max # => 74867 for unicode_punctuation_categories
-        charlists = {}
-        for cat in categories:
-            charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
-                              if unicodedata.category(unichr(x)) == cat]
-        return charlists
-
-
-# Character categories in Docutils
-# --------------------------------
-#
-# ::
-
-    def punctuation_samples():
-
-        """Docutils punctuation category sample strings.
-
-        Return list of sample strings for the categories "Open", "Close",
-        "Delimiters" and "Closing-Delimiters" used in the `inline markup
-        recognition rules`_.
-        """
-
-        # Lists with characters in Unicode punctuation character categories
-        cp_min = 160 # ASCII chars have special rules for backwards compatibility
-        ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
-
-        # match opening/closing characters
-        # --------------------------------
-        # Rearange the lists to ensure matching characters at the same
-        # index position.
-
-        # low quotation marks are also used as closers (e.g. in Greek)
-        # move them to category Pi:
-        ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
-        ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
-        ucharlists['Pi'] += [u'‚', u'„']
-
-        ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
-        ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
-        ucharlists['Pf'] += [u'‛', u'‟']
-
-        # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
-        ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
-
-        # print u''.join(ucharlists['Ps']).encode('utf8')
-        # print u''.join(ucharlists['Pe']).encode('utf8')
-        # print u''.join(ucharlists['Pi']).encode('utf8')
-        # print u''.join(ucharlists['Pf']).encode('utf8')
-
-        # The Docutils character categories
-        # ---------------------------------
-        #
-        # The categorization of ASCII chars is non-standard to reduce
-        # both false positives and need for escaping. (see `inline markup
-        # recognition rules`_)
-
-        # allowed before markup if there is a matching closer
-        openers = [u'"\'(<\\[{']
-        for category in ('Ps', 'Pi', 'Pf'):
-            openers.extend(ucharlists[category])
-
-        # allowed after markup if there is a matching opener
-        closers = [u'"\')>\\]}']
-        for category in ('Pe', 'Pf', 'Pi'):
-            closers.extend(ucharlists[category])
-
-        # non-matching, allowed on both sides
-        delimiters = [u'\\-/:']
-        for category in ('Pd', 'Po'):
-            delimiters.extend(ucharlists[category])
-
-        # non-matching, after markup
-        closing_delimiters = [r'\\.,;!?']
-
-        # # Test open/close matching:
-        # for i in range(min(len(openers),len(closers))):
-        #     print '%4d    %s    %s' % (i, openers[i].encode('utf8'),
-        #                                closers[i].encode('utf8'))
-
-        return [u''.join(chars) for chars in (openers, closers, delimiters,
-                                              closing_delimiters)]
-
-    def separate_wide_chars(s):
-        """Return (s1,s2) with characters above 0xFFFF in s2"""
-        maxunicode_narrow = 0xFFFF
-        l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow]
-        l2 = [ch for ch in s if ord(ch) > maxunicode_narrow]
-        return ''.join(l1), ''.join(l2)
-
-    def mark_intervals(s):
-        """Return s with shortcut notation for runs of consecutive characters
-
-        Sort string and replace 'cdef' by 'c-f' and similar.
-        """
-        l =[]
-        s = [ord(ch) for ch in s]
-        s.sort()
-        for n in s:
-            try:
-                if l[-1][-1]+1 == n:
-                    l[-1].append(n)
-                else:
-                    l.append([n])
-            except IndexError:
-                l.append([n])
-
-        l2 = []
-        for i in l:
-            i = [unichr(n) for n in i]
-            if len(i) > 2:
-                i = i[0], u'-', i[-1]
-            l2.extend(i)
-
-        return ''.join(l2)
-
-    def wrap_string(s, startstring= "(u'",
-                       endstring = "')", wrap=65):
-        """Line-wrap a unicode string literal definition."""
-        c = len(startstring)
-        contstring = "'\n" + ' ' * (len(startstring)-2) + "u'"
-        l = [startstring]
-        for ch in s.replace("'", r"\'"):
-            c += 1
-            if ch == '\\' and c > wrap:
-                c = len(startstring)
-                ch = contstring + ch
-            l.append(ch)
-        l.append(endstring)
-        return ''.join(l)
-
-
-    def print_differences(old, new, name):
-        """List characters missing in old/new."""
-        if old != new:
-            print('new %s:' % name)
-            for c in new:
-                if c not in old:
-                    print '  %04x'%ord(c), unicodedata.name(c)
-            print('removed %s:' % name)
-            for c in old:
-                if c not in new:
-                    print '  %04x'%ord(c), unicodedata.name(c)
-
-
-# print results
-# =============
-#
-# (re) create and compare the samples:
-#
-# ::
-
-    (o, c, d, cd) = punctuation_samples()
-    o, o_wide = separate_wide_chars(o)
-    c, c_wide = separate_wide_chars(c)
-    d, d_wide = separate_wide_chars(d)
-    d = d[:5] + mark_intervals(d[5:])
-    d_wide = mark_intervals(d_wide)
-
-    print_differences(openers, o, 'openers')
-    if o_wide:
-        print '+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')
-    print_differences(closers, c, 'closers')
-    if c_wide:
-        print '+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')
-
-    print_differences(delimiters, d + d_wide, 'delimiters')
-    print_differences(closing_delimiters, cd, 'closing_delimiters')
-
-# Print literal code to define the character sets:
-#
-# ::
-
-    print '# based on Unicode version', unicodedata.unidata_version
-
-    # `openers` and `closers` must be verbose and keep order because they are
-    # also used in `match_chars()`.
-    print wrap_string(o.encode('unicode-escape').decode(),
-                      startstring="openers = (u'")
-    print wrap_string(c.encode('unicode-escape').decode(),
-                      startstring="closers = (u'")
-    # delimiters: sort and use shortcut for intervals (saves ~150 characters):
-    print wrap_string(d.encode('unicode-escape').decode(),
-                      startstring="delimiters = (u'")
-    # add characters in the upper plane only in a "wide" build:
-    print 'if sys.maxunicode >= 0x10FFFF: # "wide" build'
-    print wrap_string(d_wide.encode('unicode-escape').decode(),
-                      startstring="    delimiters += (u'")
-    # additional closing delimiters:
-    print wrap_string(cd.encode('unicode-escape').decode(),
-                      startstring="closing_delimiters = (u'")
-     
-
-# test prints
-#
-# ::
-
-    # print "wide" Unicode characters:
-    # ucharlists = unicode_charlists(unicode_punctuation_categories)
-    # for key in ucharlists:
-    #     if key.endswith('wide'):
-    #         print key, ucharlists[key]
-
-    # print 'openers = ', repr(openers)
-    # print 'closers = ', repr(closers)
-    # print 'delimiters = ', repr(delimiters)
-    # print 'closing_delimiters = ', repr(closing_delimiters)
-
-    # ucharlists = unicode_charlists(unicode_punctuation_categories)
-    # for cat, chars in ucharlists.items():
-    #     # print cat, chars
-    #     # compact output (visible with a comprehensive font):
-    #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
-
-# verbose print
-#
-# ::
-
-    # print 'openers:'
-    # for ch in openers:
-    #     print ch.encode('utf8'), unicodedata.name(ch)
-    # print 'closers:'
-    # for ch in closers:
-    #     print ch.encode('utf8'), unicodedata.name(ch)
-    # print 'delimiters:'
-    # for ch in delimiters:
-    #     print ch.encode('utf8'), unicodedata.name(ch)
-    # print 'closing_delimiters:'
-    # for ch in closing_delimiters:
-    #     print ch.encode('utf8'), unicodedata.name(ch)

Added: trunk/docutils/tools/dev/generate_punctuation_chars.py
===================================================================
--- trunk/docutils/tools/dev/generate_punctuation_chars.py	                        (rev 0)
+++ trunk/docutils/tools/dev/generate_punctuation_chars.py	2017-01-05 09:49:26 UTC (rev 8014)
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# :Copyright: © 2011, 2016 Günter Milde.
+# :License: Released under the terms of the `2-Clause BSD license`_, in short:
+#
+#    Copying and distribution of this file, with or without modification,
+#    are permitted in any medium without royalty provided the copyright
+#    notice and this notice are preserved.
+#    This file is offered as-is, without any warranty.
+#
+# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
+
+# :Id: $Id$
+#
+# ::
+
+import sys, re
+import unicodedata
+
+# import the punctuation_chars module from the source or Py3k build
+# path for local Python modules
+if sys.version_info < (3,):
+    sys.path.insert(0, '../../docutils')
+else:
+    sys.path.insert(0, '../../build/lib')
+    unichr = chr
+
+from docutils.utils.punctuation_chars import (openers, closers, delimiters,
+                                              closing_delimiters)
+
+# (re)generate the utils.punctuation_chars module
+# ===============================================
+#
+# The category of some characters may change with the development of the
+# Unicode standard. This tool checks the patterns in `utils.punctuation_chars`
+# against a re-calculation based on the "unicodedata" stdlib module
+# which may give different results for different Python versions.
+#
+# Updating the patterns with a new (Python|Unicode standard) version is an API
+# change (may render valid rST documents invalid). It should only be done for
+# "feature releases" and requires also updating the specification of `inline
+# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
+#
+# Generation of the character category patterns
+# ----------------------------------------------
+#
+#
+# Unicode punctuation character categories
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For details about Unicode categories, see
+# http://www.unicode.org/Public/5.1.0/ucd/UCD.html#General_Category_Values
+# ::
+
+# Mapping of the two-letter Unicode general-category codes for punctuation
+# to descriptive names. Only the keys are evaluated when the mapping is
+# passed to `unicode_charlists()`; the names are for the human reader.
+unicode_punctuation_categories = {
+    # 'Pc': 'Connector', # not used in Docutils inline markup recognition
+    'Pd': 'Dash',
+    'Ps': 'Open',
+    'Pe': 'Close',
+    'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
+    'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
+    'Po': 'Other'
+    }
+"""Unicode character categories for punctuation"""
+
+
+# generate character pattern strings
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+def unicode_charlists(categories, cp_min=0, cp_max=None):
+    """Return dictionary of Unicode character lists.
+
+    For each of the `categories`, an item contains a list with all Unicode
+    characters with `cp_min` <= code-point <= `cp_max` that belong to
+    the category. The characters in each list are ordered by code-point.
+
+    `categories` is an iterable of Unicode general-category codes
+    (if a dictionary is passed, its keys are used).
+
+    If `cp_max` is None, every code-point supported by Python is checked
+    (`sys.maxunicode` is 0x10FFFF in a "wide" build and 0xFFFF in a
+    "narrow" build, i.e. ucs4 and ucs2 respectively).
+    An empty `categories` argument yields an empty dictionary.
+    """
+    if cp_max is None:
+        cp_max = sys.maxunicode
+    charlists = dict((cat, []) for cat in categories)
+    # Single pass over the range: the category of every code-point is
+    # looked up once instead of once per requested category.
+    for x in range(cp_min, cp_max+1):
+        char = unichr(x)
+        cat = unicodedata.category(char)
+        if cat in charlists:
+            charlists[cat].append(char)
+    return charlists
+
+
+# Character categories in Docutils
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+def character_category_patterns():
+
+    """Docutils character category patterns.
+
+    Return list of pattern strings for the categories "Open", "Close",
+    "Delimiters" and "Closing-Delimiters" used in the `inline markup
+    recognition rules`_.
+
+    The list holds four strings in the order openers, closers,
+    delimiters, closing delimiters. Each string starts with the
+    hard-coded ASCII characters of the category, followed (except for the
+    closing delimiters) by the non-ASCII characters collected from the
+    Unicode punctuation categories.
+    """
+
+    # Skip code points below U+00A0:
+    cp_min = 160 # ASCII chars have special rules for backwards compatibility
+    ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
+    """Strings of characters in Unicode punctuation character categories"""
+
+    # match opening/closing characters
+    # --------------------------------
+    # Rearrange the lists to ensure matching characters at the same
+    # index position.
+
+    # low quotation marks are also used as closers (e.g. in Greek)
+    # move them to category Pi:
+    ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
+    ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
+    ucharlists['Pi'] += [u'‚', u'„']
+
+    # high-reversed-9 quotation marks are moved from Pi to Pf:
+    ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    ucharlists['Pf'] += [u'‛', u'‟']
+
+    # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
+    # insert 301D into Ps at the index that 301F has in Pe.
+    ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
+
+    # print(u''.join(ucharlists['Ps']).encode('utf8'))
+    # print(u''.join(ucharlists['Pe']).encode('utf8'))
+    # print(u''.join(ucharlists['Pi']).encode('utf8'))
+    # print(u''.join(ucharlists['Pf']).encode('utf8'))
+
+    # The Docutils character categories
+    # ---------------------------------
+    #
+    # The categorization of ASCII chars is non-standard to reduce
+    # both false positives and need for escaping. (see `inline markup
+    # recognition rules`_)
+
+    # allowed before markup if there is a matching closer
+    openers = [u'"\'(<\\[{']
+    for category in ('Ps', 'Pi', 'Pf'):
+        openers.extend(ucharlists[category])
+
+    # allowed after markup if there is a matching opener
+    # (Pf and Pi are appended in swapped order compared to `openers`)
+    closers = [u'"\')>\\]}']
+    for category in ('Pe', 'Pf', 'Pi'):
+        closers.extend(ucharlists[category])
+
+    # non-matching, allowed on both sides
+    delimiters = [u'\\-/:']
+    for category in ('Pd', 'Po'):
+        delimiters.extend(ucharlists[category])
+
+    # non-matching, after markup
+    closing_delimiters = [r'\\.,;!?']
+
+    # # Test open/close matching:
+    # for i in range(min(len(openers),len(closers))):
+    #     print('%4d    %s    %s' % (i, openers[i].encode('utf8'),
+    #                                closers[i].encode('utf8')))
+
+    # Join each list of characters/character groups into one string:
+    return [u''.join(chars) for chars in (openers, closers, delimiters,
+                                            closing_delimiters)]
+
+def separate_wide_chars(s):
+    """Split `s` into characters inside and outside the 16-bit range.
+
+    Return a tuple ``(s1, s2)``: `s1` holds the characters with
+    code-point <= 0xFFFF, `s2` the characters above 0xFFFF. The relative
+    order of the characters is kept in both parts.
+    """
+    bmp_limit = 0xFFFF
+    narrow, wide = [], []
+    for char in s:
+        if ord(char) > bmp_limit:
+            wide.append(char)
+        else:
+            narrow.append(char)
+    return ''.join(narrow), ''.join(wide)
+
+def mark_intervals(s):
+    """Return `s` sorted, with runs of consecutive characters abbreviated.
+
+    The characters are sorted by code-point. Every run of three or more
+    consecutive code-points is replaced by "first-last" notation
+    ('cdef' becomes 'c-f'); shorter runs are kept as they are.
+    """
+    codepoints = sorted(ord(ch) for ch in s)
+
+    # Group the sorted code-points into runs of consecutive values.
+    runs = []
+    for cp in codepoints:
+        if runs and runs[-1][-1] + 1 == cp:
+            runs[-1].append(cp)
+        else:
+            runs.append([cp])
+
+    # Convert back to characters, collapsing the long runs.
+    pieces = []
+    for run in runs:
+        chars = [unichr(cp) for cp in run]
+        if len(chars) > 2:
+            chars = [chars[0], u'-', chars[-1]]
+        pieces.extend(chars)
+
+    return ''.join(pieces)
+
+def wrap_string(s, startstring="(u'", endstring="')", wrap=65):
+    """Line-wrap a unicode string literal definition.
+
+    Return `startstring` + `s` + `endstring` with single quotes in `s`
+    backslash-escaped. Once more than `wrap` characters are on the
+    current line, the literal is closed in front of the next backslash
+    and continued as a new ``u'...`` literal on the following line,
+    indented by ``len(startstring) - 2`` spaces.
+    """
+    indent = ' ' * (len(startstring) - 2)
+    continuation = "'\n" + indent + "u'"
+    column = len(startstring)
+    parts = [startstring]
+    for char in s.replace("'", r"\'"):
+        column += 1
+        if char == '\\' and column > wrap:
+            # Lines are only broken directly in front of a backslash.
+            parts.append(continuation + char)
+            column = len(startstring)
+        else:
+            parts.append(char)
+    parts.append(endstring)
+    return ''.join(parts)
+
+
+def print_differences(old, new, name):
+    """List characters missing in old/new.
+
+    Print nothing if `old` equals `new`. Otherwise print, under the
+    headings "new <name>:" and "removed <name>:", one line with the
+    hexadecimal code-point and the Unicode name for every character that
+    is only in `new` or only in `old`, respectively.
+    """
+    if old == new:
+        return
+    for label, chars, other in (('new', new, old), ('removed', old, new)):
+        print('%s %s:' % (label, name))
+        for c in chars:
+            if c not in other:
+                # One pre-formatted string gives the same output with the
+                # Python 2 print statement and the Python 3 print function;
+                # the default avoids a ValueError for unnamed characters.
+                print('  %04x %s'
+                      % (ord(c), unicodedata.name(c, '<unnamed>')))
+
+
+# Output
+# ------
+#
+# ::
+
+if __name__ == '__main__':
+
+# (Re)create and compare character patterns
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+    (o, c, d, cd) = character_category_patterns()
+    o, o_wide = separate_wide_chars(o)
+    c, c_wide = separate_wide_chars(c)
+    d, d_wide = separate_wide_chars(d)
+    # keep the first five characters of `d` in place, sort and abbreviate
+    # the remainder:
+    d = d[:5] + mark_intervals(d[5:])
+    d_wide = mark_intervals(d_wide)
+
+    if sys.version_info < (3,):
+        # Python 2 only: interpolate the "wide" characters UTF-8 encoded.
+        # With Python 3, interpolating `bytes` would print their repr
+        # (b'...') instead of the characters.
+        o_wide = o_wide.encode('utf8')
+        c_wide = c_wide.encode('utf8')
+
+    print_differences(openers, o, 'openers')
+    if o_wide:
+        print('+ openers-wide = ur"""%s"""' % o_wide)
+    print_differences(closers, c, 'closers')
+    if c_wide:
+        print('+ closers-wide = ur"""%s"""' % c_wide)
+
+    print_differences(delimiters, d + d_wide, 'delimiters')
+    print_differences(closing_delimiters, cd, 'closing_delimiters')
+
+# Print literal code to define the character sets
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# This code can be copied to punctuation_chars.py if an update is wanted.
+
+# Unicode version::
+
+    print('# based on Unicode version %s' % unicodedata.unidata_version)
+
+# `openers` and `closers` must be verbose and keep order because they are
+# also used in `match_chars()`::
+
+    print(wrap_string(o.encode('unicode-escape').decode(),
+                      startstring="openers = (u'"))
+    print(wrap_string(c.encode('unicode-escape').decode(),
+                      startstring="closers = (u'"))
+
+# delimiters: sort and use shortcut for intervals (saves ~150 characters)::
+
+    print(wrap_string(d.encode('unicode-escape').decode(),
+                      startstring="delimiters = (u'"))
+
+# add characters in the upper plane only in a "wide" build::
+
+    print('if sys.maxunicode >= 0x10FFFF: # "wide" build')
+    print(wrap_string(d_wide.encode('unicode-escape').decode(),
+                      startstring="    delimiters += (u'"))
+
+# additional closing delimiters::
+
+    print(wrap_string(cd.encode('unicode-escape').decode(),
+                      startstring="closing_delimiters = (u'"))
+
+
+# test prints
+# ~~~~~~~~~~~
+#
+# For interactive use in development you may uncomment the following
+# definitions::
+
+    # print "wide" Unicode characters:
+    # ucharlists = unicode_charlists(unicode_punctuation_categories)
+    # for key in ucharlists:
+    #     if key.endswith('wide'):
+    #         print(key, ucharlists[key])
+
+    # print('openers = ', repr(openers))
+    # print('closers = ', repr(closers))
+    # print('delimiters = ', repr(delimiters))
+    # print('closing_delimiters = ', repr(closing_delimiters))
+
+    # ucharlists = unicode_charlists(unicode_punctuation_categories)
+    # for cat, chars in ucharlists.items():
+    #     # print(cat, chars)
+    #     # compact output (visible with a comprehensive font):
+    #     print(u":%s: %s" % (cat, u''.join(chars)))
+
+# verbose print
+#
+# ::
+
+    # print('openers:')
+    # for ch in openers:
+    #     print(ch, unicodedata.name(ch))
+    # print('closers:')
+    # for ch in closers:
+    #     print(ch, unicodedata.name(ch))
+    # print('delimiters:')
+    # for ch in delimiters:
+    #     print(ch, unicodedata.name(ch))
+    # print('closing_delimiters:')
+    # for ch in closing_delimiters:
+    #     print(ch, unicodedata.name(ch))


Property changes on: trunk/docutils/tools/dev/generate_punctuation_chars.py
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.