[Docutils-checkins] r7232 - in trunk/docutils: HISTORY.txt docs/ref/rst/restructuredtext.txt docutils/parsers/rst/states.py test/test_parsers/test_rst/test_inline_markup.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Author: milde
Date: 2011-11-29 09:22:57 +0100 (Tue, 29 Nov 2011)
New Revision: 7232

Modified:
   trunk/docutils/HISTORY.txt
   trunk/docutils/docs/ref/rst/restructuredtext.txt
   trunk/docutils/docutils/parsers/rst/states.py
   trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py
Log:
Also allow non-ASCII whitespace characters around inline markup.

First part of the fix for [ 3402314 ].
Only editorial changes to the specification;
it already speaks of "whitespace".

Modified: trunk/docutils/HISTORY.txt
===================================================================

--- trunk/docutils/HISTORY.txt	2011-11-25 21:48:01 UTC (rev 7231)
+++ trunk/docutils/HISTORY.txt	2011-11-29 08:22:57 UTC (rev 7232)
@@ -35,6 +35,11 @@
   - DependencyList uses io.FileOutput to prevent errors recording
     non-ASCII filenames (fixes [ 3434355 ].
 
+* docutils/parsers/rst/states.py
+
+  - Allow also non-ASCII whitespace characters around inline markup.
+    (first part of fix for [ 3402314 ]).
+
 * docutils/parsers/rst/tableparser.py
 
   - Fix [ 2926161 ] for simple tables.

Modified: trunk/docutils/docs/ref/rst/restructuredtext.txt
===================================================================
--- trunk/docutils/docs/ref/rst/restructuredtext.txt	2011-11-25 21:48:01 UTC (rev 7231)
+++ trunk/docutils/docs/ref/rst/restructuredtext.txt	2011-11-29 08:22:57 UTC (rev 7232)
@@ -2374,20 +2374,20 @@
 
 1. Inline markup start-strings must start a text block or be
    immediately preceded by whitespace, one of the ASCII
-   characters ``( [ { <``, or the Unicode characters:
+   characters ``' " ( [ { <``, or the Unicode characters:
 
        .. class:: borderless
 
        ===  ==========================================================
        ‘    (U+2018, left single-quote)
        “    (U+201C, left double-quote)
-       ’    (U+2019, apostrophe)
+       ’    (U+2019, right single-quote, or apostrophe)
        «    (U+00AB, left guillemet, or double angle quotation mark)
        ¡    (U+00A1, inverted exclamation mark)
        ¿    (U+00BF, inverted question mark)
        ===  ==========================================================
 
-   The ASCII characters ``" ' - / :`` and the Unicode characters
+   The ASCII characters ``- / :`` and the Unicode characters
 
        .. class:: borderless
 
@@ -2397,8 +2397,7 @@
        ‒    (U+2012, figure dash)
        –    (U+2013, en dash)
        —    (U+2014, em dash)
-
-       " "  (U+00A0, non-breaking space [between the quotes])
+       [ ]  (U+00A0, non-breaking space [between the brackets])
        ===  ==========================================================
 
    are _`delimiters`. They may precede or follow inline markup.
@@ -2411,7 +2410,7 @@
 
 4. Inline markup end-strings must end a text block or be immediately
    followed by whitespace, the ASCII characters
-   ``) ] } > . , ; ! ? \`` or the Unicode characters:
+   ``' " ) ] } > . , ; ! ? \``, the Unicode characters:
 
        .. class:: borderless
 
@@ -2421,14 +2420,16 @@
        »    (U+00BB, right guillemet, or double angle quotation mark)
        ===  ==========================================================
 
-   The `delimiters`_ listed in (1) above may precede or follow inline
-   markup.
+   or the `delimiters`_ listed in (1) above.
 
 5. If an inline markup start-string is immediately preceded by a
    single or double quote, "(", "[", "{", or "<", it must not be
    immediately followed by the corresponding single or double quote,
    ")", "]", "}", or ">".
 
+   .. this also holds for the opening/closing Unicode character pairs
+      (since at least 05. Sep 2008).
+
 6. An inline markup end-string must be separated by at least one
    character from the start-string.
 

Modified: trunk/docutils/docutils/parsers/rst/states.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/states.py	2011-11-25 21:48:01 UTC (rev 7231)
+++ trunk/docutils/docutils/parsers/rst/states.py	2011-11-29 08:22:57 UTC (rev 7232)
@@ -528,14 +528,19 @@
             processed += self.implicit_inline(remaining, lineno)
         return processed, messages
 
+    # Inline object recognition
+    # -------------------------
+    # character categories:
     openers = u'\'"([{<\u2018\u201c\xab\u00a1\u00bf' # see quoted_start below
     closers = u'\'")]}>\u2019\u201d\xbb!?'
-    unicode_delimiters = u'\u2010\u2011\u2012\u2013\u2014\u00a0'
-    start_string_prefix = (u'((?<=^)|(?<=[-/: \\n\u2019%s%s]))'
-                           % (re.escape(unicode_delimiters),
+    delimiters = u'-/:\u2010\u2011\u2012\u2013\u2014\u00a0'
+    # lookahead and look-behind expressions for inline markup rules
+    # (see todo.html#inline-markup-syntax-rules)
+    start_string_prefix = (u'((?<=^)|(?<=\\s|[\u2019%s%s]))'
+                           % (re.escape(delimiters),
                               re.escape(openers)))
-    end_string_suffix = (r'((?=$)|(?=[-/:.,; \n\x00%s%s]))'
-                         % (re.escape(unicode_delimiters),
+    end_string_suffix = (u'((?=$)|(?=\\s|[.,; \x00%s%s]))'
+                         % (re.escape(delimiters),
                             re.escape(closers)))
     non_whitespace_before = r'(?<![ \n])'
     non_whitespace_escape_before = r'(?<![ \n\x00])'
@@ -589,9 +594,9 @@
     patterns = Struct(
           initial=build_regexp(parts),
           emphasis=re.compile(non_whitespace_escape_before
-                              + r'(\*)' + end_string_suffix),
+                              + r'(\*)' + end_string_suffix, re.UNICODE),
           strong=re.compile(non_whitespace_escape_before
-                            + r'(\*\*)' + end_string_suffix),
+                            + r'(\*\*)' + end_string_suffix, re.UNICODE),
           interpreted_or_phrase_ref=re.compile(
               r"""
               %(non_unescaped_whitespace_escape_before)s
@@ -615,7 +620,7 @@
                 >                       # close bracket w/o whitespace before
               )
               $                         # end of string
-              """ % locals(), re.VERBOSE),
+              """ % locals(), re.VERBOSE | re.UNICODE),
           literal=re.compile(non_whitespace_before + '(``)'
                              + end_string_suffix),
           target=re.compile(non_whitespace_escape_before
@@ -623,7 +628,8 @@
           substitution_ref=re.compile(non_whitespace_escape_before
                                       + r'(\|_{0,2})'
                                       + end_string_suffix),
-          email=re.compile(email_pattern % locals() + '$', re.VERBOSE),
+          email=re.compile(email_pattern % locals() + '$',
+                           re.VERBOSE | re.UNICODE),
           uri=re.compile(
                 (r"""
                 %(start_string_prefix)s
@@ -655,7 +661,7 @@
                   )
                 )
                 %(end_string_suffix)s
-                """) % locals(), re.VERBOSE),
+                """) % locals(), re.VERBOSE | re.UNICODE),
           pep=re.compile(
                 r"""
                 %(start_string_prefix)s
@@ -664,12 +670,12 @@
                 |
                   (PEP\s+(?P<pepnum2>\d+))      # reference by name
                 )
-                %(end_string_suffix)s""" % locals(), re.VERBOSE),
+                %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
           rfc=re.compile(
                 r"""
                 %(start_string_prefix)s
                 (RFC(-|\s+)?(?P<rfcnum>\d+))
-                %(end_string_suffix)s""" % locals(), re.VERBOSE))
+                %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
 
     def quoted_start(self, match):
         """Return 1 if inline markup start-string is 'quoted', 0 if not."""
@@ -1044,7 +1050,7 @@
     enum.sequenceregexps = {}
     for sequence in enum.sequences:
         enum.sequenceregexps[sequence] = re.compile(
-              enum.sequencepats[sequence] + '$')
+              enum.sequencepats[sequence] + '$', re.UNICODE)
 
     grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
     """Matches the top (& bottom) of a full table)."""
@@ -1136,7 +1142,8 @@
         return elements
 
     # U+2014 is an em-dash:
-    attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])')
+    attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
+                                     re.UNICODE)
 
     def split_attribution(self, indented, line_offset):
         """
@@ -1793,7 +1800,7 @@
                             [ ]?            # optional space
                             :               # end of reference name
                             ([ ]+|$)        # followed by whitespace
-                            """ % vars(Inliner), re.VERBOSE),
+                            """ % vars(Inliner), re.VERBOSE | re.UNICODE),
           reference=re.compile(r"""
                                (
                                  (?P<simple>%(simplename)s)_
@@ -1815,7 +1822,8 @@
                                     \|               # close delimiter
                                   )
                                   ([ ]+|$)           # followed by whitespace
-                                  """ % vars(Inliner), re.VERBOSE),)
+                                  """ % vars(Inliner),
+                                  re.VERBOSE | re.UNICODE),)
 
     def footnote(self, match):
         src, srcline = self.state_machine.get_source_and_line()
@@ -2262,13 +2270,13 @@
                       \.\.[ ]+          # explicit markup start
                       _                 # target indicator
                       (?![ ]|$)         # first char. not space or EOL
-                      """, re.VERBOSE)),
+                      """, re.VERBOSE | re.UNICODE)),
           (substitution_def,
            re.compile(r"""
                       \.\.[ ]+          # explicit markup start
                       \|                # substitution indicator
                       (?![ ]|$)         # first char. not space or EOL
-                      """, re.VERBOSE)),
+                      """, re.VERBOSE | re.UNICODE)),
           (directive,
            re.compile(r"""
                       \.\.[ ]+          # explicit markup start
@@ -3029,7 +3037,7 @@
         """Match arbitrary quote character on the first line only."""
         self.remove_transition('initial_quoted')
         quote = match.string[0]
-        pattern = re.compile(re.escape(quote))
+        pattern = re.compile(re.escape(quote), re.UNICODE)
         # New transition matches consistent quotes only:
         self.add_transition('quoted',
                             (pattern, self.quoted, self.__class__.__name__))

Modified: trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py
===================================================================
--- trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py	2011-11-25 21:48:01 UTC (rev 7231)
+++ trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py	2011-11-29 08:22:57 UTC (rev 7232)
@@ -1339,7 +1339,7 @@
 """],
 ]
 
-totest['miscellaneous'] = [
+totest['markup recognition rules'] = [
 ["""\
 __This__ should be left alone.
 """,
@@ -1370,7 +1370,7 @@
 """],
 [u"""\
 text-*separated*\u2010*by*\u2011*various*\u2012*dashes*\u2013*and*\u2014*hyphens*.
-\u00bf*punctuation*? \u00a1*examples*!\u00a0*too*.
+\u00bf*punctuation*? \u00a1*examples*!\u00a0*\u00a0no-break-space\u00a0*.
 """,
 u"""\
 <document source="test data">
@@ -1402,10 +1402,113 @@
             examples
         !\xa0
         <emphasis>
-            too
+            \u00a0no-break-space\u00a0
         .
 """],
+# Whitespace characters:                                      
+#  \u180e*MONGOLIAN VOWEL SEPARATOR*\u180e,   fails in Python 2.4
 [u"""\
+text separated by
+*newline*
+or *space* or one of
+\xa0*NO-BREAK SPACE*\xa0,
+\u1680*OGHAM SPACE MARK*\u1680,
+\u2000*EN QUAD*\u2000,
+\u2001*EM QUAD*\u2001,
+\u2002*EN SPACE*\u2002,
+\u2003*EM SPACE*\u2003,
+\u2004*THREE-PER-EM SPACE*\u2004,
+\u2005*FOUR-PER-EM SPACE*\u2005,
+\u2006*SIX-PER-EM SPACE*\u2006,
+\u2007*FIGURE SPACE*\u2007,
+\u2008*PUNCTUATION SPACE*\u2008,
+\u2009*THIN SPACE*\u2009,
+\u200a*HAIR SPACE*\u200a,
+\u202f*NARROW NO-BREAK SPACE*\u202f,
+\u205f*MEDIUM MATHEMATICAL SPACE*\u205f,
+\u3000*IDEOGRAPHIC SPACE*\u3000,
+\u2028*LINE SEPARATOR*\u2028
+""",
+u"""\
+<document source="test data">
+    <paragraph>
+        text separated by
+        <emphasis>
+            newline
+        \n\
+        or \n\
+        <emphasis>
+            space
+         or one of
+        \xa0
+        <emphasis>
+            NO-BREAK SPACE
+        \xa0,
+        \u1680
+        <emphasis>
+            OGHAM SPACE MARK
+        \u1680,
+        \u2000
+        <emphasis>
+            EN QUAD
+        \u2000,
+        \u2001
+        <emphasis>
+            EM QUAD
+        \u2001,
+        \u2002
+        <emphasis>
+            EN SPACE
+        \u2002,
+        \u2003
+        <emphasis>
+            EM SPACE
+        \u2003,
+        \u2004
+        <emphasis>
+            THREE-PER-EM SPACE
+        \u2004,
+        \u2005
+        <emphasis>
+            FOUR-PER-EM SPACE
+        \u2005,
+        \u2006
+        <emphasis>
+            SIX-PER-EM SPACE
+        \u2006,
+        \u2007
+        <emphasis>
+            FIGURE SPACE
+        \u2007,
+        \u2008
+        <emphasis>
+            PUNCTUATION SPACE
+        \u2008,
+        \u2009
+        <emphasis>
+            THIN SPACE
+        \u2009,
+        \u200a
+        <emphasis>
+            HAIR SPACE
+        \u200a,
+        \u202f
+        <emphasis>
+            NARROW NO-BREAK SPACE
+        \u202f,
+        \u205f
+        <emphasis>
+            MEDIUM MATHEMATICAL SPACE
+        \u205f,
+        \u3000
+        <emphasis>
+            IDEOGRAPHIC SPACE
+        \u3000,
+    <paragraph>
+        <emphasis>
+            LINE SEPARATOR
+"""],
+[u"""\
 None of these should be markup (matched openers & closers):
 
 \u2018*\u2019 \u201c*\u201d \xab*\xbb \u00bf*? \u00a1*!





[Docutils-checkins] r7232 - in trunk/docutils: HISTORY.txt docs/ref/rst/restructuredtext.txt docuti

[Docutils-checkins] r7232 - in trunk/docutils: HISTORY.txt docs/ref/rst/restructuredtext.txt docutils/parsers/rst/states.py test/test_parsers/test_rst/test_inline_markup.py