Author: milde Date: 2011-11-29 09:22:57 +0100 (Tue, 29 Nov 2011) New Revision: 7232 Modified: trunk/docutils/HISTORY.txt trunk/docutils/docs/ref/rst/restructuredtext.txt trunk/docutils/docutils/parsers/rst/states.py trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py Log: Allow also non-ASCII whitespace characters around inline markup. First part of fix for [ 3402314 ] Only editorial changes to the specification, it already speaks of "whitespace". Modified: trunk/docutils/HISTORY.txt =================================================================== --- trunk/docutils/HISTORY.txt 2011-11-25 21:48:01 UTC (rev 7231) +++ trunk/docutils/HISTORY.txt 2011-11-29 08:22:57 UTC (rev 7232) @@ -35,6 +35,11 @@ - DependencyList uses io.FileOutput to prevent errors recording non-ASCII filenames (fixes [ 3434355 ]. +* docutils/parsers/rst/states.py + + - Allow also non-ASCII whitespace characters around inline markup. + (first part of fix for [ 3402314 ]). + * docutils/parsers/rst/tableparser.py - Fix [ 2926161 ] for simple tables. Modified: trunk/docutils/docs/ref/rst/restructuredtext.txt =================================================================== --- trunk/docutils/docs/ref/rst/restructuredtext.txt 2011-11-25 21:48:01 UTC (rev 7231) +++ trunk/docutils/docs/ref/rst/restructuredtext.txt 2011-11-29 08:22:57 UTC (rev 7232) @@ -2374,20 +2374,20 @@ 1. Inline markup start-strings must start a text block or be immediately preceded by whitespace, one of the ASCII - characters ``( [ { <``, or the Unicode characters: + characters ``' " ( [ { <``, or the Unicode characters: .. class:: borderless === ========================================================== ‘ (U+2018, left single-quote) “ (U+201C, left double-quote) - ’ (U+2019, apostrophe) + ’ (U+2019, right single-quote, or apostrophe) « (U+00AB, left guillemet, or double angle quotation mark) ¡ (U+00A1, inverted exclamation mark) ¿ (U+00BF, inverted question mark) === ========================================================== - The ASCII characters ``" ' - / :`` and the Unicode characters + The ASCII characters ``- / :`` and the Unicode characters .. class:: borderless @@ -2397,8 +2397,7 @@ ‒ (U+2012, figure dash) – (U+2013, en dash) — (U+2014, em dash) - - " " (U+00A0, non-breaking space [between the quotes]) + [ ] (U+00A0, non-breaking space [between the brackets]) === ========================================================== are _`delimiters`. They may precede or follow inline markup. @@ -2411,7 +2410,7 @@ 4. Inline markup end-strings must end a text block or be immediately followed by whitespace, the ASCII characters - ``) ] } > . , ; ! ? \`` or the Unicode characters: + ``' " ) ] } > . , ; ! ? \``, the Unicode characters: .. class:: borderless @@ -2421,14 +2420,16 @@ » (U+00BB, right guillemet, or double angle quotation mark) === ========================================================== - The `delimiters`_ listed in (1) above may precede or follow inline - markup. + or the `delimiters`_ listed in (1) above. 5. If an inline markup start-string is immediately preceded by a single or double quote, "(", "[", "{", or "<", it must not be immediately followed by the corresponding single or double quote, ")", "]", "}", or ">". + .. this also holds for the opening/closing Unicode character pairs + (since at least 05. Sep 2008). + 6. An inline markup end-string must be separated by at least one character from the start-string. Modified: trunk/docutils/docutils/parsers/rst/states.py =================================================================== --- trunk/docutils/docutils/parsers/rst/states.py 2011-11-25 21:48:01 UTC (rev 7231) +++ trunk/docutils/docutils/parsers/rst/states.py 2011-11-29 08:22:57 UTC (rev 7232) @@ -528,14 +528,19 @@ processed += self.implicit_inline(remaining, lineno) return processed, messages + # Inline object recognition + # ------------------------- + # character categories: openers = u'\'"([{<\u2018\u201c\xab\u00a1\u00bf' # see quoted_start below closers = u'\'")]}>\u2019\u201d\xbb!?' - unicode_delimiters = u'\u2010\u2011\u2012\u2013\u2014\u00a0' - start_string_prefix = (u'((?<=^)|(?<=[-/: \\n\u2019%s%s]))' - % (re.escape(unicode_delimiters), + delimiters = u'-/:\u2010\u2011\u2012\u2013\u2014\u00a0' + # lookahead and look-behind expressions for inline markup rules + # (see todo.html#inline-markup-syntax-rules) + start_string_prefix = (u'((?<=^)|(?<=\\s|[\u2019%s%s]))' + % (re.escape(delimiters), re.escape(openers))) - end_string_suffix = (r'((?=$)|(?=[-/:.,; \n\x00%s%s]))' - % (re.escape(unicode_delimiters), + end_string_suffix = (u'((?=$)|(?=\\s|[.,; \x00%s%s]))' + % (re.escape(delimiters), re.escape(closers))) non_whitespace_before = r'(?<![ \n])' non_whitespace_escape_before = r'(?<![ \n\x00])' @@ -589,9 +594,9 @@ patterns = Struct( initial=build_regexp(parts), emphasis=re.compile(non_whitespace_escape_before - + r'(\*)' + end_string_suffix), + + r'(\*)' + end_string_suffix, re.UNICODE), strong=re.compile(non_whitespace_escape_before - + r'(\*\*)' + end_string_suffix), + + r'(\*\*)' + end_string_suffix, re.UNICODE), interpreted_or_phrase_ref=re.compile( r""" %(non_unescaped_whitespace_escape_before)s @@ -615,7 +620,7 @@ > # close bracket w/o whitespace before ) $ # end of string - """ % locals(), re.VERBOSE), + """ % locals(), re.VERBOSE | re.UNICODE), literal=re.compile(non_whitespace_before + '(``)' + end_string_suffix), target=re.compile(non_whitespace_escape_before @@ -623,7 +628,8 @@ substitution_ref=re.compile(non_whitespace_escape_before + r'(\|_{0,2})' + end_string_suffix), - email=re.compile(email_pattern % locals() + '$', re.VERBOSE), + email=re.compile(email_pattern % locals() + '$', + re.VERBOSE | re.UNICODE), uri=re.compile( (r""" %(start_string_prefix)s @@ -655,7 +661,7 @@ ) ) %(end_string_suffix)s - """) % locals(), re.VERBOSE), + """) % locals(), re.VERBOSE | re.UNICODE), pep=re.compile( r""" %(start_string_prefix)s @@ -664,12 +670,12 @@ | (PEP\s+(?P<pepnum2>\d+)) # reference by name ) - %(end_string_suffix)s""" % locals(), re.VERBOSE), + %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE), rfc=re.compile( r""" %(start_string_prefix)s (RFC(-|\s+)?(?P<rfcnum>\d+)) - %(end_string_suffix)s""" % locals(), re.VERBOSE)) + %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE)) def quoted_start(self, match): """Return 1 if inline markup start-string is 'quoted', 0 if not.""" @@ -1044,7 +1050,7 @@ enum.sequenceregexps = {} for sequence in enum.sequences: enum.sequenceregexps[sequence] = re.compile( - enum.sequencepats[sequence] + '$') + enum.sequencepats[sequence] + '$', re.UNICODE) grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$') """Matches the top (& bottom) of a full table).""" @@ -1136,7 +1142,8 @@ return elements # U+2014 is an em-dash: - attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])') + attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])', + re.UNICODE) def split_attribution(self, indented, line_offset): """ @@ -1793,7 +1800,7 @@ [ ]? # optional space : # end of reference name ([ ]+|$) # followed by whitespace - """ % vars(Inliner), re.VERBOSE), + """ % vars(Inliner), re.VERBOSE | re.UNICODE), reference=re.compile(r""" ( (?P<simple>%(simplename)s)_ @@ -1815,7 +1822,8 @@ \| # close delimiter ) ([ ]+|$) # followed by whitespace - """ % vars(Inliner), re.VERBOSE),) + """ % vars(Inliner), + re.VERBOSE | re.UNICODE),) def footnote(self, match): src, srcline = self.state_machine.get_source_and_line() @@ -2262,13 +2270,13 @@ \.\.[ ]+ # explicit markup start _ # target indicator (?![ ]|$) # first char. not space or EOL - """, re.VERBOSE)), + """, re.VERBOSE | re.UNICODE)), (substitution_def, re.compile(r""" \.\.[ ]+ # explicit markup start \| # substitution indicator (?![ ]|$) # first char. not space or EOL - """, re.VERBOSE)), + """, re.VERBOSE | re.UNICODE)), (directive, re.compile(r""" \.\.[ ]+ # explicit markup start @@ -3029,7 +3037,7 @@ """Match arbitrary quote character on the first line only.""" self.remove_transition('initial_quoted') quote = match.string[0] - pattern = re.compile(re.escape(quote)) + pattern = re.compile(re.escape(quote), re.UNICODE) # New transition matches consistent quotes only: self.add_transition('quoted', (pattern, self.quoted, self.__class__.__name__)) Modified: trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py =================================================================== --- trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py 2011-11-25 21:48:01 UTC (rev 7231) +++ trunk/docutils/test/test_parsers/test_rst/test_inline_markup.py 2011-11-29 08:22:57 UTC (rev 7232) @@ -1339,7 +1339,7 @@ """], ] -totest['miscellaneous'] = [ +totest['markup recognition rules'] = [ ["""\ __This__ should be left alone. """, @@ -1370,7 +1370,7 @@ """], [u"""\ text-*separated*\u2010*by*\u2011*various*\u2012*dashes*\u2013*and*\u2014*hyphens*. -\u00bf*punctuation*? \u00a1*examples*!\u00a0*too*. +\u00bf*punctuation*? \u00a1*examples*!\u00a0*\u00a0no-break-space\u00a0*. """, u"""\ <document source="test data"> @@ -1402,10 +1402,113 @@ examples !\xa0 <emphasis> - too + \u00a0no-break-space\u00a0 . """], +# Whitespace characters: +# \u180e*MONGOLIAN VOWEL SEPARATOR*\u180e, fails in Python 2.4 [u"""\ +text separated by +*newline* +or *space* or one of +\xa0*NO-BREAK SPACE*\xa0, +\u1680*OGHAM SPACE MARK*\u1680, +\u2000*EN QUAD*\u2000, +\u2001*EM QUAD*\u2001, +\u2002*EN SPACE*\u2002, +\u2003*EM SPACE*\u2003, +\u2004*THREE-PER-EM SPACE*\u2004, +\u2005*FOUR-PER-EM SPACE*\u2005, +\u2006*SIX-PER-EM SPACE*\u2006, +\u2007*FIGURE SPACE*\u2007, +\u2008*PUNCTUATION SPACE*\u2008, +\u2009*THIN SPACE*\u2009, +\u200a*HAIR SPACE*\u200a, +\u202f*NARROW NO-BREAK SPACE*\u202f, +\u205f*MEDIUM MATHEMATICAL SPACE*\u205f, +\u3000*IDEOGRAPHIC SPACE*\u3000, +\u2028*LINE SEPARATOR*\u2028 +""", +u"""\ +<document source="test data"> + <paragraph> + text separated by + <emphasis> + newline + \n\ + or \n\ + <emphasis> + space + or one of + \xa0 + <emphasis> + NO-BREAK SPACE + \xa0, + \u1680 + <emphasis> + OGHAM SPACE MARK + \u1680, + \u2000 + <emphasis> + EN QUAD + \u2000, + \u2001 + <emphasis> + EM QUAD + \u2001, + \u2002 + <emphasis> + EN SPACE + \u2002, + \u2003 + <emphasis> + EM SPACE + \u2003, + \u2004 + <emphasis> + THREE-PER-EM SPACE + \u2004, + \u2005 + <emphasis> + FOUR-PER-EM SPACE + \u2005, + \u2006 + <emphasis> + SIX-PER-EM SPACE + \u2006, + \u2007 + <emphasis> + FIGURE SPACE + \u2007, + \u2008 + <emphasis> + PUNCTUATION SPACE + \u2008, + \u2009 + <emphasis> + THIN SPACE + \u2009, + \u200a + <emphasis> + HAIR SPACE + \u200a, + \u202f + <emphasis> + NARROW NO-BREAK SPACE + \u202f, + \u205f + <emphasis> + MEDIUM MATHEMATICAL SPACE + \u205f, + \u3000 + <emphasis> + IDEOGRAPHIC SPACE + \u3000, + <paragraph> + <emphasis> + LINE SEPARATOR +"""], +[u"""\ None of these should be markup (matched openers & closers): \u2018*\u2019 \u201c*\u201d \xab*\xbb \u00bf*? \u00a1*! |