[Epydoc-commits] SF.net SVN: epydoc: [1440] trunk/epydoc/src/epydoc/markup/doctest.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1440
          http://svn.sourceforge.net/epydoc/?rev=1440&view=rev
Author:   edloper
Date:     2007-02-06 22:37:21 -0800 (Tue, 06 Feb 2007)

Log Message:
-----------
- Improved/refactored version of doctest colorizer.

Modified Paths:
--------------
    trunk/epydoc/src/epydoc/markup/doctest.py

Modified: trunk/epydoc/src/epydoc/markup/doctest.py
===================================================================

--- trunk/epydoc/src/epydoc/markup/doctest.py	2007-02-07 06:21:26 UTC (rev 1439)
+++ trunk/epydoc/src/epydoc/markup/doctest.py	2007-02-07 06:37:21 UTC (rev 1440)
@@ -19,176 +19,292 @@
 import re
 from epydoc.util import plaintext_to_html, plaintext_to_latex
 
+__all__ = ['doctest_to_html', 'doctest_to_latex',
+           'DoctestColorizer', 'XMLDoctestColorizer', 
+           'HTMLDoctestColorizer', 'LaTeXDoctestColorizer']
+
 def doctest_to_html(s):
     """
     Perform syntax highlighting on the given doctest string, and
     return the resulting HTML code.  This code consists of a C{<pre>}
     block with class=py-doctest.  Syntax highlighting is performed
-    using the following css classes: 'py-prompt', 'py-keyword',
-    'py-string', 'py-comment', and 'py-output'.
+    using the following css classes:
+    
+      - C{py-prompt} -- the Python PS1 prompt (>>>)
+      - C{py-more} -- the Python PS2 prompt (...)
+      - C{py-keyword} -- a Python keyword (for, if, etc.)
+      - C{py-builtin} -- a Python builtin name (abs, dir, etc.)
+      - C{py-string} -- a string literal
+      - C{py-comment} -- a comment
+      - C{py-except} -- an exception traceback (up to the next >>>)
+      - C{py-output} -- the output from a doctest block.
+      - C{py-defname} -- the name of a function or class defined by
+        a C{def} or C{class} statement.
     """
-    return ('<pre class="py-doctest">\n%s\n</pre>\n' %
-            colorize_doctest(s, _tag_span_html).strip())
+    return HTMLDoctestColorizer().colorize_doctest(s)
 
 def doctest_to_latex(s):
     """
     Perform syntax highlighting on the given doctest string, and
     return the resulting LaTeX code.  This code consists of an
-    C{alltt} environment.  Syntax highlighting is performed using five
-    new latex commands, which must be defined externally:
-    '\pysrcprompt', '\pysrckeyword', '\pysrcstring', '\pysrccomment',
-    and '\pysrcoutput'.
+    C{alltt} environment.  Syntax highlighting is performed using 
+    the following new latex commands, which must be defined externally:
+      - C{\pysrcprompt} -- the Python PS1 prompt (>>>)
+      - C{\pysrcmore} -- the Python PS2 prompt (...)
+      - C{\pysrckeyword} -- a Python keyword (for, if, etc.)
+      - C{\pysrcbuiltin} -- a Python builtin name (abs, dir, etc.)
+      - C{\pysrcstring} -- a string literal
+      - C{\pysrccomment} -- a comment
+      - C{\pysrcexcept} -- an exception traceback (up to the next >>>)
+      - C{\pysrcoutput} -- the output from a doctest block.
+      - C{\pysrcdefname} -- the name of a function or class defined by
+        a C{def} or C{class} statement.
     """
-    return ('\\begin{alltt}\n%s\n\\end{alltt}\n' % 
-            colorize_doctest(s, _tag_span_latex).strip())
+    return LaTeXDoctestColorizer().colorize_doctest(s)
 
-def _tag_span_html(s, tag):
-    return '<span class="py-%s">%s</span>' % (tag, plaintext_to_html(s))
+class DoctestColorizer:
+    """
+    An abstract base class for performing syntax highlighting on
+    doctest blocks and other bits of Python code.  Subclasses should
+    provide definitions for:
 
-def _tag_span_latex(s, tag):
-    return '\\pysrc%s{%s}' % (tag, plaintext_to_latex(s))
+      - The L{markup()} method, which takes a substring and a tag, and
+        returns a colorized version of the substring.
+      - The L{PREFIX} and L{SUFFIX} variables, which will be added
+        to the beginning and end of the strings returned by
+        L{colorize_codeblock} and L{colorize_doctest}.  
+    """
 
-# Regular expressions for colorize_doctestblock
-# set of keywords as listed in the Python Language Reference 2.4.1
-# added 'as' as well since IDLE already colorizes it as a keyword.
-# The documentation states that 'None' will become a keyword
-# eventually, but IDLE currently handles that as a builtin.
-_KEYWORDS = """
-and       del       for       is        raise    
-assert    elif      from      lambda    return   
-break     else      global    not       try      
-class     except    if        or        while    
-continue  exec      import    pass      yield    
-def       finally   in        print
-as
-""".split()
-_KEYWORD = '|'.join([r'\b%s\b' % _KW for _KW in _KEYWORDS])
+    #: A string that is added to the beginning of the strings
+    #: returned by L{colorize_codeblock} and L{colorize_doctest}.
+    #: Typically, this string begins a preformatted area.
+    PREFIX = None
 
-_BUILTINS = [_BI for _BI in dir(__builtins__) if not _BI.startswith('__')]
-_BUILTIN = '|'.join([r'\b%s\b' % _BI for _BI in _BUILTINS])
+    #: A string that is added to the end of the strings
+    #: returned by L{colorize_codeblock} and L{colorize_doctest}.
+    #: Typically, this string ends a preformatted area.
+    SUFFIX = None
 
-_STRING = '|'.join([r'("""("""|.*?((?!").)"""))', r'("("|.*?((?!").)"))',
-                    r"('''('''|.*?[^\\']'''))", r"('('|.*?[^\\']'))"])
-_COMMENT = '(#.*?$)'
-_PROMPT1 = r'^\s*>>>(?:\s|$)'
-_PROMPT2 = r'^\s*\.\.\.(?:\s|$)'
+    #: Python keywords plus 'as' (not technically a keyword).  Each literal
+    #: ends in a space so implicit concatenation cannot fuse adjacent words.
+    _KEYWORDS = ("and       del       for       is        raise "
+                 "assert    elif      from      lambda    return "
+                 "break     else      global    not       try "
+                 "class     except    if        or        while "
+                 "continue  exec      import    pass      yield "
+                 "def       finally   in        print     as").split()
 
-PROMPT_RE = re.compile('(%s|%s)' % (_PROMPT1, _PROMPT2),
-		       re.MULTILINE | re.DOTALL)
-PROMPT2_RE = re.compile('(%s)' % _PROMPT2, re.MULTILINE | re.DOTALL)
-'''The regular expression used to find Python prompts (">>>" and
-"...") in doctest blocks.'''
+    #: Python builtin names; __builtins__ may be a module or its dict.
+    _BUILTINS = [_BI for _BI in getattr(__builtins__, '__dict__', __builtins__)
+                 if not _BI.startswith('__')]
 
-EXCEPT_RE = re.compile(r'(.*)(^Traceback \(most recent call last\):.*)',
-                       re.DOTALL | re.MULTILINE)
+    #: A regexp group that matches keywords.
+    _KEYWORD_GRP = '|'.join([r'\b%s\b' % _KW for _KW in _KEYWORDS])
 
-DOCTEST_DIRECTIVE_RE = re.compile(r'#\s*doctest:.*')
+    #: A regexp group that matches Python builtins.
+    _BUILTIN_GRP = (r'(?<!\.)(?:%s)' % '|'.join([r'\b%s\b' % _BI
+                                                 for _BI in _BUILTINS]))
 
-DOCTEST_RE = re.compile(r"""(?P<STRING>%s)|(?P<COMMENT>%s)|"""
-                        r"""(?P<KEYWORD>(%s))|(?P<BUILTIN>(%s))|"""
-                        r"""(?P<PROMPT1>%s)|(?P<PROMPT2>%s)|.+?""" %
-  (_STRING, _COMMENT, _KEYWORD, _BUILTIN, _PROMPT1, _PROMPT2),
-  re.MULTILINE | re.DOTALL)
-'''The regular expression used by L{_doctest_sub} to colorize doctest
-blocks.'''
+    #: A regexp group that matches Python strings.
+    _STRING_GRP = '|'.join(
+        [r'("""("""|.*?((?!").)"""))', r'("("|.*?((?!").)"))',
+         r"('''('''|.*?[^\\']'''))", r"('('|.*?[^\\']'))"])
 
-def colorize_doctest(s, markup_func, inline=False, strip_directives=False):
-    """
-    Colorize the given doctest string C{s} using C{markup_func()}.
-    C{markup_func()} should be a function that takes a substring and a
-    tag, and returns a colorized version of the substring.  E.g.:
+    #: A regexp group that matches Python comments.
+    _COMMENT_GRP = '(#.*?$)'
 
-        >>> def html_markup_func(s, tag):
-        ...     return '<span class="%s">%s</span>' % (tag, s)
+    #: A regexp group that matches Python ">>>" prompts.
+    _PROMPT1_GRP = r'^[ \t]*>>>(?:[ \t]|$)'
+    
+    #: A regexp group that matches Python "..." prompts.
+    _PROMPT2_GRP = r'^[ \t]*\.\.\.(?:[ \t]|$)'
 
-    The tags that will be passed to the markup function are: 
-        - C{prompt} -- the Python PS1 prompt (>>>)
-	- C{more} -- the Python PS2 prompt (...)
-        - C{keyword} -- a Python keyword (for, if, etc.)
-        - C{builtin} -- a Python builtin name (abs, dir, etc.)
-        - C{string} -- a string literal
-        - C{comment} -- a comment
-	- C{except} -- an exception traceback (up to the next >>>)
-        - C{output} -- the output from a doctest block.
-        - C{other} -- anything else (does *not* include output.)
-    """
-    pysrc = [] # the source code part of a docstest block (lines)
-    pyout = [] # the output part of a doctest block (lines)
-    result = []
-    out = result.append
+    #: A regexp group that matches function and class definitions.
+    _DEFINE_GRP = r'\b(?:def|class)[ \t]+\w+'
 
-    if strip_directives:
-        s = DOCTEST_DIRECTIVE_RE.sub('', s)
+    #: A regexp that matches Python prompts
+    PROMPT_RE = re.compile('(%s|%s)' % (_PROMPT1_GRP, _PROMPT2_GRP),
+                           re.MULTILINE | re.DOTALL)
 
-    def subfunc(match):
+    #: A regexp that matches Python "..." prompts.
+    PROMPT2_RE = re.compile('(%s)' % _PROMPT2_GRP,
+                            re.MULTILINE | re.DOTALL)
+
+    #: A regexp that matches doctest exception blocks.
+    EXCEPT_RE = re.compile(r'^[ \t]*Traceback \(most recent call last\):.*',
+                           re.DOTALL | re.MULTILINE)
+
+    #: A regexp that matches doctest directives.
+    DOCTEST_DIRECTIVE_RE = re.compile(r'#[ \t]*doctest:.*')
+
+    #: A regexp that matches all of the regions of a doctest block
+    #: that should be colored.
+    DOCTEST_RE = re.compile(
+        r'(.*?)((?P<STRING>%s)|(?P<COMMENT>%s)|(?P<DEFINE>%s)|'
+              r'(?P<KEYWORD>%s)|(?P<BUILTIN>%s)|'
+              r'(?P<PROMPT1>%s)|(?P<PROMPT2>%s)|(?P<EOS>\Z))' % (
+        _STRING_GRP, _COMMENT_GRP, _DEFINE_GRP, _KEYWORD_GRP, _BUILTIN_GRP,
+        _PROMPT1_GRP, _PROMPT2_GRP), re.MULTILINE | re.DOTALL)
+
+    #: This regular expression is used to find doctest examples in a
+    #: string.  This is copied from the standard Python doctest.py
+    #: module (after the refactoring in Python 2.4+).
+    DOCTEST_EXAMPLE_RE = re.compile(r'''
+        # Source consists of a PS1 line followed by zero or more PS2 lines.
+        (?P<source>
+            (?:^(?P<indent> [ ]*) >>>    .*)    # PS1 line
+            (?:\n           [ ]*  \.\.\. .*)*   # PS2 lines
+          \n?)
+        # Want consists of any non-blank lines that do not start with PS1.
+        (?P<want> (?:(?![ ]*$)    # Not a blank line
+                     (?![ ]*>>>)  # Not a line starting with PS1
+                     .*$\n?       # But any other line
+                  )*)
+        ''', re.MULTILINE | re.VERBOSE)
+
+    def colorize_inline(self, s):
+        """
+        Colorize a string containing Python code.  Do not add the
+        L{PREFIX} and L{SUFFIX} strings to the returned value.  This
+        method is intended for generating syntax-highlighted strings
+        that are appropriate for inclusion as inline expressions.
+        """
+        return self.DOCTEST_RE.sub(self.subfunc, s)
+
+    def colorize_codeblock(self, s):
+        """
+        Colorize a string containing only Python code.  This method
+        differs from L{colorize_doctest} in that it will not search
+        for doctest prompts when deciding how to colorize the string.
+        """
+        body = self.DOCTEST_RE.sub(self.subfunc, s)
+        return self.PREFIX + body + self.SUFFIX
+
+    def colorize_doctest(self, s, strip_directives=False):
+        """
+        Colorize a string containing one or more doctest examples.
+        If C{strip_directives} is true, doctest directive comments
+        are removed from C{s} before it is colorized.
+        """
+        if strip_directives:
+            s = self.DOCTEST_DIRECTIVE_RE.sub('', s)
+        output = []
+        charno = 0
+        for m in self.DOCTEST_EXAMPLE_RE.finditer(s):
+            pysrc, want = m.group('source', 'want')
+            # Pre-example text, then the colorized example source:
+            output.append(s[charno:m.start()])
+            output.append(self.DOCTEST_RE.sub(self.subfunc, pysrc))
+            if want:
+                tag = 'output'
+                if self.EXCEPT_RE.match(want): tag = 'except'
+                pieces = [self.markup(line, tag) for line in want.split('\n')]
+                # Nothing follows a final newline, so do not emit an
+                # extra (empty) marked-up line for it.
+                if want.endswith('\n'): pieces[-1] = ''
+                output.append('\n'.join(pieces))
+            charno = m.end()
+        # Add any remaining post-example text.
+        output.append(s[charno:])
+        return self.PREFIX + ''.join(output) + self.SUFFIX
+    
+    def subfunc(self, match):
+        other, text = match.group(1, 2)
+        #print 'M %20r %20r' % (other, text) # <- for debugging
+        if other:
+            other = '\n'.join([self.markup(line, 'other')
+                               for line in other.split('\n')])
+            
         if match.group('PROMPT1'):
-            return markup_func(match.group(), 'prompt')
-	if match.group('PROMPT2'):
-	    return markup_func(match.group(), 'more')
-        if match.group('KEYWORD'):
-            return markup_func(match.group(), 'keyword')
-        if match.group('BUILTIN'):
-            return markup_func(match.group(), 'builtin')
-        if match.group('COMMENT'):
-            return markup_func(match.group(), 'comment')
-        if match.group('STRING') and '\n' not in match.group():
-            return markup_func(match.group(), 'string')
+            return other + self.markup(text, 'prompt')
+        elif match.group('PROMPT2'):
+            return other + self.markup(text, 'more')
+        elif match.group('KEYWORD'):
+            return other + self.markup(text, 'keyword')
+        elif match.group('BUILTIN'):
+            return other + self.markup(text, 'builtin')
+        elif match.group('COMMENT'):
+            return other + self.markup(text, 'comment')
+        elif match.group('STRING') and '\n' not in text:
+            return other + self.markup(text, 'string')
         elif match.group('STRING'):
             # It's a multiline string; colorize the string & prompt
             # portion of each line.
-            pieces = [markup_func(s, ['string','more'][i%2])
-                      for i, s in enumerate(PROMPT2_RE.split(match.group()))]
-            return ''.join(pieces)
+            pieces = []
+            for line in text.split('\n'):
+                if self.PROMPT2_RE.match(line):
+                    if len(line) > 4:
+                        pieces.append(self.markup(line[:4], 'more') +
+                                      self.markup(line[4:], 'string'))
+                    else:
+                        pieces.append(self.markup(line[:4], 'more'))
+                elif line:
+                    pieces.append(self.markup(line, 'string'))
+                else:
+                    pieces.append('')
+            return other + '\n'.join(pieces)
+        elif match.group('DEFINE'):
+            m = re.match('(?P<def>\w+)(?P<space>\s+)(?P<name>\w+)', text)
+            return other + (self.markup(m.group('def'), 'keyword') +
+                        self.markup(m.group('space'), 'other') +
+                        self.markup(m.group('name'), 'defname'))
+        elif match.group('EOS') is not None:
+            return other
         else:
-            return markup_func(match.group(), 'other')
+            assert 0, 'Unexpected match!'
 
-    if inline:
-	pysrc = DOCTEST_RE.sub(subfunc, s)
-	return pysrc.strip()
+    def markup(self, s, tag):
+        """
+        Apply syntax highlighting to a single substring from a doctest
+        block.  C{s} is the substring, and C{tag} is the tag that
+        should be applied to the substring.  C{tag} will be one of the
+        following strings:
+        
+          - C{prompt} -- the Python PS1 prompt (>>>)
+          - C{more} -- the Python PS2 prompt (...)
+          - C{keyword} -- a Python keyword (for, if, etc.)
+          - C{builtin} -- a Python builtin name (abs, dir, etc.)
+          - C{string} -- a string literal
+          - C{comment} -- a comment
+          - C{except} -- an exception traceback (up to the next >>>)
+          - C{output} -- the output from a doctest block.
+          - C{defname} -- the name of a function or class defined by
+            a C{def} or C{class} statement.
+          - C{other} -- anything else (does *not* include output.)
+        """
+        raise AssertionError("Abstract method")
 
-    # need to add a third state here for correctly formatting exceptions
+class XMLDoctestColorizer(DoctestColorizer):
+    """
+    A subclass of DoctestColorizer that generates XML-like output.
+    This class is mainly intended to be used for testing purposes.
+    """
+    PREFIX = '<colorized>\n'
+    SUFFIX = '</colorized>\n'
+    def markup(self, s, tag):
+        s = s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+        if tag == 'other': return s
+        else: return '<%s>%s</%s>' % (tag, s, tag)
 
-    for line in s.split('\n')+['\n']:
-        if PROMPT_RE.match(line):
-            pysrc.append(line)
-            if pyout:
-                pyout = '\n'.join(pyout).strip()
-                m = EXCEPT_RE.match(pyout)
-                if m:
-                    pyout, pyexc = m.group(1).strip(), m.group(2).strip()
-                    if pyout:
-                        print ('Warning: doctest does not allow for mixed '
-                               'output and exceptions!')
-                        result.append(markup_func(pyout, 'output'))
-                    result.append(markup_func(pyexc, 'except'))
-                else:
-                    result.append(markup_func(pyout, 'output'))
-                pyout = []
+class HTMLDoctestColorizer(DoctestColorizer):
+    """A subclass of DoctestColorizer that generates HTML output."""
+    PREFIX = '<pre class="py-doctest">\n'
+    SUFFIX = '</pre>\n'
+    def markup(self, s, tag):
+        if tag == 'other':
+            return plaintext_to_html(s)
         else:
-            pyout.append(line)
-            if pysrc:
-                pysrc = DOCTEST_RE.sub(subfunc, '\n'.join(pysrc))
-                result.append(pysrc.strip())
-                #result.append(markup_func(pysrc.strip(), 'python'))
-                pysrc = []
+            return ('<span class="py-%s">%s</span>' %
+                    (tag, plaintext_to_html(s)))
 
-    remainder = '\n'.join(pyout).strip()
-    if remainder:
-        result.append(markup_func(remainder, 'output'))
-    result = '\n'.join(result)
+class LaTeXDoctestColorizer(DoctestColorizer):
+    """A subclass of DoctestColorizer that generates LaTeX output."""
+    PREFIX = '\\begin{alltt}\n'
+    SUFFIX = '\\end{alltt}\n'
+    def markup(self, s, tag):
+        if tag == 'other':
+            return plaintext_to_latex(s)
+        else:
+            return '\\pysrc%s{%s}' % (tag, plaintext_to_latex(s))
 
-    # Merge adjacent spans w/ the same class.  I.e, convert:
-    #   <span class="x">foo</span><span class="x">foo</span>
-    # to:
-    #   <span class="x">foofoo</span>
-    prev_span_class = [None]
-    def subfunc(match):
-        if match.group(2) == prev_span_class[0]:
-            prev_span_class[0] = match.group(2)
-            return match.group(1) or ''
-        else:
-            prev_span_class[0] = match.group(2)
-            return match.group()
-    result = re.sub(r'</span>(\n?)<span class="([^"]+)">', subfunc, result)
         
-    return result


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.