From: <mi...@us...> - 2011-12-20 14:14:33
Revision: 7267
http://docutils.svn.sourceforge.net/docutils/?rev=7267&view=rev
Author: milde
Date: 2011-12-20 14:14:21 +0000 (Tue, 20 Dec 2011)
Log Message:
-----------
docutils.utils is now a package (providing a place for sub-modules)

.. important:: docutils/math, docutils/error_reporting.py, and
   docutils/urischemes.py will move to the utils package in the next
   release, too. Code importing these modules needs to adapt.
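
For client code, the adaptation amounts to updating import paths. A minimal
sketch (the code_analyzer and punctuation_chars moves are part of this
revision; the commented lines only show the *announced* future layout for
math, error_reporting, and urischemes, which keep their old locations in
r7267)::

    # moved in r7267:
    from docutils.utils.code_analyzer import Lexer, LexerError, NumberLines
    from docutils.utils import punctuation_chars

    # announced for a later release (still at the old locations,
    # e.g. ``docutils.math``, in r7267):
    # import docutils.utils.math
    # import docutils.utils.error_reporting
    # import docutils.utils.urischemes
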
Modified Paths:
--------------
trunk/docutils/HISTORY.txt
trunk/docutils/RELEASE-NOTES.txt
trunk/docutils/docutils/__init__.py
trunk/docutils/docutils/parsers/rst/directives/body.py
trunk/docutils/docutils/parsers/rst/roles.py
trunk/docutils/docutils/parsers/rst/states.py
trunk/docutils/setup.py
trunk/docutils/test/test_parsers/test_rst/test_directives/test_code.py
trunk/docutils/test/test_parsers/test_rst/test_directives/test_code_long.py
trunk/docutils/test/test_parsers/test_rst/test_directives/test_include.py
trunk/docutils/test/test_parsers/test_rst/test_interpreted.py
trunk/docutils/test/test_utils.py
Added Paths:
-----------
trunk/docutils/docutils/utils/
trunk/docutils/docutils/utils/__init__.py
trunk/docutils/docutils/utils/code_analyzer.py
trunk/docutils/docutils/utils/punctuation_chars.py
Removed Paths:
-------------
trunk/docutils/docutils/parsers/code_analyzer.py
trunk/docutils/docutils/parsers/rst/punctuation_chars.py
trunk/docutils/docutils/utils.py
Modified: trunk/docutils/HISTORY.txt
===================================================================
--- trunk/docutils/HISTORY.txt 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/HISTORY.txt 2011-12-20 14:14:21 UTC (rev 7267)
@@ -30,8 +30,15 @@
- Fix [ 3395948 ] (Work around encoding problems in Py3k).
-* docutils/utils.py
+* docutils/utils.py -> docutils/utils/__init__.py
+ - docutils.utils is now a package (providing a place for sub-modules)
+
+ .. important:: docutils/math, docutils/error_reporting.py, and
+ docutils/urischemes.py will move to the utils package in the next
+ release, too. Code importing these modules needs to adapt
+ (``import docutils.math`` -> ``import docutils.utils.math``, etc.).
+
- DependencyList uses io.FileOutput and 'utf8' encoding to prevent
errors recording non-ASCII filenames (fixes [ 3434355 ]).
@@ -60,11 +67,11 @@
* General:
- Fix [ 3364658 ] (Change last file with Apache license to BSD-2-Clause)
- and [ 3395920 ] (correct copyright info for rst.el).
-
+ and [ 3395920 ] (correct copyright info for rst.el).
+
* docutils/test/
- - Apply [ 3303733 ] and [ 3365041 ] to fix tests under py3k.
+ - Apply [ 3303733 ] and [ 3365041 ] to fix tests under py3k.
* docutils/writers/latex2e/__init__.py
@@ -95,7 +102,7 @@
- Most directives now support a "name" option that attaches a
reference name.
-
+
- Directive content may start on the first line also when the directive
type accepts options.
Modified: trunk/docutils/RELEASE-NOTES.txt
===================================================================
--- trunk/docutils/RELEASE-NOTES.txt 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/RELEASE-NOTES.txt 2011-12-20 14:14:21 UTC (rev 7267)
@@ -28,11 +28,24 @@
.. _Pygments: http://pygments.org/
+* docutils/utils.py -> docutils/utils/__init__.py
+
+ - docutils.utils is now a package (providing a place for sub-modules)
+
+ .. important:: docutils/math, docutils/error_reporting.py, and
+ docutils/urischemes.py will move to the utils package in the next
+ release, too. Code importing these modules needs to adapt
+ (``import docutils.math`` -> ``import docutils.utils.math``, etc.).
+
* docutils/writers/html4css1/__init__.py
- change default for `math-output` setting to MathJax
+* docutils/writers/latex2e/__init__.py
+ - Record only files required to generate the LaTeX source as dependencies.
+
+
Release 0.8.1 (2011-08-30)
==========================
Modified: trunk/docutils/docutils/__init__.py
===================================================================
--- trunk/docutils/docutils/__init__.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/__init__.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -29,9 +29,6 @@
- urischemes.py: Contains a complete mapping of known URI addressing
scheme names to descriptions.
-- utils.py: Contains the ``Reporter`` system warning class and miscellaneous
- utilities.
-
Subpackages:
- languages: Language-specific mappings of terms.
@@ -44,6 +41,9 @@
- transforms: Modules used by readers and writers to modify DPS
doctrees.
+- utils: Contains the ``Reporter`` system warning class and miscellaneous
+ utilities used by readers, writers, and transforms.
+
- writers: Format-specific output translators.
"""
Deleted: trunk/docutils/docutils/parsers/code_analyzer.py
===================================================================
--- trunk/docutils/docutils/parsers/code_analyzer.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/parsers/code_analyzer.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -1,134 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-"""Lexical analysis of formal languages (i.e. code) using Pygments."""
-
-# :Author: Georg Brandl; Felix Wiemann; Günter Milde
-# :Date: $Date$
-# :Copyright: This module has been placed in the public domain.
-
-from docutils import ApplicationError
-try:
- import pygments
- from pygments.lexers import get_lexer_by_name
- from pygments.formatters.html import _get_ttype_class
- with_pygments = True
-except ImportError:
- with_pygments = False
-
-# Filter the following token types from the list of class arguments:
-unstyled_tokens = ['token', # Token (base token type)
- 'text', # Token.Text
- ''] # short name for Token and Text
-# (Add, e.g., Token.Punctuation with ``unstyled_tokens += 'punctuation'``.)
-
-class LexerError(ApplicationError):
- pass
-
-class Lexer(object):
- """Parse `code` lines and yield "classified" tokens.
-
- Arguments
-
- code -- string of source code to parse,
- language -- formal language the code is written in,
- tokennames -- either 'long', 'short', or '' (see below).
-
- Merge subsequent tokens of the same token-type.
-
- Iterating over an instance yields the tokens as ``(tokentype, value)``
- tuples. The value of `tokennames` configures the naming of the tokentype:
-
- 'long': downcased full token type name,
- 'short': short name defined by pygments.token.STANDARD_TYPES
- (= class argument used in pygments html output),
- 'none': skip lexical analysis.
- """
-
- def __init__(self, code, language, tokennames='short'):
- """
- Set up a lexical analyzer for `code` in `language`.
- """
- self.code = code
- self.language = language
- self.tokennames = tokennames
- self.lexer = None
- # get lexical analyzer for `language`:
- if language in ('', 'text') or tokennames == 'none':
- return
- if not with_pygments:
- raise LexerError('Cannot analyze code. '
- 'Pygments package not found.')
- try:
- self.lexer = get_lexer_by_name(self.language)
- except pygments.util.ClassNotFound:
- raise LexerError('Cannot analyze code. '
- 'No Pygments lexer found for "%s".' % language)
-
- # Since version 1.2. (released Jan 01, 2010) Pygments has a
- # TokenMergeFilter. However, this requires Python >= 2.4. When Docutils
- # requires same minimal version, ``self.merge(tokens)`` in __iter__ can
- # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
- def merge(self, tokens):
- """Merge subsequent tokens of same token-type.
-
- Also strip the final newline (added by pygments).
- """
- tokens = iter(tokens)
- (lasttype, lastval) = tokens.next()
- for ttype, value in tokens:
- if ttype is lasttype:
- lastval += value
- else:
- yield(lasttype, lastval)
- (lasttype, lastval) = (ttype, value)
- if lastval.endswith('\n'):
- lastval = lastval[:-1]
- if lastval:
- yield(lasttype, lastval)
-
- def __iter__(self):
- """Parse self.code and yield "classified" tokens.
- """
- if self.lexer is None:
- yield ([], self.code)
- return
- tokens = pygments.lex(self.code, self.lexer)
- for tokentype, value in self.merge(tokens):
- if self.tokennames == 'long': # long CSS class args
- classes = str(tokentype).lower().split('.')
- else: # short CSS class args
- classes = [_get_ttype_class(tokentype)]
- classes = [cls for cls in classes if cls not in unstyled_tokens]
- yield (classes, value)
-
-
-class NumberLines(object):
- """Insert linenumber-tokens at the start of every code line.
-
- Arguments
-
- tokens -- iterable of ``(classes, value)`` tuples
- startline -- first line number
- endline -- last line number
-
- Iterating over an instance yields the tokens with a
- ``(['ln'], '<the line number>')`` token added for every code line.
- Multi-line tokens are splitted."""
-
- def __init__(self, tokens, startline, endline):
- self.tokens = tokens
- self.startline = startline
- # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
- self.fmt_str = '%%%dd ' % len(str(endline))
-
- def __iter__(self):
- lineno = self.startline
- yield (['ln'], self.fmt_str % lineno)
- for ttype, value in self.tokens:
- lines = value.split('\n')
- for line in lines[:-1]:
- yield (ttype, line + '\n')
- lineno += 1
- yield (['ln'], self.fmt_str % lineno)
- yield (ttype, lines[-1])
Modified: trunk/docutils/docutils/parsers/rst/directives/body.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/directives/body.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/parsers/rst/directives/body.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -16,7 +16,7 @@
from docutils.parsers.rst import Directive
from docutils.parsers.rst import directives
from docutils.parsers.rst.roles import set_classes
-from docutils.parsers.code_analyzer import Lexer, LexerError, NumberLines
+from docutils.utils.code_analyzer import Lexer, LexerError, NumberLines
class BasePseudoSection(Directive):
Deleted: trunk/docutils/docutils/parsers/rst/punctuation_chars.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/punctuation_chars.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/parsers/rst/punctuation_chars.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -1,211 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf8 -*-
-# :Copyright: © 2011 Günter Milde.
-# :License: Released under the terms of the `2-Clause BSD license`_, in short:
-#
-# Copying and distribution of this file, with or without modification,
-# are permitted in any medium without royalty provided the copyright
-# notice and this notice are preserved.
-# This file is offered as-is, without any warranty.
-#
-# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
-
-# :Id: $Id$
-
-import sys, re
-import unicodedata
-
-# punctuation characters around inline markup
-# ===========================================
-#
-# This module provides the lists of characters for the implementation of
-# the `inline markup recognition rules`_ in the reStructuredText parser
-# (states.py)
-#
-# .. _inline markup recognition rules:
-# ../../../docs/ref/rst/restructuredtext.html#inline-markup
-
-# Docutils punctuation category sample strings
-# --------------------------------------------
-#
-# The sample strings are generated by punctuation_samples() and put here
-# literal to avoid the time-consuming generation with every Docutils
-# run. Running this file as a standalone module checks the definitions below
-# against a re-calculation.
-
-openers = ur"""\"\'\(\<\[\{༺༼᚛⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「«‘“‹⸂⸄⸉⸌⸜⸠‚„»’”›⸃⸅⸊⸍⸝⸡‛‟"""
-closers = ur"""\"\'\)\>\]\}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」»’”›⸃⸅⸊⸍⸝⸡‛‟«‘“‹⸂⸄⸉⸌⸜⸠‚„"""
-delimiters = ur"\-\/\:֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣-¡·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫!"#%&'*,./:;?@\。、・𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
-closing_delimiters = ur"\.\,\;\!\?"
-
-
-# Unicode punctuation character categories
-# ----------------------------------------
-
-unicode_punctuation_categories = {
- # 'Pc': 'Connector', # not used in Docutils inline markup recognition
- 'Pd': 'Dash',
- 'Ps': 'Open',
- 'Pe': 'Close',
- 'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
- 'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
- 'Po': 'Other'
- }
-"""Unicode character categories for punctuation"""
-
-
-# generate character pattern strings
-# ==================================
-
-def unicode_charlists(categories, cp_min=0, cp_max=None):
- """Return dictionary of Unicode character lists.
-
- For each of the `catagories`, an item contains a list with all Unicode
- characters with `cp_min` <= code-point <= `cp_max` that belong to the
- category. (The default values check every code-point supported by Python.)
- """
- # Determine highest code point with one of the given categories
- # (may shorten the search time considerably if there are many
- # categories with not too high characters):
- if cp_max is None:
- cp_max = max(x for x in xrange(sys.maxunicode + 1)
- if unicodedata.category(unichr(x)) in categories)
- # print cp_max # => 74867 for unicode_punctuation_categories
- charlists = {}
- for cat in categories:
- charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
- if unicodedata.category(unichr(x)) == cat]
- return charlists
-
-
-# Character categories in Docutils
-# --------------------------------
-
-def punctuation_samples():
-
- """Docutils punctuation category sample strings.
-
- Return list of sample strings for the categories "Open", "Close",
- "Delimiters" and "Closing-Delimiters" used in the `inline markup
- recognition rules`_.
- """
-
- # Lists with characters in Unicode punctuation character categories
- cp_min = 160 # ASCII chars have special rules for backwards compatibility
- ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
-
- # match opening/closing characters
- # --------------------------------
- # Rearange the lists to ensure matching characters at the same
- # index position.
-
- # low quotation marks are also used as closers (e.g. in Greek)
- # move them to category Pi:
- ucharlists['Ps'].remove(u'‚') # 201A SINGLE LOW-9 QUOTATION MARK
- ucharlists['Ps'].remove(u'„') # 201E DOUBLE LOW-9 QUOTATION MARK
- ucharlists['Pi'] += [u'‚', u'„']
-
- ucharlists['Pi'].remove(u'‛') # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
- ucharlists['Pi'].remove(u'‟') # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
- ucharlists['Pf'] += [u'‛', u'‟']
-
- # 301F LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
- ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
-
- # print u''.join(ucharlists['Ps']).encode('utf8')
- # print u''.join(ucharlists['Pe']).encode('utf8')
- # print u''.join(ucharlists['Pi']).encode('utf8')
- # print u''.join(ucharlists['Pf']).encode('utf8')
-
- # The Docutils character categories
- # ---------------------------------
- #
- # The categorization of ASCII chars is non-standard to reduce both
- # false positives and need for escaping. (see `inline markup recognition
- # rules`_)
-
- # matching, allowed before markup
- openers = [re.escape('"\'(<[{')]
- for cat in ('Ps', 'Pi', 'Pf'):
- openers.extend(ucharlists[cat])
-
- # matching, allowed after markup
- closers = [re.escape('"\')>]}')]
- for cat in ('Pe', 'Pf', 'Pi'):
- closers.extend(ucharlists[cat])
-
- # non-matching, allowed on both sides
- delimiters = [re.escape('-/:')]
- for cat in ('Pd', 'Po'):
- delimiters.extend(ucharlists[cat])
-
- # non-matching, after markup
- closing_delimiters = [re.escape('.,;!?')]
-
- # # Test open/close matching:
- # for i in range(min(len(openers),len(closers))):
- # print '%4d %s %s' % (i, openers[i].encode('utf8'),
- # closers[i].encode('utf8'))
-
- return [u''.join(chars)
- for chars in (openers, closers, delimiters, closing_delimiters)]
-
-
-# Matching open/close quotes
-# --------------------------
-
-# Rule (5) requires determination of matching open/close pairs. However,
-# the pairing of open/close quotes is ambigue due to different typographic
-# conventions in different languages.
-
-quote_pairs = {u'\xbb': u'\xbb', # Swedish
- u'\u2018': u'\u201a', # Greek
- u'\u2019': u'\u2019', # Swedish
- u'\u201a': u'\u2018\u2019', # German, Polish
- u'\u201c': u'\u201e', # German
- u'\u201e': u'\u201c\u201d',
- u'\u201d': u'\u201d', # Swedish
- u'\u203a': u'\u203a', # Swedish
- }
-
-def match_chars(c1, c2):
- try:
- i = openers.index(c1)
- except ValueError: # c1 not in openers
- return False
- return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
-
-
-
-
-# print results
-# =============
-
-if __name__ == '__main__':
-
- # (re) create and compare the samples:
- (o, c, d, cd) = punctuation_samples()
- if o != openers:
- print '- openers = ur"""%s"""' % openers.encode('utf8')
- print '+ openers = ur"""%s"""' % o.encode('utf8')
- if c != closers:
- print '- closers = ur"""%s"""' % closers.encode('utf8')
- print '+ closers = ur"""%s"""' % c.encode('utf8')
- if d != delimiters:
- print '- delimiters = ur"%s"' % delimiters.encode('utf8')
- print '+ delimiters = ur"%s"' % d.encode('utf8')
- if cd != closing_delimiters:
- print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8')
- print '+ closing_delimiters = ur"%s"' % cd.encode('utf8')
-
- # # test prints
- # print 'openers = ', repr(openers)
- # print 'closers = ', repr(closers)
- # print 'delimiters = ', repr(delimiters)
- # print 'closing_delimiters = ', repr(closing_delimiters)
-
- # ucharlists = unicode_charlists(unicode_punctuation_categories)
- # for cat, chars in ucharlists.items():
- # # print cat, chars
- # # compact output (visible with a comprehensive font):
- # print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
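
The module above (relocated to docutils.utils.punctuation_chars by this
revision, see the copy further down) exposes the sample strings ``openers``,
``closers``, ``delimiters``, ``closing_delimiters`` and the ``match_chars()``
helper used by the inline-markup recognition rules. A minimal usage sketch
with the new import path (the example characters are illustrative only)::

    from docutils.utils import punctuation_chars

    # paired opener/closer at the same index position -> True
    print(punctuation_chars.match_chars(u'(', u')'))
    # mismatched pair -> False
    print(punctuation_chars.match_chars(u'(', u']'))
    # language-specific pairing via quote_pairs (German low/high quotes) -> True
    print(punctuation_chars.match_chars(u'\u201e', u'\u201c'))
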
Modified: trunk/docutils/docutils/parsers/rst/roles.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/roles.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/parsers/rst/roles.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -75,7 +75,7 @@
from docutils import nodes, utils
from docutils.parsers.rst import directives
from docutils.parsers.rst.languages import en as _fallback_language_module
-from docutils.parsers.code_analyzer import Lexer, LexerError
+from docutils.utils.code_analyzer import Lexer, LexerError
DEFAULT_INTERPRETED_ROLE = 'title-reference'
"""
Modified: trunk/docutils/docutils/parsers/rst/states.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/states.py 2011-12-20 09:36:10 UTC (rev 7266)
+++ trunk/docutils/docutils/parsers/rst/states.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -107,16 +107,17 @@
import re
import roman
from types import FunctionType, MethodType
+
from docutils import nodes, statemachine, utils, urischemes
from docutils import ApplicationError, DataError
from docutils.statemachine import StateMachineWS, StateWS
from docutils.nodes import fully_normalize_name as normalize_name
from docutils.nodes import whitespace_normalize_name
-from docutils.utils import escape2null, unescape, column_width
import docutils.parsers.rst
from docutils.parsers.rst import directives, languages, tableparser, roles
from docutils.parsers.rst.languages import en as _fallback_language_module
-from docutils.parsers.rst import punctuation_chars
+from docutils.utils import escape2null, unescape, column_width
+from docutils.utils import punctuation_chars
class MarkupError(DataError): pass
class UnknownInterpretedRoleError(DataError): pass
Copied: trunk/docutils/docutils/utils/__init__.py (from rev 7266, trunk/docutils/docutils/utils.py)
===================================================================
--- trunk/docutils/docutils/utils/__init__.py (rev 0)
+++ trunk/docutils/docutils/utils/__init__.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -0,0 +1,730 @@
+# coding: utf8
+# $Id$
+# Author: David Goodger <go...@py...>
+# Copyright: This module has been placed in the public domain.
+
+"""
+Miscellaneous utilities for the documentation utilities.
+"""
+
+__docformat__ = 'reStructuredText'
+
+import sys
+import os
+import os.path
+import warnings
+import unicodedata
+from docutils import ApplicationError, DataError
+from docutils import nodes
+from docutils.io import FileOutput
+from docutils.error_reporting import ErrorOutput, SafeString
+
+
+class SystemMessage(ApplicationError):
+
+ def __init__(self, system_message, level):
+ Exception.__init__(self, system_message.astext())
+ self.level = level
+
+
+class SystemMessagePropagation(ApplicationError): pass
+
+
+class Reporter:
+
+ """
+ Info/warning/error reporter and ``system_message`` element generator.
+
+ Five levels of system messages are defined, along with corresponding
+ methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.
+
+ There is typically one Reporter object per process. A Reporter object is
+ instantiated with thresholds for reporting (generating warnings) and
+ halting processing (raising exceptions), a switch to turn debug output on
+ or off, and an I/O stream for warnings. These are stored as instance
+ attributes.
+
+ When a system message is generated, its level is compared to the stored
+ thresholds, and a warning or error is generated as appropriate. Debug
+ messages are produced if the stored debug switch is on, independently of
+ other thresholds. Message output is sent to the stored warning stream if
+ not set to ''.
+
+ The Reporter class also employs a modified form of the "Observer" pattern
+ [GoF95]_ to track system messages generated. The `attach_observer` method
+ should be called before parsing, with a bound method or function which
+ accepts system messages. The observer can be removed with
+ `detach_observer`, and another added in its place.
+
+ .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
+ Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
+ 1995.
+ """
+
+ levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
+ """List of names for system message levels, indexed by level."""
+
+ # system message level constants:
+ (DEBUG_LEVEL,
+ INFO_LEVEL,
+ WARNING_LEVEL,
+ ERROR_LEVEL,
+ SEVERE_LEVEL) = range(5)
+
+ def __init__(self, source, report_level, halt_level, stream=None,
+ debug=0, encoding=None, error_handler='backslashreplace'):
+ """
+ :Parameters:
+ - `source`: The path to or description of the source data.
+ - `report_level`: The level at or above which warning output will
+ be sent to `stream`.
+ - `halt_level`: The level at or above which `SystemMessage`
+ exceptions will be raised, halting execution.
+ - `debug`: Show debug (level=0) system messages?
+ - `stream`: Where warning output is sent. Can be file-like (has a
+ ``.write`` method), a string (file name, opened for writing),
+ '' (empty string) or `False` (for discarding all stream messages)
+ or `None` (implies `sys.stderr`; default).
+ - `encoding`: The output encoding.
+ - `error_handler`: The error handler for stderr output encoding.
+ """
+
+ self.source = source
+ """The path to or description of the source data."""
+
+ self.error_handler = error_handler
+ """The character encoding error handler."""
+
+ self.debug_flag = debug
+ """Show debug (level=0) system messages?"""
+
+ self.report_level = report_level
+ """The level at or above which warning output will be sent
+ to `self.stream`."""
+
+ self.halt_level = halt_level
+ """The level at or above which `SystemMessage` exceptions
+ will be raised, halting execution."""
+
+ if not isinstance(stream, ErrorOutput):
+ stream = ErrorOutput(stream, encoding, error_handler)
+
+ self.stream = stream
+ """Where warning output is sent."""
+
+ self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
+ """The output character encoding."""
+
+ self.observers = []
+ """List of bound methods or functions to call with each system_message
+ created."""
+
+ self.max_level = -1
+ """The highest level system message generated so far."""
+
+ def set_conditions(self, category, report_level, halt_level,
+ stream=None, debug=0):
+ warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
+ 'set attributes via configuration settings or directly',
+ DeprecationWarning, stacklevel=2)
+ self.report_level = report_level
+ self.halt_level = halt_level
+ if not isinstance(stream, ErrorOutput):
+ stream = ErrorOutput(stream, self.encoding, self.error_handler)
+ self.stream = stream
+ self.debug_flag = debug
+
+ def attach_observer(self, observer):
+ """
+ The `observer` parameter is a function or bound method which takes one
+ argument, a `nodes.system_message` instance.
+ """
+ self.observers.append(observer)
+
+ def detach_observer(self, observer):
+ self.observers.remove(observer)
+
+ def notify_observers(self, message):
+ for observer in self.observers:
+ observer(message)
+
+ def system_message(self, level, message, *children, **kwargs):
+ """
+ Return a system_message object.
+
+ Raise an exception or generate a warning if appropriate.
+ """
+ # `message` can be a `string`, `unicode`, or `Exception` instance.
+ if isinstance(message, Exception):
+ message = SafeString(message)
+
+ attributes = kwargs.copy()
+ if 'base_node' in kwargs:
+ source, line = get_source_line(kwargs['base_node'])
+ del attributes['base_node']
+ if source is not None:
+ attributes.setdefault('source', source)
+ if line is not None:
+ attributes.setdefault('line', line)
+ # assert source is not None, "node has line- but no source-argument"
+ if not 'source' in attributes: # 'line' is absolute line number
+ try: # look up (source, line-in-source)
+ source, line = self.locator(attributes.get('line'))
+ # print "locator lookup", kwargs.get('line'), "->", source, line
+ except AttributeError:
+ source, line = None, None
+ if source is not None:
+ attributes['source'] = source
+ if line is not None:
+ attributes['line'] = line
+ # assert attributes['line'] is not None, (message, kwargs)
+ # assert attributes['source'] is not None, (message, kwargs)
+ attributes.setdefault('source', self.source)
+
+ msg = nodes.system_message(message, level=level,
+ type=self.levels[level],
+ *children, **attributes)
+ if self.stream and (level >= self.report_level
+ or self.debug_flag and level == self.DEBUG_LEVEL
+ or level >= self.halt_level):
+ self.stream.write(msg.astext() + '\n')
+ if level >= self.halt_level:
+ raise SystemMessage(msg, level)
+ if level > self.DEBUG_LEVEL or self.debug_flag:
+ self.notify_observers(msg)
+ self.max_level = max(level, self.max_level)
+ return msg
+
+ def debug(self, *args, **kwargs):
+ """
+ Level-0, "DEBUG": an internal reporting issue. Typically, there is no
+ effect on the processing. Level-0 system messages are handled
+ separately from the others.
+ """
+ if self.debug_flag:
+ return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)
+
+ def info(self, *args, **kwargs):
+ """
+ Level-1, "INFO": a minor issue that can be ignored. Typically there is
+ no effect on processing, and level-1 system messages are not reported.
+ """
+ return self.system_message(self.INFO_LEVEL, *args, **kwargs)
+
+ def warning(self, *args, **kwargs):
+ """
+ Level-2, "WARNING": an issue that should be addressed. If ignored,
+ there may be unpredictable problems with the output.
+ """
+ return self.system_message(self.WARNING_LEVEL, *args, **kwargs)
+
+ def error(self, *args, **kwargs):
+ """
+ Level-3, "ERROR": an error that should be addressed. If ignored, the
+ output will contain errors.
+ """
+ return self.system_message(self.ERROR_LEVEL, *args, **kwargs)
+
+ def severe(self, *args, **kwargs):
+ """
+ Level-4, "SEVERE": a severe error that must be addressed. If ignored,
+ the output will contain severe errors. Typically level-4 system
+ messages are turned into exceptions which halt processing.
+ """
+ return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
+
+
+class ExtensionOptionError(DataError): pass
+class BadOptionError(ExtensionOptionError): pass
+class BadOptionDataError(ExtensionOptionError): pass
+class DuplicateOptionError(ExtensionOptionError): pass
+
+
+def extract_extension_options(field_list, options_spec):
+ """
+ Return a dictionary mapping extension option names to converted values.
+
+ :Parameters:
+ - `field_list`: A flat field list without field arguments, where each
+ field body consists of a single paragraph only.
+ - `options_spec`: Dictionary mapping known option names to a
+ conversion function such as `int` or `float`.
+
+ :Exceptions:
+ - `KeyError` for unknown option names.
+ - `ValueError` for invalid option values (raised by the conversion
+ function).
+ - `TypeError` for invalid option value types (raised by conversion
+ function).
+ - `DuplicateOptionError` for duplicate options.
+ - `BadOptionError` for invalid fields.
+ - `BadOptionDataError` for invalid option data (missing name,
+ missing data, bad quotes, etc.).
+ """
+ option_list = extract_options(field_list)
+ option_dict = assemble_option_dict(option_list, options_spec)
+ return option_dict
+
+def extract_options(field_list):
+ """
+ Return a list of option (name, value) pairs from field names & bodies.
+
+ :Parameter:
+ `field_list`: A flat field list, where each field name is a single
+ word and each field body consists of a single paragraph only.
+
+ :Exceptions:
+ - `BadOptionError` for invalid fields.
+ - `BadOptionDataError` for invalid option data (missing name,
+ missing data, bad quotes, etc.).
+ """
+ option_list = []
+ for field in field_list:
+ if len(field[0].astext().split()) != 1:
+ raise BadOptionError(
+ 'extension option field name may not contain multiple words')
+ name = str(field[0].astext().lower())
+ body = field[1]
+ if len(body) == 0:
+ data = None
+ elif len(body) > 1 or not isinstance(body[0], nodes.paragraph) \
+ or len(body[0]) != 1 or not isinstance(body[0][0], nodes.Text):
+ raise BadOptionDataError(
+ 'extension option field body may contain\n'
+ 'a single paragraph only (option "%s")' % name)
+ else:
+ data = body[0][0].astext()
+ option_list.append((name, data))
+ return option_list
+
+def assemble_option_dict(option_list, options_spec):
+ """
+ Return a mapping of option names to values.
+
+ :Parameters:
+ - `option_list`: A list of (name, value) pairs (the output of
+ `extract_options()`).
+ - `options_spec`: Dictionary mapping known option names to a
+ conversion function such as `int` or `float`.
+
+ :Exceptions:
+ - `KeyError` for unknown option names.
+ - `DuplicateOptionError` for duplicate options.
+ - `ValueError` for invalid option values (raised by conversion
+ function).
+ - `TypeError` for invalid option value types (raised by conversion
+ function).
+ """
+ options = {}
+ for name, value in option_list:
+ convertor = options_spec[name] # raises KeyError if unknown
+ if convertor is None:
+ raise KeyError(name) # or if explicitly disabled
+ if name in options:
+ raise DuplicateOptionError('duplicate option "%s"' % name)
+ try:
+ options[name] = convertor(value)
+ except (ValueError, TypeError), detail:
+ raise detail.__class__('(option: "%s"; value: %r)\n%s'
+ % (name, value, ' '.join(detail.args)))
+ return options
+
+
+class NameValueError(DataError): pass
+
+
+def decode_path(path):
+ """
+ Ensure `path` is Unicode. Return `nodes.reprunicode` object.
+
+ Decode file/path string in a failsave manner if not already done.
+ """
+ # see also http://article.gmane.org/gmane.text.docutils.user/2905
+ if isinstance(path, unicode):
+ return path
+ try:
+ path = path.decode(sys.getfilesystemencoding(), 'strict')
+ except AttributeError: # default value None has no decode method
+ return nodes.reprunicode(path)
+ except UnicodeDecodeError:
+ try:
+ path = path.decode('utf-8', 'strict')
+ except UnicodeDecodeError:
+ path = path.decode('ascii', 'replace')
+ return nodes.reprunicode(path)
+
+
+def extract_name_value(line):
+ """
+ Return a list of (name, value) from a line of the form "name=value ...".
+
+ :Exception:
+ `NameValueError` for invalid input (missing name, missing data, bad
+ quotes, etc.).
+ """
+ attlist = []
+ while line:
+ equals = line.find('=')
+ if equals == -1:
+ raise NameValueError('missing "="')
+ attname = line[:equals].strip()
+ if equals == 0 or not attname:
+ raise NameValueError(
+ 'missing attribute name before "="')
+ line = line[equals+1:].lstrip()
+ if not line:
+ raise NameValueError(
+ 'missing value after "%s="' % attname)
+ if line[0] in '\'"':
+ endquote = line.find(line[0], 1)
+ if endquote == -1:
+ raise NameValueError(
+ 'attribute "%s" missing end quote (%s)'
+ % (attname, line[0]))
+ if len(line) > endquote + 1 and line[endquote + 1].strip():
+ raise NameValueError(
+ 'attribute "%s" end quote (%s) not followed by '
+ 'whitespace' % (attname, line[0]))
+ data = line[1:endquote]
+ line = line[endquote+1:].lstrip()
+ else:
+ space = line.find(' ')
+ if space == -1:
+ data = line
+ line = ''
+ else:
+ data = line[:space]
+ line = line[space+1:].lstrip()
+ attlist.append((attname.lower(), data))
+ return attlist
+
+def new_reporter(source_path, settings):
+ """
+ Return a new Reporter object.
+
+ :Parameters:
+ `source` : string
+ The path to or description of the source text of the document.
+ `settings` : optparse.Values object
+ Runtime settings.
+ """
+ reporter = Reporter(
+ source_path, settings.report_level, settings.halt_level,
+ stream=settings.warning_stream, debug=settings.debug,
+ encoding=settings.error_encoding,
+ error_handler=settings.error_encoding_error_handler)
+ return reporter
+
+def new_document(source_path, settings=None):
+ """
+ Return a new empty document object.
+
+ :Parameters:
+ `source_path` : string
+ The path to or description of the source text of the document.
+ `settings` : optparse.Values object
+ Runtime settings. If none are provided, a default core set will
+ be used. If you will use the document object with any Docutils
+ components, you must provide their default settings as well. For
+ example, if parsing, at least provide the parser settings,
+ obtainable as follows::
+
+ settings = docutils.frontend.OptionParser(
+ components=(docutils.parsers.rst.Parser,)
+ ).get_default_values()
+ """
+ from docutils import frontend
+ if settings is None:
+ settings = frontend.OptionParser().get_default_values()
+ source_path = decode_path(source_path)
+ reporter = new_reporter(source_path, settings)
+ document = nodes.document(settings, reporter, source=source_path)
+ document.note_source(source_path, -1)
+ return document
+
+def clean_rcs_keywords(paragraph, keyword_substitutions):
+ if len(paragraph) == 1 and isinstance(paragraph[0], nodes.Text):
+ textnode = paragraph[0]
+ for pattern, substitution in keyword_substitutions:
+ match = pattern.search(textnode)
+ if match:
+ paragraph[0] = nodes.Text(pattern.sub(substitution, textnode))
+ return
+
+def relative_path(source, target):
+ """
+ Build and return a path to `target`, relative to `source` (both files).
+
+ If there is no common prefix, return the absolute path to `target`.
+ """
+ source_parts = os.path.abspath(source or 'dummy_file').split(os.sep)
+ target_parts = os.path.abspath(target).split(os.sep)
+ # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
+ if source_parts[:2] != target_parts[:2]:
+ # Nothing in common between paths.
+ # Return absolute path, using '/' for URLs:
+ return '/'.join(target_parts)
+ source_parts.reverse()
+ target_parts.reverse()
+ while (source_parts and target_parts
+ and source_parts[-1] == target_parts[-1]):
+ # Remove path components in common:
+ source_parts.pop()
+ target_parts.pop()
+ target_parts.reverse()
+ parts = ['..'] * (len(source_parts) - 1) + target_parts
+ return '/'.join(parts)
+
+def get_stylesheet_reference(settings, relative_to=None):
+ """
+ Retrieve a stylesheet reference from the settings object.
+
+ Deprecated. Use get_stylesheet_list() instead to
+ enable specification of multiple stylesheets as a comma-separated
+ list.
+ """
+ if settings.stylesheet_path:
+ assert not settings.stylesheet, (
+ 'stylesheet and stylesheet_path are mutually exclusive.')
+ if relative_to == None:
+ relative_to = settings._destination
+ return relative_path(relative_to, settings.stylesheet_path)
+ else:
+ return settings.stylesheet
+
+# Return 'stylesheet' or 'stylesheet_path' arguments as list.
+#
+# The original settings arguments are kept unchanged: you can test
+# with e.g. ``if settings.stylesheet_path:``
+#
+# Differences to ``get_stylesheet_reference``:
+# * return value is a list
+# * no re-writing of the path (and therefore no optional argument)
+# (if required, use ``utils.relative_path(source, target)``
+# in the calling script)
+def get_stylesheet_list(settings):
+ """
+ Retrieve list of stylesheet references from the settings object.
+ """
+ assert not (settings.stylesheet and settings.stylesheet_path), (
+ 'stylesheet and stylesheet_path are mutually exclusive.')
+ if settings.stylesheet_path:
+ sheets = settings.stylesheet_path.split(",")
+ elif settings.stylesheet:
+ sheets = settings.stylesheet.split(",")
+ else:
+ sheets = []
+ # strip whitespace (frequently occuring in config files)
+ return [sheet.strip(u' \t\n') for sheet in sheets]
+
+def get_trim_footnote_ref_space(settings):
+ """
+ Return whether or not to trim footnote space.
+
+ If trim_footnote_reference_space is not None, return it.
+
+ If trim_footnote_reference_space is None, return False unless the
+ footnote reference style is 'superscript'.
+ """
+ if settings.trim_footnote_reference_space is None:
+ return hasattr(settings, 'footnote_references') and \
+ settings.footnote_references == 'superscript'
+ else:
+ return settings.trim_footnote_reference_space
+
+def get_source_line(node):
+ """
+ Return the "source" and "line" attributes from the `node` given or from
+ its closest ancestor.
+ """
+ while node:
+ if node.source or node.line:
+ return node.source, node.line
+ node = node.parent
+ return None, None
+
+def escape2null(text):
+ """Return a string with escape-backslashes converted to nulls."""
+ parts = []
+ start = 0
+ while 1:
+ found = text.find('\\', start)
+ if found == -1:
+ parts.append(text[start:])
+ return ''.join(parts)
+ parts.append(text[start:found])
+ parts.append('\x00' + text[found+1:found+2])
+ start = found + 2 # skip character after escape
+
+def unescape(text, restore_backslashes=0):
+ """
+ Return a string with nulls removed or restored to backslashes.
+ Backslash-escaped spaces are also removed.
+ """
+ if restore_backslashes:
+ return text.replace('\x00', '\\')
+ else:
+ for sep in ['\x00 ', '\x00\n', '\x00']:
+ text = ''.join(text.split(sep))
+ return text
+
+def strip_combining_chars(text):
+ if isinstance(text, str) and sys.version_info < (3,0):
+ return text
+ return u''.join([c for c in text if not unicodedata.combining(c)])
+
+def find_combining_chars(text):
+ """Return indices of all combining chars in Unicode string `text`.
+
+ >>> find_combining_chars(u'A t̆ab̆lĕ')
+ [3, 6, 9]
+ """
+ if isinstance(text, str) and sys.version_info < (3,0):
+ return []
+ return [i for i,c in enumerate(text) if unicodedata.combining(c)]
+
+def column_indices(text):
+ """Indices of Unicode string `text` when skipping combining characters.
+
+ >>> column_indices(u'A t̆ab̆lĕ')
+ [0, 1, 2, 4, 5, 7, 8]
+ """
+ # TODO: account for asian wide chars here instead of using dummy
+ # replacements in the tableparser?
+ string_indices = range(len(text))
+ for index in find_combining_chars(text):
+ string_indices[index] = None
+ return [i for i in string_indices if i is not None]
+
+east_asian_widths = {'W': 2, # Wide
+ 'F': 2, # Full-width (wide)
+ 'Na': 1, # Narrow
+ 'H': 1, # Half-width (narrow)
+ 'N': 1, # Neutral (not East Asian, treated as narrow)
+ 'A': 1} # Ambiguous (s/b wide in East Asian context,
+ # narrow otherwise, but that doesn't work)
+"""Mapping of result codes from `unicodedata.east_asian_widt()` to character
+column widths."""
+
+def column_width(text):
+ """Return the column width of text.
+
+ Correct ``len(text)`` for wide East Asian and combining Unicode chars.
+ """
+ if isinstance(text, str) and sys.version_info < (3,0):
+ return len(text)
+ try:
+ width = sum([east_asian_widths[unicodedata.east_asian_width(c)]
+ for c in text])
+ except AttributeError: # east_asian_width() New in version 2.4.
+ width = len(text)
+ # correction for combining chars:
+ width -= len(find_combining_chars(text))
+ return width
+
+def uniq(L):
+ r = []
+ for item in L:
+ if not item in r:
+ r.append(item)
+ return r
+
+# by Li Daobing http://code.activestate.com/recipes/190465/
+# since Python 2.6 there is also itertools.combinations()
+def unique_combinations(items, n):
+ """Return n-length tuples, in sorted order, no repeated elements"""
+ if n==0: yield []
+ else:
+ for i in xrange(len(items)-n+1):
+ for cc in unique_combinations(items[i+1:],n-1):
+ yield [items[i]]+cc
+
+def normalize_language_tag(tag):
+ """Return a list of normalized combinations for a `BCP 47` language tag.
+
+ Example:
+
+ >>> normalize_language_tag('de-AT-1901')
+ ['de_at_1901', 'de_at', 'de_1901', 'de']
+ """
+ # normalize:
+ tag = tag.lower().replace('-','_')
+ # find all combinations of subtags
+ taglist = []
+ base_tag= tag.split('_')[:1]
+ subtags = tag.split('_')[1:]
+ # print base_tag, subtags
+ for n in range(len(subtags), 0, -1):
+ for tags in unique_combinations(subtags, n):
+ # print tags
+ taglist.append('_'.join(base_tag + tags))
+ taglist += base_tag
+ return taglist
+
+
+class DependencyList(object):
+
+ """
+ List of dependencies, with file recording support.
+
+ Note that the output file is not automatically closed. You have
+ to explicitly call the close() method.
+ """
+
+ def __init__(self, output_file=None, dependencies=[]):
+ """
+ Initialize the dependency list, automatically setting the
+ output file to `output_file` (see `set_output()`) and adding
+ all supplied dependencies.
+ """
+ self.set_output(output_file)
+ for i in dependencies:
+ self.add(i)
+
+ def set_output(self, output_file):
+ """
+ Set the output file and clear the list of already added
+ dependencies.
+
+ `output_file` must be a string. The specified file is
+ immediately overwritten.
+
+ If output_file is '-', the output will be written to stdout.
+ If it is None, no file output is done when calling add().
+ """
+ self.list = []
+ if output_file:
+ if output_file == '-':
+ of = None
+ else:
+ of = output_file
+ self.file = FileOutput(destination_path=of,
+ encoding='utf8', autoclose=False)
+ else:
+ self.file = None
+
+ def add(self, *filenames):
+ """
+ If the dependency `filename` has not already been added,
+ append it to self.list and print it to self.file if self.file
+ is not None.
+ """
+ for filename in filenames:
+ if not filename in self.list:
+ self.list.append(filename)
+ if self.file is not None:
+ self.file.write(filename+'\n')
+
+ def close(self):
+ """
+ Close the output file.
+ """
+ self.file.close()
+ self.file = None
+
+ def __repr__(self):
+ try:
+ output_file = self.file.name
+ except AttributeError:
+ output_file = None
+ return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)
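
Since docutils/utils.py was copied verbatim to docutils/utils/__init__.py,
the existing helpers keep working under the same import path. A minimal
sketch exercising a few of the utilities defined above (expected values
follow from the docstrings and code; the input strings are illustrative
only)::

    from docutils.utils import column_width, normalize_language_tag, relative_path

    print(column_width(u'abc'))                 # 3 (same as len() for narrow chars)
    print(column_width(u'\u4e2d\u6587'))        # 4: two wide East Asian characters
    print(normalize_language_tag('de-AT-1901')) # ['de_at_1901', 'de_at', 'de_1901', 'de']
    print(relative_path('doc/index.txt', 'doc/css/default.css'))  # 'css/default.css'
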
Copied: trunk/docutils/docutils/utils/code_analyzer.py (from rev 7266, trunk/docutils/docutils/parsers/code_analyzer.py)
===================================================================
--- trunk/docutils/docutils/utils/code_analyzer.py (rev 0)
+++ trunk/docutils/docutils/utils/code_analyzer.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -0,0 +1,134 @@
+#!/usr/bin/python
+# coding: utf-8
+
+"""Lexical analysis of formal languages (i.e. code) using Pygments."""
+
+# :Author: Georg Brandl; Felix Wiemann; Günter Milde
+# :Date: $Date$
+# :Copyright: This module has been placed in the public domain.
+
+from docutils import ApplicationError
+try:
+ import pygments
+ from pygments.lexers import get_lexer_by_name
+ from pygments.formatters.html import _get_ttype_class
+ with_pygments = True
+except ImportError:
+ with_pygments = False
+
+# Filter the following token types from the list of class arguments:
+unstyled_tokens = ['token', # Token (base token type)
+ 'text', # Token.Text
+ ''] # short name for Token and Text
+# (Add, e.g., Token.Punctuation with ``unstyled_tokens += 'punctuation'``.)
+
+class LexerError(ApplicationError):
+ pass
+
+class Lexer(object):
+ """Parse `code` lines and yield "classified" tokens.
+
+ Arguments
+
+ code -- string of source code to parse,
+ language -- formal language the code is written in,
+ tokennames -- either 'long', 'short', or '' (see below).
+
+ Merge subsequent tokens of the same token-type.
+
+ Iterating over an instance yields the tokens as ``(tokentype, value)``
+ tuples. The value of `tokennames` configures the naming of the tokentype:
+
+ 'long': downcased full token type name,
+ 'short': short name defined by pygments.token.STANDARD_TYPES
+ (= class argument used in pygments html output),
+ 'none': skip lexical analysis.
+ """
+
+ def __init__(self, code, language, tokennames='short'):
+ """
+ Set up a lexical analyzer for `code` in `language`.
+ """
+ self.code = code
+ self.language = language
+ self.tokennames = tokennames
+ self.lexer = None
+ # get lexical analyzer for `language`:
+ if language in ('', 'text') or tokennames == 'none':
+ return
+ if not with_pygments:
+ raise LexerError('Cannot analyze code. '
+ 'Pygments package not found.')
+ try:
+ self.lexer = get_lexer_by_name(self.language)
+ except pygments.util.ClassNotFound:
+ raise LexerError('Cannot analyze code. '
+ 'No Pygments lexer found for "%s".' % language)
+
+ # Since version 1.2. (released Jan 01, 2010) Pygments has a
+ # TokenMergeFilter. However, this requires Python >= 2.4. When Docutils
+ # requires same minimal version, ``self.merge(tokens)`` in __iter__ can
+ # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
+ def merge(self, tokens):
+ """Merge subsequent tokens of same token-type.
+
+ Also strip the final newline (added by pygments).
+ """
+ tokens = iter(tokens)
+ (lasttype, lastval) = tokens.next()
+ for ttype, value in tokens:
+ if ttype is lasttype:
+ lastval += value
+ else:
+ yield(lasttype, lastval)
+ (lasttype, lastval) = (ttype, value)
+ if lastval.endswith('\n'):
+ lastval = lastval[:-1]
+ if lastval:
+ yield(lasttype, lastval)
+
+ def __iter__(self):
+ """Parse self.code and yield "classified" tokens.
+ """
+ if self.lexer is None:
+ yield ([], self.code)
+ return
+ tokens = pygments.lex(self.code, self.lexer)
+ for tokentype, value in self.merge(tokens):
+ if self.tokennames == 'long': # long CSS class args
+ classes = str(tokentype).lower().split('.')
+ else: # short CSS class args
+ classes = [_get_ttype_class(tokentype)]
+ classes = [cls for cls in classes if cls not in unstyled_tokens]
+ yield (classes, value)
+
+
+class NumberLines(object):
+ """Insert linenumber-tokens at the start of every code line.
+
+ Arguments
+
+ tokens -- iterable of ``(classes, value)`` tuples
+ startline -- first line number
+ endline -- last line number
+
+ Iterating over an instance yields the tokens with a
+ ``(['ln'], '<the line number>')`` token added for every code line.
+ Multi-line tokens are splitted."""
+
+ def __init__(self, tokens, startline, endline):
+ self.tokens = tokens
+ self.startline = startline
+ # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
+ self.fmt_str = '%%%dd ' % len(str(endline))
+
+ def __iter__(self):
+ lineno = self.startline
+ yield (['ln'], self.fmt_str % lineno)
+ for ttype, value in self.tokens:
+ lines = value.split('\n')
+ for line in lines[:-1]:
+ yield (ttype, line + '\n')
+ lineno += 1
+ yield (['ln'], self.fmt_str % lineno)
+ yield (ttype, lines[-1])
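
A minimal usage sketch for the relocated module. Syntax highlighting needs
the optional Pygments package (a missing lexer or missing Pygments raises
LexerError); the code snippet and line range below are illustrative only::

    from docutils.utils.code_analyzer import Lexer, NumberLines

    code = 'x = 1\nprint(x)\n'
    # Lexer yields (classes, value) tuples; tokennames='none' (or an empty
    # language string) skips lexical analysis and yields ([], code) instead.
    tokens = Lexer(code, 'python', tokennames='short')
    # NumberLines prepends a (['ln'], '<line number> ') token to every line.
    for classes, value in NumberLines(tokens, 1, 2):
        print((classes, value))
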
Copied: trunk/docutils/docutils/utils/punctuation_chars.py (from rev 7266, trunk/docutils/docutils/parsers/rst/punctuation_chars.py)
===================================================================
--- trunk/docutils/docutils/utils/punctuation_chars.py (rev 0)
+++ trunk/docutils/docutils/utils/punctuation_chars.py 2011-12-20 14:14:21 UTC (rev 7267)
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+# :Copyright: © 2011 Günter Milde.
+# :License: Released under the terms of the `2-Clause BSD license`_, in short:
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+# This file is offered as-is, without any warranty.
+#
+# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
+
+# :Id: $Id$
+
+import sys, re
+import unicodedata
+
+# punctuation characters around inline markup
+# ===========================================
+#
+# This module provides the lists of characters for the implementation of
+# the `inline markup recognition rules`_ in the reStructuredText parser
+# (states.py)
+#
+# .. _inline markup recognition rules:
+# ../../../docs/ref/rst/restructuredtext.html#inline-markup
+
+# Docutils punctuation category sample strings
+# --------------------------------------------
+#
+# The sample strings are generated by punctuation_samples() and put here
+# literal to avoid the time-consuming generation with every Docutils
+# run. Running this file as a standalone module checks the definitions below
+# against a re-calculation.
+
+openers = ur"""\"\'\(\<\[\{༺༼᚛⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「«‘“‹⸂⸄⸉⸌⸜⸠‚„»’”›⸃⸅⸊⸍⸝⸡‛‟"""
+closers = ur"""\"\'\)\>\]\}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」»’”›⸃⸅⸊⸍⸝⸡‛‟«‘“‹⸂⸄⸉⸌⸜⸠‚„"""
+delimiters = ur"\-\/\:֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣-¡·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫!"#%&'*,./:;?@\。、・𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
+closing_delimiters = ur"\.\,\;\!\?"
+
+
+# Unicode punctuation character categories
+# ----------------------------------------
+
+unicode_punctuation_categories = {
+ # 'Pc': 'Connector', # not used in Docutils inline markup recognition
+ 'Pd': 'Dash',
+ 'Ps': 'Open',
+ 'Pe': 'Close',
+ 'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
+ 'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
+ 'Po': 'Other'
+ }
+"""Unicode character categories for punctuation"""
+
+
+# generate character pattern strings
+# ==================================
+
+def unicode_charlists(categories, cp_min=0, cp_max=None):
+ """Return dictionary of Unicode character lists.
+
+ For each of the `catagories`, an item contains a list with all Unicode
+ characters with `cp_min` <= code-point <= `cp_max` that belong to the
+ category. (The default values check every code-point supported by Python.)
+ """
+ # Determine highest code point with one of the given categories
+ # (may shorten the search time considerably if there are many
+ # categories with not too high characters):
+ if cp_max is None:
+ cp_max = max(x for x in xrange(sys.maxunicode + 1)
+ if unicodedata.category(unichr(x)) in categories)
+ # print cp_max # => 74867 for unicode_punctuation_categories
+ charlists = {}
+ for cat in categories:
+ charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
+ if unicodedata.category(unichr(x)) == cat]
+ return charlists
+
+
+# Character categories in Docutils
+# --------------------------------
+
+def punctuation_samples():
+
+ """Docutils punctuation category sample strings.
+
+ Return list of sample strings for the categories "Open", "Close",
+ "Delimiters" and "Closing-Delimiters" used in the `inline markup
+ recognition rules`_.
+ """
+
+ # Lists with characters in Unicode punctuation character categories
+ cp_min = 160 # ASCII chars have special rules for backwards compatibility
+ ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
+
+ # match opening/closing characters
+ # --------------------------------
+ # Rearange the lists to ensure matching characters at the same
+ # index position.
+
+ # low quotation marks are also used as closers (e.g. in Greek)
+ # move them to category Pi:
+ ucharlists['Ps'].remove(u'‚') # 201A SINGLE LOW-9 QUOTATION MARK
+ ucharlists['Ps'].remove(u'„') # 201E DOUBLE LOW-9 QUOTATION MARK
+ ucharlists['Pi'] += [u'‚', u'„']
+
+ ucharlists['Pi'].remove(u'‛') # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
+ ucharlists['Pi'].remove(u'‟') # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+ ucharlists['Pf'] += [u'‛', u'‟']
+
+ # 301F LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
+ ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
+
+ # print u''.join(ucharlists['Ps']).encode('utf8')
+ # print u''.join(ucharlists['Pe']).encode('utf8')
+ # print u''.join(ucharlists['Pi']).encode('utf8')
+ # print u''.join(ucharlists['Pf']).encode('utf8')
+
+ # The Docutils character categories
+ # ---------------------------------
+ #
+ # The categorization of ASCII chars is non-standard to reduce both
+ # false positives and need for escaping. (see `inline markup recognition
+ # rules`_)
+
+ # matching, allowed before markup
+ openers = [re.escape('"\'(<[{')]
+ for cat in ('Ps', 'Pi', 'Pf'):
+ openers.extend(ucharlists[cat])
+
+ # matching, allowed after markup
+ closers = [re.escape('"\')>]}')]
+ for cat in ('Pe', 'Pf', 'Pi'):
+ closers.extend(ucharlists[cat])
+
+ # non-matching, allowed on both sides
+ delimiters = [re.escape('-/:')]
+ for cat in ('Pd', 'Po'):
+ delimiters.extend(ucharlists[cat])
+
+ # non-matching, after markup
+ closing_delimiters = ...
[truncated message content]