From: <mi...@us...> - 2012-01-18 10:16:32
|
Revision: 7315 http://docutils.svn.sourceforge.net/docutils/?rev=7315&view=rev Author: milde Date: 2012-01-18 10:16:20 +0000 (Wed, 18 Jan 2012) Log Message: ----------- XML writer overhaul (use visitor pattern, raw XML pass through). Modified Paths: -------------- trunk/docutils/HISTORY.txt trunk/docutils/docutils/nodes.py trunk/docutils/docutils/writers/docutils_xml.py trunk/docutils/test/test_writers/test_docutils_xml.py Modified: trunk/docutils/HISTORY.txt =================================================================== --- trunk/docutils/HISTORY.txt 2012-01-12 10:14:43 UTC (rev 7314) +++ trunk/docutils/HISTORY.txt 2012-01-18 10:16:20 UTC (rev 7315) @@ -64,8 +64,15 @@ * docutils/writers/html4css1/__init__.py - - change default for `math-output` setting to MathJax + - Change default for `math-output` setting to MathJax. +* docutils/writers/docutils_xml.py + + - Use the visitor pattern with default methods instead of minidom + to facilitate special handling of selected nodes. + + - Support raw XML (inserted as-is inside a <raw></raw> node). 
+ Release 0.8.1 (2011-08-30) ========================== Modified: trunk/docutils/docutils/nodes.py =================================================================== --- trunk/docutils/docutils/nodes.py 2012-01-12 10:14:43 UTC (rev 7314) +++ trunk/docutils/docutils/nodes.py 2012-01-18 10:16:20 UTC (rev 7315) @@ -499,23 +499,29 @@ # 2to3 doesn't convert __unicode__ to __str__ __str__ = __unicode__ - def starttag(self): + def starttag(self, quoteattr=None): + # the optional arg is used by the docutils_xml writer + if quoteattr is None: + quoteattr = pseudo_quoteattr parts = [self.tagname] for name, value in self.attlist(): if value is None: # boolean attribute parts.append(name) - elif isinstance(value, list): + continue + if isinstance(value, list): values = [serial_escape('%s' % (v,)) for v in value] - parts.append('%s="%s"' % (name, ' '.join(values))) + value = ' '.join(values) else: - parts.append('%s="%s"' % (name, value)) - return '<%s>' % ' '.join(parts) + value = unicode(value) + value = quoteattr(value) + parts.append(u'%s=%s' % (name, value)) + return u'<%s>' % u' '.join(parts) def endtag(self): return '</%s>' % self.tagname def emptytag(self): - return u'<%s/>' % ' '.join([self.tagname] + + return u'<%s/>' % u' '.join([self.tagname] + ['%s="%s"' % (n, v) for n, v in self.attlist()]) @@ -1913,6 +1919,10 @@ """Escape string values that are elements of a list, for serialization.""" return value.replace('\\', r'\\').replace(' ', r'\ ') +def pseudo_quoteattr(value): + """Quote attributes for pseudo-xml""" + return '"%s"' % value + # # # Local Variables: Modified: trunk/docutils/docutils/writers/docutils_xml.py =================================================================== --- trunk/docutils/docutils/writers/docutils_xml.py 2012-01-12 10:14:43 UTC (rev 7314) +++ trunk/docutils/docutils/writers/docutils_xml.py 2012-01-18 10:16:20 UTC (rev 7315) @@ -1,18 +1,26 @@ # $Id$ -# Author: David Goodger <go...@py...> +# Author: David Goodger, Paul Tremblay, Guenter 
Milde +# Maintainer: doc...@li... # Copyright: This module has been placed in the public domain. """ -Simple internal document tree Writer, writes Docutils XML. +Simple document tree Writer, writes Docutils XML according to +http://docutils.sourceforge.net/docs/ref/docutils.dtd. """ __docformat__ = 'reStructuredText' +import sys +import xml.sax.saxutils +from StringIO import StringIO import docutils -from docutils import frontend, writers +from docutils import frontend, writers, nodes +class RawXmlError(docutils.ApplicationError): pass + + class Writer(writers.Writer): supported = ('xml',) @@ -20,9 +28,7 @@ settings_spec = ( '"Docutils XML" Writer Options', - 'Warning: In versions older than 2.7.3 and 3.2.3, the --newlines and ' - '--indents options may adversely affect whitespace; use them only ' - 'for reading convenience (see http://bugs.python.org/issue4147).', + None, (('Generate XML with newlines before and after tags.', ['--newlines'], {'action': 'store_true', 'validator': frontend.validate_boolean}), @@ -46,7 +52,20 @@ output = None """Final translated form of `document`.""" + def __init__(self): + writers.Writer.__init__(self) + self.translator_class = XMLTranslator + + def translate(self): + self.visitor = visitor = self.translator_class(self.document) + self.document.walkabout(visitor) + self.output = ''.join(visitor.output) + + +class XMLTranslator(nodes.GenericNodeVisitor): + xml_declaration = '<?xml version="1.0" encoding="%s"?>\n' + # TODO: add stylesheet options similar to HTML and LaTeX writers? 
#xml_stylesheet = '<?xml-stylesheet type="text/xsl" href="%s"?>\n' doctype = ( '<!DOCTYPE document PUBLIC' @@ -54,21 +73,107 @@ ' "http://docutils.sourceforge.net/docs/ref/docutils.dtd">\n') generator = '<!-- Generated by Docutils %s -->\n' - def translate(self): - settings = self.document.settings - indent = newline = '' + xmlparser = xml.sax.make_parser() + """SAX parser instance to check/extract raw XML.""" + xmlparser.setFeature( + "http://xml.org/sax/features/external-general-entities", True) + + def __init__(self, document): + nodes.NodeVisitor.__init__(self, document) + + # Reporter + self.warn = self.document.reporter.warning + self.error = self.document.reporter.error + + # Settings + self.settings = settings = document.settings + self.indent = self.newline = '' if settings.newlines: - newline = '\n' + self.newline = '\n' if settings.indents: - newline = '\n' - indent = ' ' - output_prefix = [] + self.newline = '\n' + self.indent = ' ' + self.level = 0 # indentation level + self.in_simple = 0 # level of nesting inside mixed-content elements + + # Output + self.output = [] if settings.xml_declaration: - output_prefix.append( + self.output.append( self.xml_declaration % settings.output_encoding) if settings.doctype_declaration: - output_prefix.append(self.doctype) - output_prefix.append(self.generator % docutils.__version__) - docnode = self.document.asdom().childNodes[0] - self.output = (''.join(output_prefix) - + docnode.toprettyxml(indent, newline)) + self.output.append(self.doctype) + self.output.append(self.generator % docutils.__version__) + + # initialize XML parser + self.the_handle=TestXml() + self.xmlparser.setContentHandler(self.the_handle) + + # generic visit and depart methods + # -------------------------------- + + def default_visit(self, node): + """Default node visit method.""" + if not self.in_simple: + self.output.append(self.indent*self.level) + self.output.append(node.starttag(xml.sax.saxutils.quoteattr)) + self.level += 1 + if 
isinstance(node, nodes.TextElement): + self.in_simple += 1 + if not self.in_simple: + self.output.append(self.newline) + + def default_departure(self, node): + """Default node depart method.""" + self.level -= 1 + if not self.in_simple: + self.output.append(self.indent*self.level) + self.output.append(node.endtag()) + if isinstance(node, nodes.TextElement): + self.in_simple -= 1 + if not self.in_simple: + self.output.append(self.newline) + + + # specific visit and depart methods + # --------------------------------- + + def visit_Text(self, node): + text = xml.sax.saxutils.escape(node.astext()) + self.output.append(text) + + def depart_Text(self, node): + pass + + def visit_raw(self, node): + if 'xml' not in node.get('format', '').split(): + # skip other raw content? + # raise nodes.SkipNode + self.default_visit(node) + return + # wrap in <raw> element + self.default_visit(node) # or not? + xml_string = node.astext() + self.output.append(xml_string) + self.default_departure(node) # or not? 
+ # Check validity of raw XML: + if isinstance(xml_string, unicode) and sys.version_info < (3,): + xml_string = xml_string.encode('utf8') + try: + self.xmlparser.parse(StringIO(xml_string)) + except xml.sax._exceptions.SAXParseException, error: + col_num = self.the_handle.locator.getColumnNumber() + line_num = self.the_handle.locator.getLineNumber() + srcline = node.line + if not isinstance(node.parent, nodes.TextElement): + srcline += 2 # directive content start line + msg = 'Invalid raw XML in column %d, line offset %d:\n%s' % ( + col_num, line_num, node.astext()) + self.warn(msg, source=node.source, line=srcline+line_num-1) + raise nodes.SkipNode # content already processed + + +class TestXml(xml.sax.ContentHandler): + + def setDocumentLocator(self, locator): + self.locator = locator Modified: trunk/docutils/test/test_writers/test_docutils_xml.py =================================================================== --- trunk/docutils/test/test_writers/test_docutils_xml.py 2012-01-12 10:14:43 UTC (rev 7314) +++ trunk/docutils/test/test_writers/test_docutils_xml.py 2012-01-18 10:16:20 UTC (rev 7315) @@ -6,15 +6,22 @@ """ Test for docutils XML writer. + +.. Attention:: + While the tests compare the output on the string-level, no guarantee + is given against changes to identical XML representations like + ``<empty></empty>`` vs. ``<empty/>``. The sample strings in this test + module mirrors the current behaviour of the docutils_xml writer. """ -from __init__ import DocutilsTestSupport +from StringIO import StringIO -import sys +from __init__ import DocutilsTestSupport # must be imported before docutils import docutils import docutils.core -# sample strings: +# sample strings +# -------------- source = u"""\ Test @@ -36,119 +43,160 @@ bodynormal = u"""\ <document source="<string>"><paragraph>Test</paragraph>\ -<transition/><paragraph>Test. \xe4\xf6\xfc€</paragraph>\ +<transition></transition><paragraph>Test. 
\xe4\xf6\xfc€</paragraph>\ </document>""" bodynewlines = u"""\ <document source="<string>"> <paragraph>Test</paragraph> -<transition/> +<transition> +</transition> <paragraph>Test. \xe4\xf6\xfc€</paragraph> </document> """ -bodynewlines_old = u"""\ +bodyindents = u"""\ <document source="<string>"> -<paragraph> -Test -</paragraph> -<transition/> -<paragraph> -Test. \xe4\xf6\xfc€ -</paragraph> + <paragraph>Test</paragraph> + <transition> + </transition> + <paragraph>Test. \xe4\xf6\xfc€</paragraph> </document> """ -bodyindents = u"""\ +# raw XML +# """"""" + +raw_xml_source = u"""\ +.. raw:: xml + + <root> + <child>Test \xe4\xf6\xfc\u20ac</child> + > + < + + </root> + +.. role:: xml(raw) + :format: xml + +:xml:`<test>inline raw XML</test>`. +""" + +raw_xml = u"""\ <document source="<string>"> - <paragraph>Test</paragraph> - <transition/> - <paragraph>Test. \xe4\xf6\xfc€</paragraph> +<raw format="xml" xml:space="preserve"><root> + <child>Test \xe4\xf6\xfc€</child> + > + < + +</root></raw> +<paragraph><raw classes="xml" format="xml" xml:space="preserve">\ +<test>inline raw XML</test></raw>.</paragraph> </document> """ -bodyindents_old = u"""\ +invalid_raw_xml_source = u"""\ +.. raw:: xml + + <root> + <child>Test \xe4\xf6\xfc\u20ac</child> + </mismatch> + +.. role:: xml(raw) + :format: xml + +:xml:`<test>inline raw XML</test>`. +""" + +invalid_raw_xml = u"""\ <document source="<string>"> - <paragraph> - Test - </paragraph> - <transition/> - <paragraph> - Test. \xe4\xf6\xfc€ - </paragraph> +<raw format="xml" xml:space="preserve"><root> + <child>Test \xe4\xf6\xfc\u20ac</child> +</mismatch></raw> +<paragraph><raw classes="xml" format="xml" xml:space="preserve">\ +<test>inline raw XML</test></raw>.</paragraph> </document> """ -# New formatting introduced in versions 2.7.3 and 3.2.3 on 2011-11-18 -# to fix http://bugs.python.org/issue4147 -# (Some distributions ship also earlier versions with this patch.) 
-if (sys.version_info < (2, 7, 3) or - sys.version_info[0] == 3 and sys.version_info < (3, 2, 3)): - whitespace_fix = False -else: - whitespace_fix = True -def publish_xml(settings): +def publish_xml(settings, source): return docutils.core.publish_string(source=source.encode('utf8'), reader_name='standalone', writer_name='docutils_xml', settings_overrides=settings) +# XML Test Case +# ------------- class DocutilsXMLTestCase(DocutilsTestSupport.StandardTestCase): settings = {'input_encoding': 'utf8', 'output_encoding': 'iso-8859-1', - '_disable_config': 1} + '_disable_config': True, + 'indents': False, + 'newlines': True, + 'xml_declaration': False, + 'doctype_declaration': False, + } def test_publish(self): - for self.settings['xml_declaration'] in True, False: - for self.settings['doctype_declaration'] in True, False: + settings = self.settings.copy() + settings['newlines'] = False + for settings['xml_declaration'] in True, False: + for settings['doctype_declaration'] in True, False: expected = u'' - if self.settings['xml_declaration']: + if settings['xml_declaration']: expected += xmldecl - if self.settings['doctype_declaration']: + if settings['doctype_declaration']: expected += doctypedecl expected += generatedby expected += bodynormal - result = publish_xml(self.settings) + result = publish_xml(settings, source) self.assertEqual(result, expected.encode('latin1')) def test_publish_indents(self): - self.settings['indents'] = True - self.settings['newlines'] = False - self.settings['xml_declaration'] = False - self.settings['doctype_declaration'] = False - result = publish_xml(self.settings) - - # New formatting introduced in versions 2.7.3 and 3.2.3 - if whitespace_fix: - expected = (generatedby + bodyindents).encode('latin1') - else: - expected = (generatedby + bodyindents_old).encode('latin1') - # Some distributions patch also earlier versions: - if (result != expected and not whitespace_fix): - expected = (generatedby + bodyindents).encode('latin1') - + 
settings = self.settings.copy() + settings['indents'] = True + result = publish_xml(settings, source) + expected = (generatedby + bodyindents).encode('latin1') self.assertEqual(result, expected) def test_publish_newlines(self): - self.settings['newlines'] = True - self.settings['indents'] = False - self.settings['xml_declaration'] = False - self.settings['doctype_declaration'] = False - result = publish_xml(self.settings) + settings = self.settings.copy() + result = publish_xml(settings, source) + expected = (generatedby + bodynewlines).encode('latin1') + self.assertEqual(result, expected) - # New formatting introduced in versions 2.7.3 and 3.2.3 - if whitespace_fix: - expected = (generatedby + bodynewlines).encode('latin1') - else: - expected = (generatedby + bodynewlines_old).encode('latin1') - # Some distributions patch also earlier versions: - if (result != expected and not whitespace_fix): - expected = (generatedby + bodynewlines).encode('latin1') + def test_raw_xml(self): + result = publish_xml(self.settings, raw_xml_source) + expected = (generatedby + + raw_xml).encode('latin1', 'xmlcharrefreplace') + self.assertEqual(result, expected) + def test_invalid_raw_xml(self): + warnings = StringIO() + settings = self.settings.copy() + settings['warning_stream'] = warnings + result = publish_xml(settings, invalid_raw_xml_source) + expected = (generatedby + + invalid_raw_xml).encode('latin1', 'xmlcharrefreplace') self.assertEqual(result, expected) + warnings.seek(0) + self.assertEqual(warnings.readlines(), + [u'<string>:5: ' + u'(WARNING/2) Invalid raw XML in column 2, line offset 3:\n', + u'<root>\n', + u' <child>Test \xe4\xf6\xfc\u20ac</child>\n', + u'</mismatch>\n', + u'<string>:10: ' + u'(WARNING/2) Invalid raw XML in column 30, line offset 1:\n', + u'<test>inline raw XML</test>\n']) + # abort with SystemMessage if halt_level is "warning" (2): + settings['halt_level'] = 2 + settings['warning_stream'] = '' + self.assertRaises(docutils.utils.SystemMessage, + publish_xml, 
settings, invalid_raw_xml_source) if __name__ == '__main__': This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |