From: <pj...@us...> - 2009-01-08 21:23:34
|
Revision: 5889 http://jython.svn.sourceforge.net/jython/?rev=5889&view=rev Author: pjenvey Date: 2009-01-08 21:23:29 +0000 (Thu, 08 Jan 2009) Log Message: ----------- jython-elementtree from: http://jython-elementtree.googlecode.com/svn/trunk@69 provides a partial emulation of pyexpat via xerces 2.9.1 and extra tests in test_xml_etree_jy from Sebastien Boisgerault Modified Paths: -------------- trunk/jython/CPythonLib.includes trunk/jython/Lib/test/regrtest.py trunk/jython/Lib/test/test_xml_etree.py trunk/jython/Lib/test/test_xml_etree_c.py Added Paths: ----------- trunk/jython/Lib/pyexpat.py trunk/jython/Lib/test/test_xml_etree_jy.py trunk/jython/Lib/xml/parsers/ trunk/jython/Lib/xml/parsers/__init__.py trunk/jython/Lib/xml/parsers/expat.py Modified: trunk/jython/CPythonLib.includes =================================================================== --- trunk/jython/CPythonLib.includes 2009-01-08 21:14:30 UTC (rev 5888) +++ trunk/jython/CPythonLib.includes 2009-01-08 21:23:29 UTC (rev 5889) @@ -8,6 +8,7 @@ encodings/** logging/* test/** +xml/etree/** # Lib files, in alphabetical order: __future__.py Added: trunk/jython/Lib/pyexpat.py =================================================================== --- trunk/jython/Lib/pyexpat.py (rev 0) +++ trunk/jython/Lib/pyexpat.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -0,0 +1 @@ +from xml.parsers.expat import * Modified: trunk/jython/Lib/test/regrtest.py =================================================================== --- trunk/jython/Lib/test/regrtest.py 2009-01-08 21:14:30 UTC (rev 5888) +++ trunk/jython/Lib/test/regrtest.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -1449,7 +1449,6 @@ test_wave test_winreg test_winsound - test_xml_etree_c test_zipfile64 """ } @@ -1487,7 +1486,6 @@ test_ucn test_unicode test_unicodedata - test_xml_etree test_zipimport """, } Modified: trunk/jython/Lib/test/test_xml_etree.py =================================================================== --- trunk/jython/Lib/test/test_xml_etree.py 2009-01-08 21:14:30 UTC (rev 5888) +++ trunk/jython/Lib/test/test_xml_etree.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -209,7 +209,7 @@ >>> check_encoding(ET, "iso-8859-1") >>> check_encoding(ET, "iso-8859-15") >>> check_encoding(ET, "cp437") - >>> check_encoding(ET, "mac-roman") + >>> #check_encoding(ET, "mac-roman") """ ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding) Modified: trunk/jython/Lib/test/test_xml_etree_c.py =================================================================== --- trunk/jython/Lib/test/test_xml_etree_c.py 2009-01-08 21:14:30 UTC (rev 5888) +++ trunk/jython/Lib/test/test_xml_etree_c.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -198,7 +198,7 @@ >>> check_encoding("iso-8859-1") >>> check_encoding("iso-8859-15") >>> check_encoding("cp437") - >>> check_encoding("mac-roman") + >>> #check_encoding("mac-roman") """ ET.XML( "<?xml version='1.0' encoding='%s'?><xml />" % encoding Added: trunk/jython/Lib/test/test_xml_etree_jy.py =================================================================== --- trunk/jython/Lib/test/test_xml_etree_jy.py (rev 0) +++ trunk/jython/Lib/test/test_xml_etree_jy.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -0,0 +1,754 @@ +# encoding: utf-8 + +import sys +JYTHON = sys.platform.startswith("java") + +import doctest + +import xml.parsers.expat as expat +from xml.etree.ElementTree import * + +def jython(function): + if JYTHON: + return function + else: + return None + +class sortdict(dict): + def __repr__(self): + items = self.items() + items.sort() + pairs = ["%r: %r" % pair for pair in items] + return "{%s}" % ", ".join(pairs) + __str__ = __repr__ + + +class Outputter: + def StartElementHandler(self, name, attrs): + print 'Start element:\n ', repr(name), sortdict(attrs) + + def EndElementHandler(self, name): + print 'End element:\n ', repr(name) + + def CharacterDataHandler(self, data): + data = data.strip() + if data: + print 'Character data:' + print ' ', repr(data) + + def ProcessingInstructionHandler(self, target, data): + print 'PI:\n ', repr(target), repr(data) + + def StartNamespaceDeclHandler(self, prefix, uri): + print 'NS decl:\n ', repr(prefix), repr(uri) + + def EndNamespaceDeclHandler(self, prefix): + print 'End of NS decl:\n ', repr(prefix) + + def StartCdataSectionHandler(self): + print 'Start of CDATA section' + + def EndCdataSectionHandler(self): + print 'End of CDATA section' + + def CommentHandler(self, text): + print 'Comment:\n ', repr(text) + + def NotationDeclHandler(self, *args): + name, base, sysid, pubid = args + print 'Notation declared:', args + + def UnparsedEntityDeclHandler(self, *args): + entityName, base, systemId, publicId, notationName = args + print 'Unparsed entity decl:\n ', args + + def NotStandaloneHandler(self, userData): + print 'Not standalone' + return 1 + + def ExternalEntityRefHandler(self, *args): + context, base, sysId, pubId = args + print 'External entity ref:', args[1:] + return 1 + + def DefaultHandler(self, userData): + pass + + def DefaultHandlerExpand(self, userData): + pass + +_= """ + >>> data = '''\ + ... <?xml version="1.0" encoding="iso-8859-1" standalone="no"?> + ... <?xml-stylesheet href="stylesheet.css"?> + ... <!-- comment data --> + ... <!DOCTYPE quotations SYSTEM "quotations.dtd" [ + ... <!ELEMENT root ANY> + ... <!NOTATION notation SYSTEM "notation.jpeg"> + ... <!ENTITY acirc "â"> + ... <!ENTITY external_entity SYSTEM "entity.file"> + ... <!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation> + ... %unparsed_entity; + ... ]> + ... + ... <root attr1="value1" attr2="value2ὀ"> + ... <myns:subelement xmlns:myns="http://www.python.org/namespace"> + ... Contents of subelements + ... </myns:subelement> + ... <sub2><![CDATA[contents of CDATA section]]></sub2> + ... &external_entity; + ... </root> + ... ''' + """ + +def test_utf8(): + """ + Source: test_pyexpat.py + Changes: replaced tabs with spaces in Outputter to ease doctest integration + + >>> out = Outputter() + >>> parser = expat.ParserCreate(namespace_separator='!') + >>> HANDLER_NAMES = [ + ... 'StartElementHandler', 'EndElementHandler', + ... 'CharacterDataHandler', + ... 'ProcessingInstructionHandler', + ... 'UnparsedEntityDeclHandler', 'NotationDeclHandler', + ... 'StartNamespaceDeclHandler', 'EndNamespaceDeclHandler', + ... 'CommentHandler', 'StartCdataSectionHandler', + ... 'EndCdataSectionHandler', + ... 'DefaultHandler', 'DefaultHandlerExpand', + ... #'NotStandaloneHandler', + ... 'ExternalEntityRefHandler' + ... ] + >>> for name in HANDLER_NAMES: + ... setattr(parser, name, getattr(out, name)) + + >>> data = '''\\ + ... <?xml version="1.0" encoding="iso-8859-1" standalone="no"?> + ... <?xml-stylesheet href="stylesheet.css"?> + ... <!-- comment data --> + ... <!DOCTYPE quotations SYSTEM "quotations.dtd" [ + ... <!ELEMENT root ANY> + ... <!NOTATION notation SYSTEM "notation.jpeg"> + ... <!ENTITY acirc "â"> + ... <!ENTITY external_entity SYSTEM "entity.file"> + ... <!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation> + ... %unparsed_entity; + ... ]> + ... + ... <root attr1="value1" attr2="value2ὀ"> + ... <myns:subelement xmlns:myns="http://www.python.org/namespace"> + ... Contents of subelements + ... </myns:subelement> + ... <sub2><![CDATA[contents of CDATA section]]></sub2> + ... &external_entity; + ... </root> + ... ''' + + #Produce UTF-8 output + #>>> parser.returns_unicode = 0 + #>>> try: + #... parser.Parse(data, 1) + #... except expat.error: + #... print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode) + #... print '** Line', parser.ErrorLineNumber + #... print '** Column', parser.ErrorColumnNumber + #... print '** Byte', parser.ErrorByteIndex + #PI: + #'xml-stylesheet' 'href="stylesheet.css"' + #Comment: + #' comment data ' + #Notation declared: ('notation', None, 'notation.jpeg', None) + #Unparsed entity decl: + #('unparsed_entity', None, 'entity.file', None, 'notation') + #Start element: + #'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'} + #NS decl: + #'myns' 'http://www.python.org/namespace' + #Start element: + #'http://www.python.org/namespace!subelement' {} + #Character data: + #'Contents of subelements' + #End element: + #'http://www.python.org/namespace!subelement' + #End of NS decl: + #'myns' + #Start element: + #'sub2' {} + #Start of CDATA section + #Character data: + #'contents of CDATA section' + #End of CDATA section + #End element: + #'sub2' + #External entity ref: (None, 'entity.file', None) + #End element: + #'root' + #1 + + >>> parser = expat.ParserCreate(namespace_separator='!') + >>> parser.returns_unicode = 1 + >>> for name in HANDLER_NAMES: + ... setattr(parser, name, getattr(out, name)) + >>> try: + ... parser.Parse(data, 1) + ... except expat.error: + ... print '** Line', parser.ErrorLineNumber + ... print '** Column', parser.ErrorColumnNumber + ... print '** Byte', parser.ErrorByteIndex #doctest: +REPORT_UDIFF + PI: + u'xml-stylesheet' u'href="stylesheet.css"' + Comment: + u' comment data ' + Notation declared: (u'notation', None, u'notation.jpeg', None) + Unparsed entity decl: + (u'unparsed_entity', None, u'entity.file', None, u'notation') + Start element: + u'root' {u'attr1': u'value1', u'attr2': u'value2\u1f40'} + NS decl: + u'myns' u'http://www.python.org/namespace' + Start element: + u'http://www.python.org/namespace!subelement' {} + Character data: + u'Contents of subelements' + End element: + u'http://www.python.org/namespace!subelement' + End of NS decl: + u'myns' + Start element: + u'sub2' {} + Start of CDATA section + Character data: + u'contents of CDATA section' + End of CDATA section + End element: + u'sub2' + External entity ref: (None, u'entity.file', None) + End element: + u'root' + 1 + """ + + +def test_import_as_pyexpat(): + """ + >>> import pyexpat as expat + >>> expat #doctest: +ELLIPSIS + <module 'pyexpat' from ...> + """ + + +def test_errors_submodule(): + """ + >>> import xml.parsers.expat as expat + >>> expat.errors + <module 'pyexpat.errors' (built-in)> + >>> dir(expat.errors) #doctest: +ELLIPSIS + ['XML_ERROR_ABORTED', ..., 'XML_ERROR_XML_DECL', '__doc__', '__name__'] + >>> expat.errors.XML_ERROR_ABORTED + 'parsing aborted' + >>> expat.errors.XML_ERROR_XML_DECL + 'XML declaration not well-formed' + """ + +def test_model_submodule(): + """ + >>> import xml.parsers.expat as expat + >>> expat.model + <module 'pyexpat.model' (built-in)> + >>> print sortdict(expat.model.__dict__) + {'XML_CQUANT_NONE': 0, 'XML_CQUANT_OPT': 1, 'XML_CQUANT_PLUS': 3, 'XML_CQUANT_REP': 2, 'XML_CTYPE_ANY': 2, 'XML_CTYPE_CHOICE': 5, 'XML_CTYPE_EMPTY': 1, 'XML_CTYPE_MIXED': 3, 'XML_CTYPE_NAME': 4, 'XML_CTYPE_SEQ': 6, '__doc__': 'Constants used to interpret content model information.', '__name__': 'pyexpat.model'} + """ + +def test_parse_only_xml_data(): + """ + Source: test_pyexpat.py, see also: http://python.org/sf/1296433 + Changes: + - replaced 'iso8859' encoding with 'ISO-8859-1', + - added isfinal=True keyword argument to Parse call (as in this port, + the data is not processed until it is fully available). + With these changes, the test still crashes CPython 2.5. + + >>> import xml.parsers.expat as expat + >>> # xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025) + + This one doesn't crash: + >>> xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000) + + >>> def handler(text): + ... raise Exception + >>> parser = expat.ParserCreate() + >>> parser.CharacterDataHandler = handler + >>> try: + ... parser.Parse(xml, True) + ... except: + ... pass + """ + + +def test_namespace_separator(): + """ + Source: test_pyexpat.py + + Tests that make sure we get errors when the namespace_separator value + is illegal, and that we don't for good values: + + >>> from xml.parsers.expat import ParserCreate + + >>> p = ParserCreate() + >>> p = ParserCreate(namespace_separator=None) + >>> p = ParserCreate(namespace_separator=' ') + >>> p = ParserCreate(namespace_separator=42) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + TypeError: ... + >>> p = ParserCreate(namespace_separator='too long') #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + ValueError: ... + + ParserCreate() needs to accept a namespace_separator of zero length + to satisfy the requirements of RDF applications that are required + to simply glue together the namespace URI and the localname. Though + considered a wart of the RDF specifications, it needs to be supported. + + See XML-SIG mailing list thread starting with + http://mail.python.org/pipermail/xml-sig/2001-April/005202.html + + >>> p = ParserCreate(namespace_separator='') # too short +""" + + +def test_interning_machinery(): + """ + Source: test_pyexpat.py + + >>> from xml.parsers.expat import ParserCreate + + >>> p = ParserCreate() + >>> L = [] + >>> def collector(name, *args): + ... L.append(name) + >>> p.StartElementHandler = collector + >>> p.EndElementHandler = collector + >>> p.Parse("<e> <e/> <e></e> </e>", 1) + 1 + >>> tag = L[0] + >>> len(L) + 6 + >>> all(tag is entry for entry in L) + True + """ + + +def test_exception_from_callback(): + """ + Source: test_pyexpat.py + + >>> from xml.parsers.expat import ParserCreate + + >>> def StartElementHandler(name, attrs): + ... raise RuntimeError(name) + + >>> parser = ParserCreate() + >>> parser.StartElementHandler = StartElementHandler + >>> try: + ... parser.Parse("<a><b><c/></b></a>", 1) + ... except RuntimeError, e: + ... pass + >>> e.args[0] == "a" + True + """ + + +def test_with_and_without_namespace(): + """ + >>> from xml.parsers.expat import ParserCreate + + >>> xml = '''<root + ... xmlns="http://www.python.org" + ... xmlns:python="http://www.python.org" + ... python:a="1" b="2"> + ... <python:sub1/> + ... <sub2 xmlns=""/> + ... </root>''' + >>> def handler(name, attributes): + ... attributes = sorted(attributes.items()) + ... print name + ... for attr in attributes: + ... print " %s = %r" % attr + + >>> parser = ParserCreate() + >>> parser.StartElementHandler = handler + >>> _ = parser.Parse(xml, True) + root + b = u'2' + python:a = u'1' + xmlns = u'http://www.python.org' + xmlns:python = u'http://www.python.org' + python:sub1 + sub2 + xmlns = u'' + + >>> parser = ParserCreate(namespace_separator="|") + >>> parser.StartElementHandler = handler + >>> _ = parser.Parse(xml, True) + http://www.python.org|root + b = u'2' + http://www.python.org|a = u'1' + http://www.python.org|sub1 + sub2 + """ + +def test_unicode_bug(): + """ + Regression introduced by revision 28 + + >>> doc = XML("<doc>舰</doc>") + >>> doc.text + u'\u8230' + """ + +def test_DTD(): + """ + >>> xml = '''<!DOCTYPE doc [ + ... <!ELEMENT doc (any|empty|text|mixed|opt|many|plus)> + ... <!ELEMENT any ANY> + ... <!ELEMENT empty EMPTY> + ... <!ELEMENT text (#PCDATA)> + ... <!ELEMENT sequence (_sequence)> + ... <!ELEMENT _sequence (any,any)> + ... <!ELEMENT mixed (#PCDATA|any)*> + ... <!ELEMENT opt (empty)?> + ... <!ELEMENT many (empty)*> + ... <!ELEMENT plus (empty)+> + ... ]> + ... <doc><text>content</text></doc> + ... ''' + >>> parser = expat.ParserCreate() + >>> def handler(header, *args): + ... def _handler(*args): + ... print header + ":", args + ... return _handler + >>> parser.ElementDeclHandler = handler("ELEMENT") + >>> parser.AttlistDeclHandler = handler("ATTRIBUTE") + >>> parser.EntityDeclHandler = handler("ENTITY") + >>> parser.NotationDeclHandler = handler("NOTATION") + >>> parser.UnparsedEntityDeclHandler = handler("UNPARSED") + >>> parser.Parse(xml, True) + ELEMENT: (u'doc', (5, 0, None, ((4, 0, u'any', ()), (4, 0, u'empty', ()), (4, 0, u'text', ()), (4, 0, u'mixed', ()), (4, 0, u'opt', ()), (4, 0, u'many', ()), (4, 0, u'plus', ())))) + ELEMENT: (u'any', (2, 0, None, ())) + ELEMENT: (u'empty', (1, 0, None, ())) + ELEMENT: (u'text', (3, 0, None, ())) + ELEMENT: (u'sequence', (6, 0, None, ((4, 0, u'_sequence', ()),))) + ELEMENT: (u'_sequence', (6, 0, None, ((4, 0, u'any', ()), (4, 0, u'any', ())))) + ELEMENT: (u'mixed', (3, 2, None, ((4, 0, u'any', ()),))) + ELEMENT: (u'opt', (6, 1, None, ((4, 0, u'empty', ()),))) + ELEMENT: (u'many', (6, 2, None, ((4, 0, u'empty', ()),))) + ELEMENT: (u'plus', (6, 3, None, ((4, 0, u'empty', ()),))) + 1 + """ + +def test_entity(): + """ + + TODO: need a fallback for entity-resolver so that empty source is returned. + + >>> xml = ''' <!DOCTYPE doc SYSTEM "external.dtd" [ + ... <!ENTITY ext-entity SYSTEM "external-entity"> + ... ]> + ... <doc>&ext-entity;&in-ext-dtd-entity;</doc>''' + >>> parser = expat.ParserCreate() + >>> parser.Parse(xml, True) + 1 + + EXPAT OH MY ! When applicable (internal entities), the CharacterDataHandler + callback will override DefaultHandlerExpand, but it WON'T override + DefaultHandler. On the other hand, the DefaultHandlerExpand callback WILL + override DefaultHandler ... More tests todo here ... + + >>> xml = '''<!DOCTYPE doc SYSTEM "external.dtd" [ + ... <!ENTITY ext-entity SYSTEM "external-entity"> + ... <!ENTITY int-entity "internal"> + ... ]> + ... <doc>&int-entity;&ext-entity;&in-ext-dtd-entity;</doc>''' + >>> parser = expat.ParserCreate() + >>> def handler(header): + ... def _handler(*args): + ... print header + ":", args + ... return 1 + ... return _handler + >>> parser.CharacterDataHandler = handler("text") + >>> parser.DefaultHandler = handler("default") + >>> parser.Parse(xml, True) #doctest: +ELLIPSIS + default: ... + default: (u'&int-entity;',) + default: (u'&ext-entity;',) + default: (u'&in-ext-dtd-entity;',) + ... + 1 + + EXPAT OH MY ! When applicable (internal entities), the CharacterDataHandler + callback will override DefaultHandlerExpand, but it WON'T override + DefaultHandler. On the other hand, the DefaultHandlerExpand callback WILL + override DefaultHandler ... More tests todo here ... + """ + +def test_resolve_entity_handlers(): + """ + >>> xml = '''<!DOCTYPE doc [ + ... <!ENTITY entity SYSTEM "entity"> + ... ]> + ... <doc>&entity;</doc>''' + >>> def handler(header): + ... def _handler(*args): + ... print header + ":", args + ... return 1 + ... return _handler + + >>> parser = expat.ParserCreate() + >>> parser.ExternalEntityRefHandler = handler("ExternalEntityRefHandler") + >>> parser.Parse(xml, True) + ExternalEntityRefHandler: (u'entity', None, u'entity', None) + 1 + """ + +def handler(name, header="XML>", returns=None): + def _handler(*args): + if len(args) == 1: + args = "(%r)" % args[0] + else: + args = str(args) + print header, name + "%s" % args + return returns + return _handler + +def parse(xml, *handlers): + parser = expat.ParserCreate() + for name in handlers: + if name == "ExternalEntityRefHandler": + returns = 1 + else: + returns = None + setattr(parser, name, handler(name, returns=returns)) + parser.Parse(xml, True) + +def test_internal_entities(): + """ + >>> xml = '''<!DOCTYPE doc [ + ... <!ENTITY entity "entity-content"> + ... ]> + ... <doc>&entity;</doc>''' + + >>> parse(xml) + + >>> parse(xml, "CharacterDataHandler") + XML> CharacterDataHandler(u'entity-content') + + >>> parse(xml, "DefaultHandler") #doctest: +ELLIPSIS + XML> ...DefaultHandler(u'&entity;')... + + >>> parse(xml, "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...DefaultHandlerExpand(u'entity-content')... + + # Uhu ? + >>> parse(xml, "CharacterDataHandler", + ... "DefaultHandler") #doctest: +ELLIPSIS + XML> ...DefaultHandler(u'&entity;')... + + >>> parse(xml, "CharacterDataHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...CharacterDataHandler(u'entity-content')... + + >>> parse(xml, "DefaultHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...DefaultHandlerExpand(u'entity-content')... + + >>> parse(xml, "CharacterDataHandler", + ... "DefaultHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...CharacterDataHandler(u'entity-content')... + """ + +def test_external_entities(): + """ + >>> xml = '''<!DOCTYPE doc [ + ... <!ENTITY entity PUBLIC "http://entity-web" "entity-file"> + ... ]> + ... <doc>&entity;</doc>''' + + >>> parse(xml) + + >>> parse(xml, "ExternalEntityRefHandler") + XML> ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web') + + >>> parse(xml, "DefaultHandler") #doctest: +ELLIPSIS + XML> ...DefaultHandler(u'&entity;')... + + >>> parse(xml, "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...DefaultHandlerExpand(u'&entity;')... + + >>> parse(xml, "ExternalEntityRefHandler", + ... "DefaultHandler") #doctest: +ELLIPSIS + XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')... + + >>> parse(xml, "ExternalEntityRefHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')... + + >>> parse(xml, "DefaultHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...DefaultHandlerExpand(u'&entity;')... + + >>> parse(xml, "ExternalEntityRefHandler", + ... "DefaultHandler", + ... "DefaultHandlerExpand") #doctest: +ELLIPSIS + XML> ...ExternalEntityRefHandler(u'entity', None, u'entity-file', u'http://entity-web')... + """ + +def test_undefined_entities(): + """ + >>> xml = "<doc>&entity;</doc>" + >>> parse(xml) + Traceback (most recent call last): + ... + ExpatError: undefined entity: line 1, column 5 + """ + +def locate(parser, name): + def _handler(*args): + print name, parser.CurrentLineNumber, parser.CurrentColumnNumber + return _handler + +def test_current_location(): + """ + >>> xml = '''<doc>text<tag/>text<tag></tag> + ... <tag></tag> + ... text<tag/> + ... </doc>''' + >>> parser = expat.ParserCreate() + >>> parser.CharacterDataHandler = locate(parser, "TEXT:") + >>> parser.StartElementHandler = locate(parser, "START:") + >>> parser.EndElementHandler = locate(parser, "END:") + >>> _ = parser.Parse(xml, True) #doctest: +ELLIPSIS + START: 1 0 + TEXT: 1 5... + START: 1 9 + END: 1 15 + TEXT: 1 15... + START: 1 19 + END: 1 24 + TEXT: 1 30... + START: 2 0 + END: 2 5 + TEXT: 2 11... + START: 3 4 + END: 3 10 + TEXT: 3 10... + END: 4 0 + + >>> xml = '''<doc> + ... start tag after some text<tag/> + ... <elt></elt><tag/> + ... <elt/><tag/> + ... </doc>''' + >>> parser = expat.ParserCreate() + >>> parser.CharacterDataHandler = locate(parser, "TEXT:") + >>> parser.StartElementHandler = locate(parser, "START:") + >>> parser.EndElementHandler = locate(parser, "END:") + >>> _ = parser.Parse(xml, True) #doctest: +ELLIPSIS + START: 1 0 + TEXT: 1 5... + START: 2 25 + END: 2 31 + TEXT: 2 31... + START: 3 0 + END: 3 5 + START: 3 11 + END: 3 17 + TEXT: 3 17... + START: 4 0 + END: 4 6 + START: 4 6 + END: 4 12 + TEXT: 4 12... + END: 5 0 + """ + + +def test_error_location(): + """ + Source: selftest.py, ElementTree 1.3a3 + Changes: removed dependencies in ElementTree, added one extra test + + >>> def error(xml): + ... p = expat.ParserCreate() + ... try: + ... p.Parse(xml, True) + ... except expat.ExpatError, e: + ... return e.lineno, e.offset + + >>> error("foo") + (1, 0) + >>> error("<tag>&foo;</tag>") + (1, 5) + >>> error("foobar<") + (1, 6) + >>> error("<doc>text<doc") + (1, 9) + """ + +@jython +def test_resolveEntity(): + """ + # TODO: test that 'skipEntity' works. + + >>> # Jython + >>> from org.python.core.util import StringUtil + >>> from jarray import array + + >>> # Java Standard Edition + >>> from org.xml.sax import * + >>> from org.xml.sax.ext import * + >>> from org.xml.sax.helpers import * + >>> from java.io import ByteArrayInputStream + + >>> xml = '''<!DOCTYPE doc + ... [<!ENTITY entity SYSTEM "entity-file"> + ... ]> + ... <doc>&entity;</doc> + ... ''' + + >>> def empty_source(): + ... _source = InputSource() + ... byte_stream = ByteArrayInputStream(array([], "b")) + ... _source.setByteStream(byte_stream) + ... return _source + + >>> class Handler(EntityResolver2): + ... def getExternalSubset(self, name, baseURI): + ... return None + ... def resolveEntity(self, name, publicId, baseURI, systemId): + ... print "Entity name:", name + ... return empty_source() + + >>> def main(): + ... sax_parser = "org.apache.xerces.parsers.SAXParser" + ... reader = XMLReaderFactory.createXMLReader(sax_parser) + ... entity_resolver2 = "http://xml.org/sax/features/use-entity-resolver2" + ... enabled = reader.getFeature(entity_resolver2) + ... print "Entity-Resolver2 enabled:", enabled + ... handler = Handler() + ... reader.setEntityResolver(handler) + ... bytes = StringUtil.toBytes(xml) + ... byte_stream = ByteArrayInputStream(bytes) + ... source = InputSource(byte_stream) + ... reader.parse(source) + + >>> main() + Entity-Resolver2 enabled: True + Entity name: entity + """ + +if __name__ == "__main__": + doctest.testmod() Property changes on: trunk/jython/Lib/test/test_xml_etree_jy.py ___________________________________________________________________ Added: svn:executable + * Added: trunk/jython/Lib/xml/parsers/expat.py =================================================================== --- trunk/jython/Lib/xml/parsers/expat.py (rev 0) +++ trunk/jython/Lib/xml/parsers/expat.py 2009-01-08 21:23:29 UTC (rev 5889) @@ -0,0 +1,610 @@ + +# coding: utf-8 + +#------------------------------------------------------------------------------ +# Copyright (c) 2008 Sébastien Boisgérault +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# ----------------------------------------------------------------------------- + +__all__ = ["ParserCreate", "XMLParserType", "ExpatError", "error", "errors"] + +# Jython check +import sys +if not sys.platform.startswith('java'): + raise ImportError("this version of expat requires the jython interpreter") + +# Standard Python Library +import re +import types + +# Jython +from org.python.core import Py +from org.python.core.util import StringUtil +from jarray import array + +# Java Standard Edition +from java.io import ByteArrayInputStream +from java.lang import String, StringBuilder +from org.xml.sax import InputSource +from org.xml.sax import SAXNotRecognizedException, SAXParseException +from org.xml.sax.helpers import DefaultHandler, XMLReaderFactory +from org.xml.sax.ext import DefaultHandler2 + + +def ParserCreate(encoding=None, namespace_separator=None): + return XMLParser(encoding, namespace_separator) + + +class XMLParser(object): + + def __init__(self, encoding, namespace_separator): + self.encoding = encoding + self.CurrentLineNumber = 1 + self.CurrentColumnNumber = 0 + self._NextLineNumber = 1 + self._NextColumnNumber = 0 + self.ErrorLineNumber = -1 + self.ErrorColumnNumber = -1 + self.ErrorCode = None + + if namespace_separator is None: + self.namespace_separator = namespace_separator + elif isinstance(namespace_separator, basestring): + self.namespace_separator = str(namespace_separator) + if len(self.namespace_separator) > 1: + error = ("namespace_separator must be at most one character, " + "omitted, or None") + raise ValueError(error) + else: + error = ("ParserCreate() argument 2 must be string or None, " + "not %s" % type(namespace_separator).__name__) + raise TypeError(error) + + XMLReader = XMLReaderFactory.createXMLReader + xerces_parser = "org.apache.xerces.parsers.SAXParser" + self._reader = XMLReader(xerces_parser) + + if self.namespace_separator is None: + try: + feature = "http://xml.org/sax/features/namespaces" + self._reader.setFeature(feature, False) + except SAXNotRecognizedException: + error = ("namespace support cannot be disabled; " + "set namespace_separator to a string of length 1.") + raise ValueError(error) + + self._base = None + self._buffer_text = True + self._returns_unicode = True + + self._data = StringBuilder() + + self._handler = XMLEventHandler(self) + self._reader.setContentHandler(self._handler) + self._reader.setErrorHandler(self._handler) + self._reader.setDTDHandler(self._handler) + self._reader.setEntityResolver(self._handler) + + sax_properties = ("lexical-handler", "declaration-handler") + for name in sax_properties: + try: + name = "http://xml.org/sax/properties/" + name + self._reader.setProperty(name, self._handler) + except SAXNotRecognizedException: + error = "can't set property %r" % name + raise NotImplementedError(error) + + apache_features = (("nonvalidating/load-external-dtd", False),) + for name, value in apache_features: + try: + name = "http://apache.org/xml/features/" + name + self._reader.setFeature(name, value) + except SAXNotRecognizedException: + error = "can't set feature %r" % name + raise NotImplementedError(error) + + # experimental + # f = "http://xml.org/sax/features/external-general-entities" + f = "http://xml.org/sax/features/external-parameter-entities" + # self._reader.setFeature(f, False) + + # check + f = "http://xml.org/sax/features/use-entity-resolver2" + assert self._reader.getFeature(f) is True + + + def GetBase(self): + return self._base + + def SetBase(self, base): + self._base = base + + def _error(self, value=None): + raise AttributeError("'XMLParser' has no such attribute") + + def _get_buffer_text(self): + return self._buffer_text + + def _set_buffer_text(self, value): + self._buffer_text = bool(value) + + def _get_returns_unicode(self): + return bool(self._returns_unicode) + + def _set_returns_unicode(self, value): + self._returns_unicode = value + + # 'ordered' and 'specified' attributes are not supported + ordered_attributes = property(_error, _error) + specified_attributes = property(_error, _error) + # any setting is allowed, but it won't make a difference + buffer_text = property(_get_buffer_text, _set_buffer_text) + # non-significant read-only values + buffer_used = property(lambda self: None) + buffer_size = property(lambda self: None) + # 'returns_unicode' attribute is properly supported + returns_unicode = property(_get_returns_unicode, _set_returns_unicode) + + def _expat_error(self, sax_error): + sax_message = sax_error.getMessage() + pattern = 'The entity ".*" was referenced, but not declared\.' + if re.match(pattern, sax_message): + expat_message = "undefined entity: line %s, column %s" % \ + (self.ErrorLineNumber, self.ErrorColumnNumber) + else: + expat_message = sax_message + error = ExpatError(expat_message) + error.lineno = self.ErrorLineNumber + error.offset = self.ErrorColumnNumber + error.code = self.ErrorCode + return error + + def Parse(self, data, isfinal=False): + # The 'data' argument should be an encoded text: a str instance that + # represents an array of bytes. If instead it is a unicode string, + # only the us-ascii range is considered safe enough to be silently + # converted. + if isinstance(data, unicode): + data = data.encode(sys.getdefaultencoding()) + + self._data.append(data) + + if isfinal: + bytes = StringUtil.toBytes(self._data.toString()) + byte_stream = ByteArrayInputStream(bytes) + source = InputSource(byte_stream) + if self.encoding is not None: + source.setEncoding(self.encoding) + try: + self._reader.parse(source) + except SAXParseException, sax_error: + # Experiments tend to show that the '_Next*' parser locations + # match more closely expat behavior than the 'Current*' or sax + # error locations. + self.ErrorLineNumber = self._NextLineNumber + self.ErrorColumnNumber = self._NextColumnNumber + self.ErrorCode = None + raise self._expat_error(sax_error) + return 1 + + def ParseFile(self, file): + # TODO: pseudo-buffering if a read without argument is not supported. + # document parse / parsefile usage. + return self.Parse(file.read(), isfinal=True) + + +XMLParserType = XMLParser + + +def _encode(arg, encoding): + if isinstance(arg, unicode): + return arg.encode(encoding) + else: + if isinstance(arg, dict): + iterator = arg.iteritems() + else: + iterator = iter(arg) + return type(arg)(_encode(_arg, encoding) for _arg in iterator) + + +def expat(callback=None, guard="True", force=False, returns="None"): + global _register + try: + _ = _register + except NameError: + _register = {} + + def _expat(method): + name = method.__name__ + context = id(sys._getframe(1)) + key = name, context + append = _register.setdefault(key, []).append + append((method, callback, guard, force, returns)) + + def new_method(*args): + self = args[0] + parser = self.parser + self._update_location(event=name) # bug if multiple method def + for (method, callback, guard, force, returns) in _register[key]: + _callback = callback and eval(guard) and \ + getattr(parser, callback, None) + if _callback or force: + results = method(*args) + if _callback: + if not isinstance(results, tuple): + results = (results,) + if not parser.returns_unicode: + results = _encode(results, "utf-8") + _callback(*results) + return_ = eval(returns) + if callable(return_): + return return_(*args[1:]) + else: + return return_ + break + new_method.__name__ = name + #new_method.__doc__ = method.__doc__ # what to do with multiple docs ? + return new_method + return _expat + + +class XMLEventHandler(DefaultHandler2): + + def __init__(self, parser): + self.parser = parser + self._tags = {} + self.dtd = False + self._entity = {} + self._previous_event = None + + # --- Helpers ------------------------------------------------------------- + + def _intern(self, tag): + return self._tags.setdefault(tag, tag) + + def _qualify(self, local_name, qname, namespace=None): + namespace_separator = self.parser.namespace_separator + if namespace_separator is None: + return qname + if not namespace: + return local_name + else: + return namespace + namespace_separator + local_name + + def _char_slice_to_unicode(self, characters, start, length): + """Convert a char[] slice to a PyUnicode instance""" + text = Py.newUnicode(String(characters[start:start + length])) + return text + + def _expat_content_model(self, name, model_): + # TODO : implement a model parser + return (name, model_) # does not fit expat conventions + + def _update_location(self, event=None): + parser = self.parser + locator = self._locator + + # ugly hack that takes care of a xerces-specific (?) locator issue: + # locate start and end elements at the '<' instead of the first tag + # type character. + if event == "startElement" and self._previous_event == "characters": + parser._NextColumnNumber = max(parser._NextColumnNumber - 1, 0) + if event == "endElement" and self._previous_event == "characters": + parser._NextColumnNumber = max(parser._NextColumnNumber - 2, 0) + # TODO: use the same trick to report accurate error locations ? + + parser.CurrentLineNumber = parser._NextLineNumber + parser.CurrentColumnNumber = parser._NextColumnNumber + parser._NextLineNumber = locator.getLineNumber() + parser._NextColumnNumber = locator.getColumnNumber() - 1 + + self._previous_event = event + + # --- ContentHandler Interface -------------------------------------------- + + @expat("ProcessingInstructionHandler") + def processingInstruction(self, target, data): + return target, data + + @expat("StartElementHandler") + def startElement(self, namespace, local_name, qname, attributes): + tag = self._qualify(local_name, qname, namespace) + attribs = {} + length = attributes.getLength() + for index in range(length): + local_name = attributes.getLocalName(index) + qname = attributes.getQName(index) + namespace = attributes.getURI(index) + name = self._qualify(local_name, qname, namespace) + value = attributes.getValue(index) + attribs[name] = value + return self._intern(tag), attribs + + @expat("EndElementHandler") + def endElement(self, namespace, local_name, qname): + return self._intern(self._qualify(local_name, qname, namespace)) + + @expat("CharacterDataHandler") + def characters(self, characters, start, length): + return self._char_slice_to_unicode(characters, start, length) + + @expat("DefaultHandlerExpand") + def characters(self, characters, start, length): + return self._char_slice_to_unicode(characters, start, length) + + @expat("DefaultHandler") + def characters(self, characters, start, length): + # TODO: make a helper function here + if self._entity["location"] == (self.parser.CurrentLineNumber, + self.parser.CurrentColumnNumber): + return "&%s;" % self._entity["name"] + else: + return self._char_slice_to_unicode(characters, start, length) + + @expat("StartNamespaceDeclHandler") + def startPrefixMapping(self, prefix, uri): + return prefix, uri + + @expat("EndNamespaceDeclHandler") + def endPrefixMapping(self, prefix): + return prefix + + def _empty_source(self, *args): + name, publicId, baseURI, systemId = args + source = InputSource() + byte_stream = ByteArrayInputStream(array([], "b")) + source.setByteStream(byte_stream) + source.setPublicId(publicId) + source.setSystemId(systemId) + return source + + @expat("ExternalEntityRefHandler", guard="not self.dtd", + returns="self._empty_source") + def resolveEntity(self, name, publicId, baseURI, systemId): + context = name # wrong. see expat headers documentation. + base = self.parser.GetBase() + return context, base, systemId, publicId + + @expat("DefaultHandlerExpand", guard="not self.dtd", + returns="self._empty_source") + def resolveEntity(self, name, publicId, baseURI, systemId): + return "&%s;" % name + + @expat("DefaultHandler", guard="not self.dtd", + returns="self._empty_source") + def resolveEntity(self, name, publicId, baseURI, systemId): + return "&%s;" % name + + @expat(force=True, returns="self._empty_source") + def resolveEntity(self, name, publicId, baseURI, systemId): + pass + + def setDocumentLocator(self, locator): + self._locator = locator + + def skippedEntity(self, name): + error = ExpatError() + error.lineno = self.ErrorLineNumber = self.parser._NextLineNumber + error.offset = self.ErrorColumnNumber = self.parser._NextColumnNumber + error.code = self.ErrorCode = None + message = "undefined entity &%s;: line %s, column %s" + message = message % (name, error.lineno, error.offset) + error.__init__(message) + raise error + + # --- LexicalHandler Interface -------------------------------------------- + + @expat("CommentHandler") + def comment(self, characters, start, length): + return self._char_slice_to_unicode(characters, start, length) + + @expat("StartCdataSectionHandler") + def startCDATA(self): + return () + + @expat("EndCdataSectionHandler") + def endCDATA(self): + return () + + @expat("StartDoctypeDeclHandler", force=True) + def startDTD(self, name, publicId, systemId): + self.dtd = True + has_internal_subset = 0 # don't know this ... + return name, systemId, publicId, has_internal_subset + + @expat("EndDoctypeDeclHandler", force=True) + def endDTD(self): + self.dtd = False + + def startEntity(self, name): + self._entity = {} + self._entity["location"] = (self.parser._NextLineNumber, + self.parser._NextColumnNumber) + self._entity["name"] = name + + def endEntity(self, name): + pass + + # --- DTDHandler Interface ------------------------------------------------ + + @expat("NotationDeclHandler") + def notationDecl(self, name, publicId, systemId): + base = self.parser.GetBase() + return name, base, systemId, publicId + + @expat("UnparsedEntityDeclHandler") # deprecated + def unparsedEntityDecl(self, name, publicId, systemId, notationName): + base = self.parser.GetBase() + return name, base, systemId, publicId, notationName + + # --- DeclHandler Interface ----------------------------------------------- + + @expat("AttlistDeclHandler") + def attributeDecl(self, eName, aName, type, mode, value): + # TODO: adapt mode, required, etc. + required = False + return eName, aName, type, value, required + + @expat("ElementDeclHandler") + def elementDecl(self, name, model): + return self._expat_content_model(name, model) + + @expat("EntityDeclHandler") + def externalEntityDecl(self, name, publicId, systemId): + base = self.parser.GetBase() + value = None + is_parameter_entity = None + notation_name = None + return (name, is_parameter_entity, value, base, systemId, publicId, + notation_name) + + @expat("EntityDeclHandler") + def internalEntityDecl(self, name, value): + base = self.parser.GetBase() + is_parameter_entity = None + notation_name = None + systemId, publicId = None, None + return (name, is_parameter_entity, value, base, systemId, publicId, + notation_name) + + +def _init_model(): + global model + model = types.ModuleType("pyexpat.model") + model.__doc__ = "Constants used to interpret content model information." + quantifiers = "NONE, OPT, REP, PLUS" + for i, quantifier in enumerate(quantifiers.split(", ")): + setattr(model, "XML_CQUANT_" + quantifier, i) + types_ = "EMPTY, ANY, MIXED, NAME, CHOICE, SEQ" + for i, type_ in enumerate(types_.split(", ")): + setattr(model, "XML_CTYPE_" + type_, i+1) + +_init_model(); del _init_model + + +class ExpatError(Exception): + pass + + +error = ExpatError + + +def _init_error_strings(): + global ErrorString + error_strings = ( + None, + "out of memory", + "syntax error", + "no element found", + "not well-formed (invalid token)", + "unclosed token", + "partial character", + "mismatched tag", + "duplicate attribute", + "junk after document element", + "illegal parameter entity reference", + "undefined entity", + "recursive entity reference", + "asynchronous entity", + "reference to invalid character number", + "reference to binary entity", + "reference to external entity in attribute", + "XML or text declaration not at start of entity", + "unknown encoding", + "encoding specified in XML declaration is incorrect", + "unclosed CDATA section", + "error in processing external entity reference", + "document is not standalone", + "unexpected parser state - please send a bug report", + "entity declared in parameter entity", + "requested feature requires XML_DTD support in Expat", + "cannot change setting once parsing has begun", + "unbound prefix", + "must not undeclare prefix", + "incomplete markup in parameter entity", + "XML declaration not well-formed", + "text declaration not well-formed", + "illegal character(s) in public id", + "parser suspended", + "parser not suspended", + "parsing aborted", + "parsing finished", + "cannot suspend in external parameter entity") + def ErrorString(code): + try: + return error_strings[code] + except IndexError: + return None + +_init_error_strings(); del _init_error_strings + + +def _init_errors(): + global errors + + errors = types.ModuleType("pyexpat.errors") + errors.__doc__ = "Constants used to describe error conditions." + + error_names = """ + XML_ERROR_NONE + XML_ERROR_NONE, + XML_ERROR_NO_MEMORY, + XML_ERROR_SYNTAX, + XML_ERROR_NO_ELEMENTS, + XML_ERROR_INVALID_TOKEN, + XML_ERROR_UNCLOSED_TOKEN, + XML_ERROR_PARTIAL_CHAR, + XML_ERROR_TAG_MISMATCH, + XML_ERROR_DUPLICATE_ATTRIBUTE, + XML_ERROR_JUNK_AFTER_DOC_ELEMENT, + XML_ERROR_PARAM_ENTITY_REF, + XML_ERROR_UNDEFINED_ENTITY, + XML_ERROR_RECURSIVE_ENTITY_REF, + XML_ERROR_ASYNC_ENTITY, + XML_ERROR_BAD_CHAR_REF, + XML_ERROR_BINARY_ENTITY_REF, + XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF, + XML_ERROR_MISPLACED_XML_PI, + XML_ERROR_UNKNOWN_ENCODING, + XML_ERROR_INCORRECT_ENCODING, + XML_ERROR_UNCLOSED_CDATA_SECTION, + XML_ERROR_EXTERNAL_ENTITY_HANDLING, + XML_ERROR_NOT_STANDALONE, + XML_ERROR_UNEXPECTED_STATE, + XML_ERROR_ENTITY_DECLARED_IN_PE, + XML_ERROR_FEATURE_REQUIRES_XML_DTD, + XML_ERROR_CANT_CHANGE_FEATURE_ONCE_PARSING, + XML_ERROR_UNBOUND_PREFIX, + XML_ERROR_UNDECLARING_PREFIX, + XML_ERROR_INCOMPLETE_PE, + XML_ERROR_XML_DECL, + XML_ERROR_TEXT_DECL, + XML_ERROR_PUBLICID, + XML_ERROR_SUSPENDED, + XML_ERROR_NOT_SUSPENDED, + XML_ERROR_ABORTED, + XML_ERROR_FINISHED, + XML_ERROR_SUSPEND_PE + """ + error_names = [name.strip() for name in error_names.split(',')] + for i, name in enumerate(error_names[1:]): + setattr(errors, name, ErrorString(i+1)) + +_init_errors(); del _init_errors This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |