2004-08-24 09:40:12 UTC
Hello,
I used your code and made some improvements and corrections to it.
The code follows below, in case the mail does not display it correctly, and for others who may be interested in it.
Your code was designed for Python 1.5.2; my modifications require at least version 2.x of the Python virtual machine.
I hope this properly improves on what you have done.
Thanks for the initial work.
Jerrykhan .
-----8<---snip---snip-----
""" Copyright (C) 2003 Peter Ohler
XMLite is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.
XMLite is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You can download a copy of the GNU General Public License at
http://www.gnu.org/licenses/gpl.txt or obtain a copy of the GNU General
Public License by writing to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA.
XMLite - extremely light weight XML parser and printer
The xmlite module is an extremely light weight XML parser and printer. It
does not use the DOM or SAX interfaces but instead works with a simple
list or rather nested lists to represent an XML document. The parser takes
as input a string or filename and returns a list with all the elements of
the XML file.
The first item in the top level XML list is a dict object with 'version',
'encoding', and 'standalone' keys. If there are any decl tags such as
'DOCTYPE' they will be next in the list and will be tuples with the decl
tag name and the value of the tag as the second item of the tuple.
Comments are included as lists of two items. The first item is None and
the second is a string which is the comment text.
CDATA are tuples of two items. The first item is 'CDATA' and the second is
the CDATA content.
XML elements are lists. The first item in the list is the element tag or
name. The second item is a dict object which includes all the attributes of
the element. Any remaining list items are either comments, strings, CDATA,
or more elements as lists.
Author: Peter Ohler,
peter@ohler.com
$Id: xmlite.py,v 1.3 2003/10/07 00:46:48 ohler Exp $
JVA: 2004-08-22 Jerome VACHER: toStr modifications and corrections (exception arguments, and special-character escaping)
"""
import os
import sys
import string
class XmlException(Exception):
    """ XML Exception for reporting errors in parsing of an XML file or
    string.

    Attributes:
      msg  - the error message.
      line - 1-based line number of the error, or -1 when no source string
             was supplied.
      char - 1-based column of the error within that line, or -1 when no
             source string was supplied.
    """
    def __init__(self, msg, s, pos):
        """ Pass in the error message, string being parsed, and the position
        in the string where the error was detected. Pass None as the string
        when no position information is available.
        """
        Exception.__init__(self, msg)
        self.msg = msg
        if s is None:
            self.line = -1
            self.char = -1
        else:
            # Count '\n' rather than os.linesep: a file read in text mode has
            # its line endings translated to '\n', so a two-character
            # os.linesep would never be found and the line would always be 1.
            self.line = 1 + s.count('\n', 0, pos)
            # rfind() returns -1 when pos is on the first line, which makes
            # the column 1-based on every line, not just on the later ones.
            self.char = pos - s.rfind('\n', 0, pos)

    def __str__(self):
        if self.line < 0:
            return self.msg
        return "%s at %d of line %d" % (self.msg, self.char, self.line)
def printXml(xml, indent = 0):
""" Print out a list that matches the expected XML list format. Other
formats may not print out correctly. The output format is XML.
"""
istr = ' ' * indent
if isinstance(xml, str):
print "%s%s" % (istr, expandCodedChars(xml))
elif isinstance(xml, tuple):
if 'CDATA' == xml[0]:
print "%s<![CDATA[%s]]>" % (istr, xml[1])
else:
print "%s<!%s %s>" % (istr, xml[0], xml[1])
elif isinstance(xml, list):
tag = xml[0]
if tag == None:
print "%s<!-- %s -->" % (istr, xml[1])
return
elif isinstance(tag, dict): # the very top of the xml
print "<?xml",
for k in tag:
v = tag[k]
if v != None:
print '%s="%s"' % (k, v),
print "?>"
indent += 2
for e in xml[1:]:
printXml(e, indent)
return
n = len(xml)
if n == 1:
print "%s<%s/>" % (istr, tag)
elif n == 2:
attrs = xml[1]
if attrs == None:
print "%s<%s/>" % (istr, tag)
else:
print "%s<%s" % (istr, tag)
printAttrs(xml[1], indent + 3)
print "/>"
else:
attrs = xml[1]
if attrs == None:
print "%s<%s>" % (istr, tag)
else:
print "%s<%s" % (istr, tag)
printAttrs(xml[1], indent + 3)
print ">"
indent += 2
for e in xml[2:]:
printXml(e, indent)
print "%s</%s>" % (istr, tag)
else:
raise XmlException("Invalid format", None, 0)
def printAttrs(attrs, indent):
if not isinstance(attrs, dict):
if attrs == None:
return
raise XmlException("Invalid format", s, 0)
istr = ' ' * indent
n = len(attrs)
for a in attrs:
n -= 1
# Strings are expanded and special characters are replaces with
# character codes.
if 0 < n:
print '%s%s="%s"' % (istr, a, expandCodedChars(attrs[a]))
else:
print '%s%s="%s"' % (istr, a, expandCodedChars(attrs[a])),
def toStr(xml, indent = 0, ind_sz=2, line_sz=0, ret_attr=True):
    """ Return a string that is an XML document (or a fragment of one).

    xml      - None, a text string, a ('CDATA', text) or (decl-name, value)
               tuple, or a list: [None, comment], [prolog-dict, item, ...]
               or [tag, attrs-or-None, child, ...].
    indent   - number of spaces written in front of this item.
    ind_sz   - number of spaces added per nesting level.
    line_sz  - 0 disables it; otherwise a text string no longer than line_sz
               is returned without indentation or newline, and an element
               with exactly one child whose rendering is no longer than
               line_sz is written on one line, e.g. <BASE>small text</BASE>.
    ret_attr - True puts every attribute on its own line, False keeps all
               attributes on the line of the tag.

    Text is escaped with expandCodedChars. A leading prolog dict is written
    as <?xml ...?> followed by every remaining top level item.

    Raises XmlException("Invalid format") for any other type.
    """
    istr = ' ' * indent
    if xml is None:
        return ""
    if isinstance(xml, str):
        # Escape in both cases; long text used to be returned unescaped.
        text = expandCodedChars(xml)
        if line_sz and len(xml) <= line_sz:
            return text
        return "%s%s\n" % (istr, text)
    if isinstance(xml, tuple):
        # CDATA and declarations, in the same form printXml writes them.
        if 'CDATA' == xml[0]:
            return "%s<![CDATA[%s]]>\n" % (istr, xml[1])
        return "%s<!%s %s>\n" % (istr, xml[0], xml[1])
    if not isinstance(xml, list):
        raise XmlException("Invalid format", str(xml), 0)
    # JVA base with prolog <? xml ...?>
    if isinstance(xml[0], dict):
        prolog = xml[0]
        # The XML declaration requires version, encoding, standalone in that
        # order; plain dict iteration order is arbitrary.
        keys = [k for k in ('version', 'encoding', 'standalone') if k in prolog]
        keys += [k for k in prolog if k not in keys]
        attrs = ['%s="%s"' % (k, prolog[k]) for k in keys if prolog[k] is not None]
        pro = "<?xml " + " ".join(attrs) + "?>\n"
        # Declarations and comments may sit between the prolog and the root
        # element, so serialize every remaining item, not just xml[1].
        rest = [toStr(e, indent, ind_sz, line_sz, ret_attr) for e in xml[1:]]
        return pro + "".join(rest)
    tag = xml[0]
    if tag is None: # no tag => the list is a comment: [None, text]
        return "%s<!-- %s -->\n" % (istr, xml[1])
    n = len(xml)
    attrs = None
    if n > 1:
        attrs = xml[1]
    if attrs is None:
        ch = "%s<%s" % (istr, tag)
    else:
        ch = "%s<%s%s" % (istr, tag, ret_attr and '\n' or '')
        ch += attrsToStr(attrs, indent + ind_sz, ret_attr)
    if n <= 2:
        # no children: self-closing tag
        return ch + "/>\n"
    ch += ">"
    indent += ind_sz
    cht = "".join([toStr(e, indent, ind_sz, line_sz, ret_attr) for e in xml[2:]])
    che = "</%s>\n" % (tag)
    if line_sz and len(cht) <= line_sz and n == 3:
        return "%s%s%s" % (ch, cht, che)
    return "%s\n%s%s%s" % (ch, cht, istr, che)
def attrsToStr(attrs, indent, ret_attr=True):
    """ Return the attributes of an element as a string of name="value"
    pairs, with the values escaped by expandCodedChars.

    attrs    - dict of attribute name to value, or None for no attributes.
    indent   - number of spaces in front of each attribute when ret_attr is
               true.
    ret_attr - True separates the attributes with newlines (no newline after
               the last one); False puts one space in front of each
               attribute and keeps them on one line.

    Raises XmlException("Invalid format") if attrs is neither a dict nor
    None.
    """
    if not isinstance(attrs, dict):
        if attrs is None:
            return ""
        # XmlException expects a string to locate the error in; handing it
        # the offending object itself would fail inside the exception.
        raise XmlException("Invalid format", str(attrs), 0)
    if ret_attr:
        # An indent of 0 still yields one space, as it always has.
        istr = ' ' * indent or " "
        sep = '\n'
    else:
        istr = " "
        sep = ''
    parts = ['%s%s="%s"' % (istr, a, expandCodedChars(attrs[a])) for a in attrs]
    return sep.join(parts)
def load(filename):
    """ Load complete file into memory and then parse the string.

    filename - path of the XML file to read.

    Returns the XML list built by parse(). Errors from open() and read()
    are not caught here.
    """
    f = open(filename, "r")
    # open() raises on failure instead of returning None, so no None check
    # is needed; try/finally makes sure the file is closed even when the
    # read fails.
    try:
        s = f.read()
    finally:
        f.close()
    return parse(s)
def parse(s):
    """ Parse the XML text s in one pass and return it as an XML list.

    Every top level item (prolog, declarations, comments, elements) is
    appended to the returned list in document order. Raises XmlException
    on malformed input; reading past the end of s inside an item shows up
    as an IndexError.
    """
    BEFORE_PROLOG, AFTER_PROLOG, AFTER_ROOT = 0, 1, 2
    state = BEFORE_PROLOG
    doc = []
    pos = 0
    size = len(s)
    while True:
        # skip whitespace between top level items; stop at the end of input
        while pos < size and s[pos] in string.whitespace:
            pos += 1
        if pos >= size:
            break
        # every element at the top level starts with '<'
        if s[pos] != '<':
            raise XmlException("Expected a '<' character", s, pos)
        pos += 1
        marker = s[pos]
        if marker == '?': # prolog
            if state != BEFORE_PROLOG:
                raise XmlException("Prolog must be the first element", s, pos)
            pos = readProlog(s, pos + 1, doc)
            state = AFTER_PROLOG
        elif marker == '!': # comment or decl
            pos += 1
            if s[pos:pos + 2] == '--':
                pos = readComment(s, pos + 2, doc)
            elif state > AFTER_PROLOG:
                raise XmlException("DECLs must appear before other element", s, pos)
            else:
                pos = readDecl(s, pos, doc)
                state = AFTER_PROLOG
        else: # element
            pos = readElement(s, pos, doc)
            state = AFTER_ROOT
    return doc
def readProlog(s, i, x):
    """ Read the XML prolog whose 'xml' keyword starts at s[i] (just past
    '<?') and append a dict with the 'version', 'encoding' and 'standalone'
    values to x. Attributes that are absent are stored as None.

    Returns the index just past the closing '?>'. Raises XmlException on a
    malformed prolog or an unknown attribute.
    """
    values = {'version': None, 'encoding': None, 'standalone': None}
    if 'xml' != s[i:i + 3]:
        raise XmlException("Expected 'xml' in prolog", s, i)
    i += 3
    while '?' != s[i]:
        token, i = readNameToken(s, i)
        while s[i] in string.whitespace:
            i += 1
        c = s[i]
        if '=' == c:
            i += 1
            if token not in values:
                # token is None when '=' has no name in front of it; %-format
                # the message so that case is reported too instead of failing
                # on string concatenation.
                raise XmlException("Invalid prolog attribute: '%s'" % (token,), s, i)
            values[token], i = readQuotedValue(s, i)
        elif '?' == c:
            break
        else:
            raise XmlException("Expected '=' or '?' in prolog", s, i)
    i += 1 # past ?
    if '>' != s[i]:
        raise XmlException("Expected '>' after '?' in prolog", s, i)
    i += 1
    x.append(values)
    return i
# characters that end a name token
nonNameStr = " \t\n\r?=/><\x0b\x0c"
def readNameToken(s, i):
    """ Skip whitespace from s[i] on, then read a name up to the next
    character found in nonNameStr.

    Returns (name, index just past the name), or (None, index) when no name
    character is found at that position.
    """
    begin = i
    while s[begin] in string.whitespace:
        begin += 1
    stop = begin
    while s[stop] not in nonNameStr:
        stop += 1
    if stop == begin:
        return None, stop
    return s[begin:stop], stop
def readQuotedValue(s, i):
    """ Skip whitespace from s[i] on and read a quoted value. The value may
    be delimited by double quotes or by single quotes, as XML allows both,
    and it ends at the next occurrence of its opening quote.

    Returns (value, index just past the closing quote). Coded characters in
    the value are replaced; an empty value is returned as None. Raises
    XmlException when no opening quote is found.
    """
    while s[i] in string.whitespace:
        i += 1
    quote = s[i]
    if quote != '"' and quote != "'":
        raise XmlException("Expected '\"' character", s, i)
    i += 1
    start = i
    while quote != s[i]:
        i += 1
    if start == i:
        return None, i + 1
    return replaceCodedChars(s[start:i]), i + 1
def readComment(s, i, x):
    """ Read a comment whose text starts at s[i] (just past '<!--') and
    append it to x as [None, text], with the text stripped of surrounding
    whitespace.

    Returns the index just past the closing '-->'. Raises XmlException when
    the comment is not terminated.
    """
    close = s.find('-->', i)
    if close < 0:
        raise XmlException("Comment not terminated", s, i)
    body = s[i:close]
    x.append([None, body.strip()])
    return close + 3
def readDecl(s, i, x):
    """ Read a declaration such as DOCTYPE whose name starts at s[i] (just
    past '<!') and append it to x as a (name, value) tuple. The value is
    everything after the name up to the '>' that closes the declaration;
    nested '<' ... '>' pairs inside the value are kept.

    Returns the index just past the closing '>'.
    """
    name, i = readNameToken(s, i)
    while s[i] in string.whitespace:
        i += 1
    start = i
    # one '<' (the one opening the declaration) is already open
    unclosed = 1
    while unclosed:
        ch = s[i]
        if ch == '<':
            unclosed += 1
        elif ch == '>':
            unclosed -= 1
        i += 1
    # i is now one past the closing '>'
    x.append((name, s[start:i - 1]))
    return i
def readElement(s, i, x):
    """ Read the element whose name starts at s[i] (just past '<'), with
    its attributes and all of its children, and append it to x.

    The element is the list [name, attrs, child, ...], where attrs is a
    dict of the attributes or None when there are none. Children are text
    strings, comments, CDATA tuples and nested element lists.

    Returns the index just past the end of the element. Raises XmlException
    on malformed input.
    """
    name, i = readNameToken(s, i)
    element = [name, None]
    while s[i] in string.whitespace:
        i += 1
    if '/' == s[i]:
        i += 1
        if '>' == s[i]: # empty element, no attributes and no children
            x.append(element)
            return i + 1
        raise XmlException("Expected '>' after '/'", s, i)
    # read attribute names until the close (/ or >) is reached
    attrs = None
    while 1:
        name, i = readNameToken(s, i)
        while s[i] in string.whitespace:
            i += 1
        c = s[i]
        i += 1
        if '=' == c:
            while s[i] in string.whitespace:
                i += 1
            value, i = readQuotedValue(s, i)
            if attrs is None:
                # the dict is only created once the first attribute is seen
                attrs = { name : value }
                element[1] = attrs
            else:
                attrs[name] = value
        elif '/' == c:
            if '>' != s[i]:
                raise XmlException("Expected '>' after '/'", s, i)
            # no children
            i += 1
            x.append(element)
            return i
        elif '>' == c:
            break
        else:
            raise XmlException("Format error", s, i)
    # read children
    while 1:
        while s[i] in string.whitespace:
            i += 1
        if '<' == s[i]:
            i += 1
            c = s[i]
            if '!' == c: # better be a comment or CDATA
                i += 1
                if '--' == s[i:i + 2]:
                    i = readComment(s, i + 2, element)
                elif '[CDATA[' == s[i:i + 7]:
                    # skip the whole 7 character '[CDATA[' marker; skipping
                    # only 1 left 'CDATA[' at the front of the content
                    i = readCData(s, i + 7, element)
                else:
                    raise XmlException("Comment format error", s, i)
            elif '/' == c: # end of element
                i += 1
                name, i = readNameToken(s, i)
                while s[i] in string.whitespace:
                    i += 1
                if '>' != s[i]:
                    raise XmlException("Expected '>' to close element end tag", s, i)
                if name != element[0]:
                    raise XmlException("Element end tag name mismatch", s, i)
                i += 1
                break
            else: # read sub element
                i = readElement(s, i, element)
        else:
            i = readText(s, i, element)
    x.append(element)
    return i
def readCData(s, i, x):
    """ Read CDATA content starting at s[i] and append it to x as the tuple
    ('CDATA', content). The content runs up to the next ']]>' and is kept
    exactly as found.

    Returns the index just past the closing ']]>'. Raises XmlException when
    the CDATA is not terminated.
    """
    close = s.find(']]>', i)
    if close < 0:
        raise XmlException("No CDATA closure", s, i)
    content = s[i:close]
    x.append(('CDATA', content))
    return close + 3
def readText(s, i, x):
    """ Read text starting at s[i] up to the next '<' and append it to x,
    stripped of surrounding whitespace and with its coded characters
    replaced.

    Returns the index of that '<'. Raises XmlException when no '<' follows.
    """
    stop = s.find('<', i)
    if stop < 0:
        raise XmlException("No text closure", s, i)
    raw = s[i:stop]
    x.append(replaceCodedChars(raw.strip()))
    return stop
def replaceCodedChars(text):
    """ Return text with every coded character ('&...;') replaced by the
    character readCodedChar decodes it to. Text without any '&' is returned
    unchanged.
    """
    if '&' not in text:
        return text
    pieces = []
    cursor = 0
    amp = text.find('&')
    while amp >= 0:
        decoded, after = readCodedChar(text, amp)
        # keep the plain text in front of the '&', then the decoded character
        pieces.append(text[cursor:amp])
        pieces.append(decoded)
        cursor = after
        amp = text.find('&', cursor)
    pieces.append(text[cursor:])
    return "".join(pieces)
def readCodedChar(s, i):
    """ Decode the coded character that starts with the '&' at s[i].

    Handled are decimal ('&#65;') and hexadecimal ('&#x41;') character
    references and the names nbsp, lt, gt, amp, quot and apos.

    Returns (character, index just past the ';'). Raises XmlException when
    the ';' is missing or the name is unknown; a malformed number raises
    the ValueError of int().
    """
    # The longest references handled, '&#x10FFFF;' and '&#1114111;', have
    # their ';' at i + 9. The former window of i + 6 rejected valid
    # references such as '&#8364;'.
    end = s.find(';', i, i + 10)
    if 0 > end:
        raise XmlException("Invalid coded character. Not terminated by ';'", None, -1)
    code = s[i + 1:end]
    if code[:1] == '#':
        if code[1:2] in ('x', 'X'):
            value = int(code[2:], 16)
        else:
            value = int(code[1:])
        if value < 256:
            c = chr(value)
        else:
            try:
                c = unichr(value) # Python 2: chr() stops at 255
            except NameError:
                c = chr(value) # no unichr: chr() covers all of Unicode
    else:
        named = {
            'nbsp': ' ',
            'lt': '<',
            'gt': '>',
            'amp': '&',
            'quot': '"',
            'apos': "'",
        }
        if code not in named:
            raise XmlException("Invalid coded character '%s'" % code, None, -1)
        c = named[code]
    return c, end + 1
def expandCodedChars(s):
    """ Return s with the characters that are special in XML replaced by
    their coded form: & < > " ' become &amp; &lt; &gt; &quot; &apos;.
    None is returned as an empty string.
    """
    # NOTE(review): the original also had a replacement for spaces/nbsp at
    # this point, but as received it replaced a space with a space and did
    # nothing - confirm against the upstream xmlite.py whether an nbsp
    # substitution is wanted.
    # todo handle nbsp correctly, needs read adjustment for strip
    if s is None:
        return ""
    # '&' has to be escaped first, otherwise the '&' of the codes inserted
    # for the other characters would be escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('"', '&quot;')
    s = s.replace("'", '&apos;')
    return s
----->8---snip---snip-----