# Copyright (c) 2001 Samuel Brauer. All Rights Reserved. NO WARRANTY.
# $Revision: 1.21 $
import cStringIO
import string
import sys
import types
import xml.sax
import xml.sax.handler
import xml.sax.saxutils
import xml.sax.saxlib
import xml.sax.xmlreader
import spb.xmlEncoding
import codecs
################################################################
# Some functions for dealing with xml files in various encodings.
################################################################
def getEncodingFromFilename(filename):
    """Return the encoding name detected from the first line of a file.

    Only the first line of 'filename' is read; it is handed to
    getEncodingFromString() and that function's result is returned.
    """
    handle = open(filename)
    try:
        firstLine = handle.readline()
    finally:
        # release the file even when the read fails
        handle.close()
    return getEncodingFromString(firstLine)
def getEncodingFromString(s):
    """Return the lower-cased encoding name for the start of an XML document.

    When 's' contains '?>' at an index greater than 4, the text up to and
    including that '?>' is passed to spb.xmlEncoding.autoDetectXMLEncoding()
    and the lower-cased result is returned, with 'utf_8' spelled 'utf-8'.
    Otherwise 'utf-8' is returned.
    """
    end = s.find('?>')
    if end <= 4:
        # no usable '?>' terminator: fall back to the default
        return 'utf-8'
    detected = spb.xmlEncoding.autoDetectXMLEncoding(s[:end + 2]).lower()
    if detected == 'utf_8':
        return 'utf-8'
    return detected
def getUTF8StringFromFilename(fullpath):
    """Return the content of an XML file as a utf-8 string.

    The encoding is determined with getEncodingFromFilename().  A file
    reported as 'utf-8' is read as-is; any other file is decoded with
    that encoding via codecs.open() and re-encoded as utf-8.

    If the result starts with '<?xml' and contains a '?>' terminator,
    every whitespace-separated part of the header that starts with
    'encoding=' is dropped and the remaining parts are re-joined with
    single spaces.
    """
    encoding = getEncodingFromFilename(fullpath)
    if encoding != 'utf-8':
        source = codecs.open(fullpath, 'r', encoding)
    else:
        source = open(fullpath)
    try:
        content = source.read()
    finally:
        source.close()
    if encoding != 'utf-8':
        # codecs.open() produced unicode; convert it to utf-8
        content = content.encode('utf-8')

    # if the document starts with an <?xml?> header, strip the encoding
    # from that header
    if not content.startswith('<?xml'):
        return content
    end = content.find('?>')
    if end <= 4:
        return content
    kept = [token for token in content[:end].split()
            if not token.startswith("encoding=")]
    kept.append('?>')
    return " ".join(kept) + content[end + 2:]
################################################################
# Some functions for coercing objects
################################################################
def objectToText(obj):
    """Coerce an object to text, returning a string (utf-8).

    None becomes the empty string, a plain string is returned unchanged,
    a unicode string is encoded as utf-8, and a list or tuple becomes the
    texts of its items separated by single spaces (no separator is added
    while the accumulated text is still empty).  Anything else is passed
    to str().
    """
    if obj == None:
        return ""
    if isinstance(obj, types.StringType):
        return obj
    if isinstance(obj, types.UnicodeType):
        return obj.encode('utf-8')
    if isinstance(obj, (types.ListType, types.TupleType)):
        text = ""
        for member in obj:
            if text:
                text = text + ' '
            text = text + objectToText(member)
        return text
    return str(obj)
def objectToSaxEvents(handler, obj, keyStyle="attribute"):
    """Generate SAX2 events on 'handler' that represent 'obj'.

    None produces no events.
    An object with a "toSaxEvents" attribute has that method called with
    the handler, and its result is returned.
    A dictionary is wrapped in an XmlableDict built with 'keyStyle'.
    A list or tuple produces one <item> element per member.
    Any other object is coerced with objectToText() and sent as characters.

    keyStyle should be either "attribute" or "element", and determines
    how dictionaries are serialized.
    """
    if obj == None:
        return
    if hasattr(obj, "toSaxEvents"):
        return obj.toSaxEvents(handler)
    if isinstance(obj, types.DictType):
        XmlableDict(obj, keyStyle).toSaxEvents(handler)
        return
    if isinstance(obj, (types.ListType, types.TupleType)):
        for member in obj:
            handler.startElementNS((None, 'item'), 'item', {})
            objectToSaxEvents(handler, member, keyStyle)
            handler.endElementNS((None, 'item'), 'item')
        return
    handler.characters(objectToText(obj))
class PrintHandler(xml.sax.saxlib.ContentHandler):
    """A handler that prints the XML as text to an output stream.

    Unicode text is encoded as utf-8 before it is written to 'out'.

    Namespace bookkeeping:
    'prefixMappings' maps each prefix to a stack (list) of URIs,
    'uriMappings' maps each URI to a stack (list) of prefixes, and
    'undeclaredPrefixes' lists the prefixes whose xmlns declaration
    still has to be written on the next start tag.
    """

    def __init__(self, out=sys.stdout, prefixMappings=None, uriMappings=None, undeclaredPrefixes=None):
        """Create a handler writing to 'out'.

        The three namespace containers may be supplied by the caller;
        when omitted (None) each instance gets fresh, empty ones.
        """
        # startPrefixMapping() mutates these containers, so they must not
        # be literal {} / [] defaults: those would be shared by every
        # handler constructed without explicit arguments.
        if prefixMappings is None:
            prefixMappings = {}
        if uriMappings is None:
            uriMappings = {}
        if undeclaredPrefixes is None:
            undeclaredPrefixes = []
        self.out = out
        self.prefixMappings = prefixMappings
        self.uriMappings = uriMappings
        self.undeclaredPrefixes = undeclaredPrefixes

    def writeCharacters(self, chars, escape=0):
        """Write 'chars' to the output, encoding unicode as utf-8.

        When 'escape' is true the text is passed through
        xml.sax.saxutils.escape() first.
        """
        if isinstance(chars, types.UnicodeType):
            chars = chars.encode('utf-8')
        if escape:
            self.out.write(xml.sax.saxutils.escape(chars))
        else:
            self.out.write(chars)

    def _writeName(self, uri, lname):
        """Write an element name, prefixed when its URI maps to a prefix."""
        if uri:
            prefix = self.uriMappings[uri][-1]
            # A default-namespace mapping has no prefix (None); the name
            # is then written bare instead of failing on the prefix.
            if prefix:
                self.writeCharacters(prefix)
                self.out.write(':')
        self.writeCharacters(lname)

    def startPrefixMapping(self, prefix, uri):
        """Push the prefix/URI pair and remember that it needs declaring."""
        mappings = self.prefixMappings.get(prefix, [])
        mappings.append(uri)
        self.prefixMappings[prefix] = mappings
        mappings = self.uriMappings.get(uri, [])
        mappings.append(prefix)
        self.uriMappings[uri] = mappings
        if prefix not in self.undeclaredPrefixes:
            self.undeclaredPrefixes.append(prefix)

    def endPrefixMapping(self, prefix):
        """Pop the most recent URI for 'prefix' and that URI's latest prefix."""
        mappings = self.prefixMappings.get(prefix)
        uri = mappings.pop()
        mappings = self.uriMappings.get(uri)
        mappings.pop()

    def startDocument(self):
        """Write the XML declaration."""
        self.out.write('<?xml version="1.0" encoding="utf-8"?>\n')

    def startElementNS(self, name, qname, attrs):
        """Write a start tag with pending xmlns declarations and attributes.

        'name' is a (uri, localname) pair; 'attrs' maps (uri, localname)
        pairs to values.  The 'qname' argument is not used.
        """
        uri = name[0]
        lname = name[1]
        self.out.write('<')
        self._writeName(uri, lname)
        attdict = {}
        for prefix in self.undeclaredPrefixes:
            if self.prefixMappings[prefix]:
                # an empty/None prefix declares the default namespace
                if prefix:
                    declaration = 'xmlns:' + prefix
                else:
                    declaration = 'xmlns'
                attdict[declaration] = self.prefixMappings[prefix][-1]
        self.undeclaredPrefixes = []
        for attname, attvalue in attrs.items():
            atturi = attname[0]
            attlname = attname[1]
            attqname = ''
            if atturi:
                prefix = self.uriMappings[atturi][-1]
                if prefix:
                    attqname = prefix + ':'
            attqname += attlname
            attdict[attqname] = attvalue
        for key in attdict.keys():
            self.writeCharacters(' %s' % key)
            self.out.write('="')
            # The value sits inside double quotes, so '"' must be escaped
            # in addition to the characters escape() handles by default.
            self.writeCharacters(xml.sax.saxutils.escape(attdict[key], {'"': '&quot;'}))
            self.out.write('"')
        self.out.write('>')

    def endElementNS(self, name, qname):
        """Write the end tag for the (uri, localname) pair 'name'."""
        self.out.write('</')
        self._writeName(name[0], name[1])
        self.out.write('>')

    def characters(self, content):
        """Write character data, escaped."""
        self.writeCharacters(content, 1)

    def ignorableWhitespace(self, content):
        """Write ignorable whitespace unescaped (unicode encoded as utf-8)."""
        self.writeCharacters(content)

    def processingInstruction(self, target, data):
        """Write a processing instruction as '<?target data?>'."""
        self.out.write('<?')
        self.writeCharacters(target)
        self.out.write(' ')
        self.writeCharacters(data)
        self.out.write('?>')

    def startElement(self, name, attrs):
        """Non-namespace start event; delegates to startElementNS()."""
        nsattrs = {}
        for key in attrs.keys():
            nsattrs[(None, key)] = attrs[key]
        self.startElementNS((None, name), name, nsattrs)

    def endElement(self, name):
        """Non-namespace end event; delegates to endElementNS()."""
        self.endElementNS((None, name), name)
class PrettyPrintHandler(PrintHandler):
    """A PrintHandler that puts each element tag on its own indented line.

    'indent' is the current nesting depth; one space is written per level.
    'newline' is set to 1 by startElementNS() and to 0 by endElementNS();
    while it is 1 a start tag is preceded by a line break, and while it
    is 0 an end tag is preceded by indentation.
    """

    def __init__(self, out=sys.stdout, prefixMappings=None, uriMappings=None, undeclaredPrefixes=None):
        """Create a handler writing to 'out'.

        The three namespace containers may be supplied by the caller;
        when omitted (None) each instance gets fresh, empty ones.
        """
        # Build fresh containers here rather than using literal {} / []
        # defaults: the base class mutates them, so shared defaults would
        # leak namespace state from one handler instance into the next.
        if prefixMappings is None:
            prefixMappings = {}
        if uriMappings is None:
            uriMappings = {}
        if undeclaredPrefixes is None:
            undeclaredPrefixes = []
        PrintHandler.__init__(self, out, prefixMappings, uriMappings, undeclaredPrefixes)
        self.indent = 0
        self.newline = 1

    def startElementNS(self, name, qname, attrs):
        """Write the start tag on an indented line and increase the depth."""
        if self.newline:
            self.out.write("\n")
        self.newline = 1
        self.out.write(" " * self.indent)
        PrintHandler.startElementNS(self, name, qname, attrs)
        self.indent += 1

    def endElementNS(self, name, qname):
        """Decrease the depth and write the end tag followed by a line break."""
        self.indent -= 1
        if not self.newline:
            self.out.write(" " * self.indent)
        PrintHandler.endElementNS(self, name, qname)
        self.out.write("\n")
        self.newline = 0
class FilterHandler(xml.sax.saxlib.ContentHandler):
    """Forward SAX events to another handler, suppressing some of them.

    'handler' is the handler that receives the events that pass the filter.
    'documentEvents' is a flag; startDocument() and endDocument() are forwarded only when it is true.
    'skipElements' is a nesting depth; startElementNS(), endElementNS() and ignorableWhitespace() are forwarded only while the current depth is greater than it.
    'skipCharacters' is a nesting depth; characters() is forwarded only while the current depth is greater than it.
    'processingInstructions' is a flag; processingInstruction() is forwarded only when it is true.

    Prefix-mapping events are always forwarded.
    """

    def __init__(self, handler, documentEvents, skipElements, skipCharacters, processingInstructions):
        # current element nesting depth, maintained by start/endElementNS
        self.elements = 0
        self.handler = handler
        self.documentEvents = documentEvents
        self.skipElements = skipElements
        self.skipCharacters = skipCharacters
        self.processingInstructions = processingInstructions

    def startPrefixMapping(self, prefix, uri):
        self.handler.startPrefixMapping(prefix, uri)

    def endPrefixMapping(self, prefix):
        self.handler.endPrefixMapping(prefix)

    def startDocument(self):
        if self.documentEvents:
            self.handler.startDocument()

    def endDocument(self):
        if self.documentEvents:
            self.handler.endDocument()

    def startElementNS(self, name, qname, attrs):
        # the depth is raised before the test, so the element counts itself
        self.elements = self.elements + 1
        if self.elements > self.skipElements:
            self.handler.startElementNS(name, qname, attrs)

    def endElementNS(self, name, qname):
        # tested at the same depth the matching start event was tested at
        if self.elements > self.skipElements:
            self.handler.endElementNS(name, qname)
        self.elements = self.elements - 1

    def characters(self, content):
        if self.elements > self.skipCharacters:
            self.handler.characters(content)

    def ignorableWhitespace(self, content):
        if self.elements > self.skipElements:
            self.handler.ignorableWhitespace(content)

    def processingInstruction(self, target, data):
        if self.processingInstructions:
            self.handler.processingInstruction(target, data)

    def startElement(self, name, attrs):
        # non-namespace event: re-key the attributes as (None, name) pairs
        qualified = {}
        for attname in attrs.keys():
            qualified[(None, attname)] = attrs[attname]
        self.startElementNS((None, name), name, qualified)

    def endElement(self, name):
        self.endElementNS((None, name), name)
class XmlableText:
    """Text that can emit itself as SAX events in one of several styles.

    'br'    -- each line is followed by an empty <br> element
    'brbr'  -- chunks separated by blank lines, with two <br> elements between chunks
    'p'     -- each blank-line-separated chunk is wrapped in <p>
    'ul', 'ol' -- each blank-line-separated chunk is an <li> inside the list element
    'xhtml', 'xml' -- the text is parsed as markup if it is well formed, otherwise sent as plain characters

    Line endings are normalized to "\\n" by the constructor.
    """

    def __init__(self, text, style='xml'):
        if style not in ('br', 'brbr', 'p', 'xhtml', 'xml', 'ul', 'ol'):
            raise RuntimeError("XmlableText constructor passed unrecognized value for 'style'.")
        if text:
            text = text.replace("\r\n", "\n").replace("\r", "\n")
        else:
            text = ""
        self.text = text
        self.style = style

    def _emitBreak(self, handler):
        # an empty <br> element
        handler.startElementNS((None, 'br'), 'br', {})
        handler.endElementNS((None, 'br'), 'br')

    def _emitWrapped(self, handler, tag, content):
        # <tag>content</tag>
        handler.startElementNS((None, tag), tag, {})
        handler.characters(content)
        handler.endElementNS((None, tag), tag)

    def toSaxEvents(self, handler):
        """Send the text to 'handler' as SAX events according to the style."""
        style = self.style
        if style == 'br':
            for line in self.text.split("\n"):
                handler.characters(line)
                self._emitBreak(handler)
        elif style == 'brbr':
            chunks = self.text.split("\n\n")
            # split() always yields at least one chunk; breaks go between chunks only
            handler.characters(chunks[0])
            for chunk in chunks[1:]:
                self._emitBreak(handler)
                self._emitBreak(handler)
                handler.characters(chunk)
        elif style == 'p':
            for chunk in self.text.split("\n\n"):
                self._emitWrapped(handler, 'p', chunk)
        elif style in ('ul', 'ol'):
            handler.startElementNS((None, style), style, {})
            for chunk in self.text.split("\n\n"):
                self._emitWrapped(handler, 'li', chunk)
            handler.endElementNS((None, style), style)
        elif style in ('xhtml', 'xml'):
            body = self.text
            if isinstance(body, types.UnicodeType):
                body = body.encode('utf-8')
            source = '<temp>' + body + '</temp>'
            if not isWellFormed(source):
                # not parseable as markup: fall back to plain characters
                handler.characters(self.text)
            else:
                # parse the text using the handler, but filter which events
                # reach it (skip document events and the events for the
                # wrapping <temp/> element)
                parseString(source, FilterHandler(handler, 0, 1, 0, 0), 1)
class XmlableDict:
    """A dictionary wrapper that can emit itself as SAX events.

    With keyStyle 'element' every key becomes an element named str(key).
    With keyStyle 'attribute' every entry becomes an <item> element whose
    'key' attribute holds str(key).  In both cases the entry's value is
    emitted inside the element with objectToSaxEvents().
    """

    def __init__(self, dict, keyStyle='attribute'):
        if keyStyle not in ('attribute', 'element'):
            raise RuntimeError("XmlableDict constructor passed unrecognized value for 'keyStyle'.")
        self.dict = dict
        self.style = keyStyle

    def toSaxEvents(self, handler):
        """Send one element per dictionary entry to 'handler'."""
        for key in self.dict.keys():
            label = str(key)
            if self.style == 'element':
                tag = label
                attributes = {}
            else:
                tag = 'item'
                attributes = {(None, 'key'): label}
            handler.startElementNS((None, tag), tag, attributes)
            objectToSaxEvents(handler, self.dict[key], self.style)
            handler.endElementNS((None, tag), tag)
def parseStream(input, handler=None, namespaces=1):
    """Parse XML from the byte stream 'input' with a SAX parser.

    'handler', when given, is installed as the parser's content handler.
    'namespaces' is the value set for the SAX namespaces feature.
    The input source's encoding is set to 'utf-8'.
    """
    source = xml.sax.xmlreader.InputSource()
    source.setByteStream(input)
    source.setEncoding('utf-8')

    reader = xml.sax.make_parser()
    reader.setFeature(xml.sax.handler.feature_namespaces, namespaces)
    if handler:
        reader.setContentHandler(handler)
    reader.parse(source)
def parseString(input, handler=None, namespaces=1):
    """Parse XML held in the string 'input'.

    The string is copied into an in-memory buffer which is handed to
    parseStream() together with 'handler' and 'namespaces'.
    """
    memoryFile = cStringIO.StringIO()
    memoryFile.write(input)
    # rewind so the parser reads from the beginning
    memoryFile.seek(0)
    try:
        parseStream(memoryFile, handler, namespaces)
    finally:
        memoryFile.close()
def isWellFormed(val):
    """Return 1 if 'val' parses with parseString(), otherwise 0.

    Any Exception raised while parsing counts as "not well formed".
    KeyboardInterrupt and SystemExit are deliberately not swallowed, so
    an interrupt or interpreter shutdown during parsing still propagates.
    """
    try:
        parseString(val)
    except Exception:
        return 0
    return 1
|