########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/Lib/HtmlPrinter.py,v 1.13 2005/02/09 09:12:06 mbrown Exp $
"""
This module supports document serialization in HTML syntax.
Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""
import re
from Ft.Xml import EMPTY_NAMESPACE
import cStreamWriter
from XmlPrinter import XmlPrinter
class HtmlPrinter(XmlPrinter):
    """
    An HtmlPrinter instance provides functions for serializing an XML
    or XML-like document to a stream, based on SAX-like event calls
    initiated by an instance of Ft.Xml.Lib.Print.PrintVisitor.

    The methods in this subclass of XmlPrinter attempt to emit a
    document conformant to the HTML 4.01 syntax, with no extra
    whitespace added for visual formatting. The degree of correctness
    of the output depends on the data supplied in the event calls; no
    checks are done for conditions that would result in syntax errors,
    such as two attributes with the same name, "--" in a comment, etc.

    Elements and attributes whose namespace is not EMPTY_NAMESPACE are
    not given HTML treatment; those events are handed to the XmlPrinter
    implementation unchanged.
    """
def __init__(self, stream, encoding):
    """
    Sets up a printer that emits HTML.

    stream is a file-like object opened for writing binary data;
    encoding names the character encoding used when writing to it.
    Both are passed to XmlPrinter.__init__ unchanged.
    """
    XmlPrinter.__init__(self, stream, encoding)
    # Counter, not a flag: it is incremented on each SCRIPT/STYLE start
    # tag and decremented on the matching end tag.  Zero means that
    # character data is escaped normally.
    self.disableOutputEscaping = 0
def startDocument(self, version='4.0', standalone=None):
    """
    Handles a startDocument event.

    Unlike the overridden method, nothing is written to the stream
    (HTML has no XML declaration).  The call only selects the entity
    maps used for escaping text and attribute values.

    version is the HTML version whose entity set should be used;
    any value missing from _versionedEntities is treated as '4.0'.
    standalone is accepted for interface compatibility and ignored.
    """
    known_versions = self._versionedEntities
    if version in known_versions:
        entity_maps = known_versions[version]
    else:
        # Unrecognized version: fall back to the HTML 4.0 entity set.
        entity_maps = known_versions['4.0']

    # Each entry holds, in order: the text map, the map for attribute
    # values delimited by '"', and the map for those delimited by "'".
    (self.textEntities,
     self.attrEntitiesQuot,
     self.attrEntitiesApos) = entity_maps
    return
def doctype(self, name, publicId, systemId):
    """
    Handles a doctype event.

    HTML, unlike XML, permits a document type declaration that has
    a public identifier but no system identifier.  That one case is
    written here; every other combination is delegated to
    XmlPrinter.doctype.
    """
    if not publicId or systemId:
        # Either form is one XML can express; let the base class do it.
        XmlPrinter.doctype(self, name, publicId, systemId)
        return

    # publicId only: <!DOCTYPE name PUBLIC "publicId">
    self.writeAscii('<!DOCTYPE ')
    self.writeEncode(name, 'document type name')
    self.writeAscii(' PUBLIC "')
    self.writeEncode(publicId, 'document type public-id')
    self.writeAscii('">\n')
    return
def startElement(self, namespaceUri, tagName, namespaces, attributes):
    """
    Handles a startElement event.

    The start tag itself is always written by XmlPrinter.startElement.
    For elements in no namespace (i.e. HTML elements) two things are
    added: output escaping is switched off for the content of SCRIPT
    and STYLE, and the tag is closed immediately so that it can never
    be written in minimized form.
    """
    is_html = namespaceUri is EMPTY_NAMESPACE

    # Must be counted before the tag is written, and balanced by the
    # decrement performed in endElement.
    if is_html and tagName.lower() in self.noEscapeElements:
        self.disableOutputEscaping += 1

    XmlPrinter.startElement(self, namespaceUri, tagName, namespaces,
                            attributes)

    if is_html:
        # HTML tags are never in minimized form ('<tag/>')
        self.writeAscii('>')
        self._inElement = False
    return
def endElement(self, namespaceUri, tagName):
    """
    Handles an endElement event.

    Elements in a namespace are delegated to XmlPrinter.endElement.
    For HTML elements an end tag is written unless the element is
    listed in forbiddenEndElements (BR, IMG, ...), and normal output
    escaping is restored when a SCRIPT or STYLE element is closed.
    """
    if namespaceUri is EMPTY_NAMESPACE:
        lowered = tagName.lower()

        # The tag is written with its original case; only the lookup
        # is case-insensitive.
        if lowered not in self.forbiddenEndElements:
            self.writeAscii('</')
            self.writeEncode(tagName, 'element name')
            self.writeAscii('>')

        # Undo the increment made by startElement for this element.
        if lowered in self.noEscapeElements:
            self.disableOutputEscaping -= 1
    else:
        XmlPrinter.endElement(self, namespaceUri, tagName)
    return
def attribute(self, elementUri, elementName, name, value):
    """
    Handles an attribute event.

    Attributes of elements in a namespace are delegated to
    XmlPrinter.attribute untouched.  For HTML elements:

      - a boolean attribute whose value equals its own name
        (compared case-insensitively) is written in minimized form,
        i.e. as the bare attribute name;
      - the value of a URI attribute has its non-ASCII characters
        replaced by %HH escapes of their UTF-8 bytes before being
        written by XmlPrinter.attribute;
      - anything else is written by XmlPrinter.attribute as is.
    """
    if elementUri is EMPTY_NAMESPACE:
        html_element = elementName.lower()
        html_attr = name.lower()

        minimizable_on = self.booleanAttributes.get(html_attr, [])
        if html_element in minimizable_on and html_attr == value.lower():
            # A boolean attribute, just write out the name
            self.writeAscii(' ')
            self.writeEncode(name, 'attribute name')
            return

        if html_element in self.uriAttributes.get(html_attr, []):
            # From HTML 4.0 Section B.2.1
            # We recommend that user agents adopt the following
            # convention for handling non-ASCII characters:
            # 1. Represent each character in UTF-8 (see [RFC2279]) as
            #    one or more bytes.
            # 2. Escape these bytes with the URI escaping mechanism
            #    (i.e., by converting each byte to %HH, where HH is the
            #    hexadecimal notation of the byte value).
            # (Although this recommendation is for HTML user agents
            # that encounter HTML with improperly escaped URI refs,
            # we implement it in order to comply with XSLT's html
            # output method, and because there's no compelling reason
            # not to do it for non-XSLT serializations as well)
            #
            # FIXME:
            # "&" should not be escaped in an attribute value when it
            # is followed by "{" (see Section B.7.1 of HTML 4.0).
            def percent_escape(match):
                # One matched byte -> '%HH'
                return '%%%02X' % ord(match.group())

            utf8_value = value.encode('UTF-8')
            value = unicode(re.sub('[\x80-\xff]', percent_escape,
                                   utf8_value))

    XmlPrinter.attribute(self, elementUri, elementName, name, value)
    return
def text(self, data, disableEscaping=0):
    """
    Handles a text event.

    Character data is written by XmlPrinter.text.  Escaping is
    suppressed when the caller asks for it through disableEscaping,
    and also whenever the printer is inside an element such as
    SCRIPT or STYLE (disableOutputEscaping is non-zero).
    """
    # Finish a start tag that is still open before writing content.
    if self._inElement:
        self.writeAscii('>')
        self._inElement = False

    XmlPrinter.text(self, data,
                    disableEscaping or self.disableOutputEscaping)
    return
def processingInstruction(self, target, data):
    """
    Handles a processingInstruction event.

    The instruction is written in the SGML/HTML form '<?target data>',
    that is, without the "?" that precedes the closing ">" in XML.
    The separating space and the data are omitted when data is empty.
    """
    put_ascii = self.writeAscii
    put_encoded = self.writeEncode

    # Finish a start tag that is still open.
    if self._inElement:
        put_ascii('>')
        self._inElement = False

    put_ascii('<?')
    put_encoded(target, 'processing-instruction target')
    if data:
        put_ascii(' ')
        put_encoded(data, 'processing-instruction data')
    put_ascii('>')
    return
# Elements for which end tags must not be emitted.  Used as a set:
# only the (lower-case) keys are looked up, by endElement.
forbiddenEndElements = {
    'area' : True,
    'base' : True,
    'basefont' : True,
    'br' : True,
    'col' : True,
    'frame' : True,
    'hr' : True,
    'img' : True,
    'input' : True,
    'isindex' : True,
    'link' : True,
    'meta' : True,
    'param' : True,
    }
# Elements in which character data is not escaped.  Used as a set:
# only the (lower-case) keys are looked up.
#
# FIXME: According to HTML 4.01 section B.3.2, "</" and unencodable
# characters within the content of a SCRIPT or STYLE element must
# be escaped according to the conventions of the script or style
# language in use.
noEscapeElements = {'script' : True, 'style' : True}
# Boolean attributes that can be minimized.
# Maps a lower-case attribute name to the lower-case names of the
# elements on which that attribute is boolean.
booleanAttributes = {
    'checked' : ['input'],
    'compact' : ['dl', 'ol', 'ul', 'dir', 'menu', 'li'],
    'declare' : ['object'],
    'defer' : ['script'],
    'disabled' : [
        'input', 'select', 'optgroup', 'option', 'textarea', 'button',
        ],
    'ismap' : ['img', 'input'],
    'multiple' : ['select'],
    'nohref' : ['area'],
    'noresize' : ['frame'],
    'noshade' : ['hr'],
    'nowrap' : ['th', 'td'],
    'readonly' : ['input', 'textarea'],
    'selected' : ['option'],
    }
# URI attributes that can have non-ASCII characters escaped.
# Maps a lower-case attribute name to the lower-case names of the
# elements on which that attribute holds a URI.
uriAttributes = {
    'action' : ['form'],
    'background' : ['body'],
    'cite' : ['blockquote', 'del', 'ins', 'q'],
    'classid' : ['object'],
    'codebase' : ['applet', 'object'],
    'data' : ['object'],
    'href' : ['a', 'area', 'base', 'link'],
    'longdesc' : ['frame', 'iframe', 'img'],
    'profile' : ['head'],
    'src' : [
        'frame', 'iframe', 'img', 'input', 'script',
        ],
    'usemap' : ['img', 'input', 'object'],
    }
# HTML 3.2 defined character entities.
#
# Maps each character of the upper half of ISO 8859-1 (U+00A0 through
# U+00FF) to the named character reference written in its place.  The
# values are pure ASCII on purpose: a replacement that is the character
# itself would escape nothing, and raw non-ASCII bytes are not valid in
# a source file that carries no encoding declaration.
entities_3_2 = {
    # Sect 24.2 -- ISO 8859-1
    u'\u00A0' : '&nbsp;',
    u'\u00A1' : '&iexcl;',
    u'\u00A2' : '&cent;',
    u'\u00A3' : '&pound;',
    u'\u00A4' : '&curren;',
    u'\u00A5' : '&yen;',
    u'\u00A6' : '&brvbar;',
    u'\u00A7' : '&sect;',
    u'\u00A8' : '&uml;',
    u'\u00A9' : '&copy;',
    u'\u00AA' : '&ordf;',
    u'\u00AB' : '&laquo;',
    u'\u00AC' : '&not;',
    u'\u00AD' : '&shy;',
    u'\u00AE' : '&reg;',
    u'\u00AF' : '&macr;',
    u'\u00B0' : '&deg;',
    u'\u00B1' : '&plusmn;',
    u'\u00B2' : '&sup2;',
    u'\u00B3' : '&sup3;',
    u'\u00B4' : '&acute;',
    u'\u00B5' : '&micro;',
    u'\u00B6' : '&para;',
    u'\u00B7' : '&middot;',
    u'\u00B8' : '&cedil;',
    u'\u00B9' : '&sup1;',
    u'\u00BA' : '&ordm;',
    u'\u00BB' : '&raquo;',
    u'\u00BC' : '&frac14;',
    u'\u00BD' : '&frac12;',
    u'\u00BE' : '&frac34;',
    u'\u00BF' : '&iquest;',
    u'\u00C0' : '&Agrave;',
    u'\u00C1' : '&Aacute;',
    u'\u00C2' : '&Acirc;',
    u'\u00C3' : '&Atilde;',
    u'\u00C4' : '&Auml;',
    u'\u00C5' : '&Aring;',
    u'\u00C6' : '&AElig;',
    u'\u00C7' : '&Ccedil;',
    u'\u00C8' : '&Egrave;',
    u'\u00C9' : '&Eacute;',
    u'\u00CA' : '&Ecirc;',
    u'\u00CB' : '&Euml;',
    u'\u00CC' : '&Igrave;',
    u'\u00CD' : '&Iacute;',
    u'\u00CE' : '&Icirc;',
    u'\u00CF' : '&Iuml;',
    u'\u00D0' : '&ETH;',
    u'\u00D1' : '&Ntilde;',
    u'\u00D2' : '&Ograve;',
    u'\u00D3' : '&Oacute;',
    u'\u00D4' : '&Ocirc;',
    u'\u00D5' : '&Otilde;',
    u'\u00D6' : '&Ouml;',
    u'\u00D7' : '&times;',
    u'\u00D8' : '&Oslash;',
    u'\u00D9' : '&Ugrave;',
    u'\u00DA' : '&Uacute;',
    u'\u00DB' : '&Ucirc;',
    u'\u00DC' : '&Uuml;',
    u'\u00DD' : '&Yacute;',
    u'\u00DE' : '&THORN;',
    u'\u00DF' : '&szlig;',
    u'\u00E0' : '&agrave;',
    u'\u00E1' : '&aacute;',
    u'\u00E2' : '&acirc;',
    u'\u00E3' : '&atilde;',
    u'\u00E4' : '&auml;',
    u'\u00E5' : '&aring;',
    u'\u00E6' : '&aelig;',
    u'\u00E7' : '&ccedil;',
    u'\u00E8' : '&egrave;',
    u'\u00E9' : '&eacute;',
    u'\u00EA' : '&ecirc;',
    u'\u00EB' : '&euml;',
    u'\u00EC' : '&igrave;',
    u'\u00ED' : '&iacute;',
    u'\u00EE' : '&icirc;',
    u'\u00EF' : '&iuml;',
    u'\u00F0' : '&eth;',
    u'\u00F1' : '&ntilde;',
    u'\u00F2' : '&ograve;',
    u'\u00F3' : '&oacute;',
    u'\u00F4' : '&ocirc;',
    u'\u00F5' : '&otilde;',
    u'\u00F6' : '&ouml;',
    u'\u00F7' : '&divide;',
    u'\u00F8' : '&oslash;',
    u'\u00F9' : '&ugrave;',
    u'\u00FA' : '&uacute;',
    u'\u00FB' : '&ucirc;',
    u'\u00FC' : '&uuml;',
    u'\u00FD' : '&yacute;',
    u'\u00FE' : '&thorn;',
    u'\u00FF' : '&yuml;',
    }
# HTML 4.01 defined character entities.
#
# Maps each character to the named character reference written in its
# place.  These are the names HTML 4.01 adds on top of the ISO 8859-1
# set; every key is distinct and every value is pure ASCII.
entities_4_0 = {
    # Sect 24.3 -- Symbols, Mathematical Symbols, and Greek Letters
    # Latin Extended-B
    u'\u0192' : '&fnof;',
    # Greek
    u'\u0391' : '&Alpha;',
    u'\u0392' : '&Beta;',
    u'\u0393' : '&Gamma;',
    u'\u0394' : '&Delta;',
    u'\u0395' : '&Epsilon;',
    u'\u0396' : '&Zeta;',
    u'\u0397' : '&Eta;',
    u'\u0398' : '&Theta;',
    u'\u0399' : '&Iota;',
    u'\u039A' : '&Kappa;',
    u'\u039B' : '&Lambda;',
    u'\u039C' : '&Mu;',
    u'\u039D' : '&Nu;',
    u'\u039E' : '&Xi;',
    u'\u039F' : '&Omicron;',
    u'\u03A0' : '&Pi;',
    u'\u03A1' : '&Rho;',
    u'\u03A3' : '&Sigma;',
    u'\u03A4' : '&Tau;',
    u'\u03A5' : '&Upsilon;',
    u'\u03A6' : '&Phi;',
    u'\u03A7' : '&Chi;',
    u'\u03A8' : '&Psi;',
    u'\u03A9' : '&Omega;',
    u'\u03B1' : '&alpha;',
    u'\u03B2' : '&beta;',
    u'\u03B3' : '&gamma;',
    u'\u03B4' : '&delta;',
    u'\u03B5' : '&epsilon;',
    u'\u03B6' : '&zeta;',
    u'\u03B7' : '&eta;',
    u'\u03B8' : '&theta;',
    u'\u03B9' : '&iota;',
    u'\u03BA' : '&kappa;',
    u'\u03BB' : '&lambda;',
    u'\u03BC' : '&mu;',
    u'\u03BD' : '&nu;',
    u'\u03BE' : '&xi;',
    u'\u03BF' : '&omicron;',
    u'\u03C0' : '&pi;',
    u'\u03C1' : '&rho;',
    u'\u03C2' : '&sigmaf;',
    u'\u03C3' : '&sigma;',
    u'\u03C4' : '&tau;',
    u'\u03C5' : '&upsilon;',
    u'\u03C6' : '&phi;',
    u'\u03C7' : '&chi;',
    u'\u03C8' : '&psi;',
    u'\u03C9' : '&omega;',
    u'\u03D1' : '&thetasym;',
    u'\u03D2' : '&upsih;',
    u'\u03D6' : '&piv;',
    # General Punctuation
    u'\u2022' : '&bull;',      # bullet
    u'\u2026' : '&hellip;',    # horizontal ellipsis
    u'\u2032' : '&prime;',     # prime (minutes/feet)
    u'\u2033' : '&Prime;',     # double prime (seconds/inches)
    u'\u203E' : '&oline;',     # overline (spacing overscore)
    # The fraction slash is U+2044.  U+203A is the single right-pointing
    # angle quotation mark, which has its own entry (&rsaquo;) below.
    u'\u2044' : '&frasl;',     # fraction slash
    # Letterlike Symbols
    u'\u2118' : '&weierp;',    # script capital P (power set/Weierstrass p)
    u'\u2111' : '&image;',     # blackletter capital I (imaginary part)
    u'\u211C' : '&real;',      # blackletter capital R (real part)
    u'\u2122' : '&trade;',     # trademark
    u'\u2135' : '&alefsym;',   # alef symbol (first transfinite cardinal)
    # Arrows
    u'\u2190' : '&larr;',      # leftwards arrow
    u'\u2191' : '&uarr;',      # upwards arrow
    u'\u2192' : '&rarr;',      # rightwards arrow
    u'\u2193' : '&darr;',      # downwards arrow
    u'\u2194' : '&harr;',      # left right arrow
    u'\u21B5' : '&crarr;',     # downwards arrow with corner leftwards
    u'\u21D0' : '&lArr;',      # leftwards double arrow
    u'\u21D1' : '&uArr;',      # upwards double arrow
    u'\u21D2' : '&rArr;',      # rightwards double arrow
    u'\u21D3' : '&dArr;',      # downwards double arrow
    u'\u21D4' : '&hArr;',      # left right double arrow
    # Mathematical Operators
    u'\u2200' : '&forall;',    # for all
    u'\u2202' : '&part;',      # partial differential
    u'\u2203' : '&exist;',     # there exists
    u'\u2205' : '&empty;',     # empty set, null set, diameter
    u'\u2207' : '&nabla;',     # nabla, backward difference
    u'\u2208' : '&isin;',      # element of
    u'\u2209' : '&notin;',     # not an element of
    u'\u220B' : '&ni;',        # contains as member
    u'\u220F' : '&prod;',      # n-ary product, product sign
    u'\u2211' : '&sum;',       # n-ary summation
    u'\u2212' : '&minus;',     # minus sign
    u'\u2217' : '&lowast;',    # asterisk operator
    u'\u221A' : '&radic;',     # square root, radical sign
    u'\u221D' : '&prop;',      # proportional to
    u'\u221E' : '&infin;',     # infinity
    u'\u2220' : '&ang;',       # angle
    u'\u2227' : '&and;',       # logical and, wedge
    u'\u2228' : '&or;',        # logical or, vee
    u'\u2229' : '&cap;',       # intersection, cap
    u'\u222A' : '&cup;',       # union, cup
    u'\u222B' : '&int;',       # integral
    u'\u2234' : '&there4;',    # therefore
    u'\u223C' : '&sim;',       # tilde operator, varies with, similar to
    u'\u2245' : '&cong;',      # approximately equal to
    u'\u2248' : '&asymp;',     # almost equal to, asymptotic to
    u'\u2260' : '&ne;',        # not equal to
    u'\u2261' : '&equiv;',     # identical to
    u'\u2264' : '&le;',        # less-than or equal to
    u'\u2265' : '&ge;',        # greater-than or equal to
    u'\u2282' : '&sub;',       # subset of
    u'\u2283' : '&sup;',       # superset of
    u'\u2284' : '&nsub;',      # not subset of
    u'\u2286' : '&sube;',      # subset of or equal to
    u'\u2287' : '&supe;',      # superset of or equal to
    u'\u2295' : '&oplus;',     # circled plus, direct sum
    u'\u2297' : '&otimes;',    # circled times, vector product
    u'\u22A5' : '&perp;',      # up tack, orthogonal to, perpendicular
    u'\u22C5' : '&sdot;',      # dot operator
    u'\u2308' : '&lceil;',     # left ceiling, apl upstile
    u'\u2309' : '&rceil;',     # right ceiling
    u'\u230A' : '&lfloor;',    # left floor, apl downstile
    u'\u230B' : '&rfloor;',    # right floor
    u'\u2329' : '&lang;',      # left-pointing angle bracket, bra
    u'\u232A' : '&rang;',      # right-pointing angle bracket, ket
    u'\u25CA' : '&loz;',       # lozenge
    # Miscellaneous Symbols
    u'\u2660' : '&spades;',
    u'\u2663' : '&clubs;',
    u'\u2665' : '&hearts;',
    u'\u2666' : '&diams;',
    # Sect 24.4 -- Markup Significant and Internationalization
    # Latin Extended-A
    u'\u0152' : '&OElig;',     # capital ligature OE
    u'\u0153' : '&oelig;',     # small ligature oe
    u'\u0160' : '&Scaron;',    # capital S with caron
    u'\u0161' : '&scaron;',    # small s with caron
    u'\u0178' : '&Yuml;',      # capital Y with diaeresis
    # Spacing Modifier Letters
    u'\u02C6' : '&circ;',      # circumflex accent
    u'\u02DC' : '&tilde;',     # small tilde
    # General Punctuation
    u'\u2002' : '&ensp;',      # en space
    u'\u2003' : '&emsp;',      # em space
    u'\u2009' : '&thinsp;',    # thin space
    u'\u200C' : '&zwnj;',      # zero-width non-joiner
    u'\u200D' : '&zwj;',       # zero-width joiner
    u'\u200E' : '&lrm;',       # left-to-right mark
    u'\u200F' : '&rlm;',       # right-to-left mark
    u'\u2013' : '&ndash;',     # en dash
    u'\u2014' : '&mdash;',     # em dash
    u'\u2018' : '&lsquo;',     # left single quotation mark
    u'\u2019' : '&rsquo;',     # right single quotation mark
    u'\u201A' : '&sbquo;',     # single low-9 quotation mark
    u'\u201C' : '&ldquo;',     # left double quotation mark
    u'\u201D' : '&rdquo;',     # right double quotation mark
    u'\u201E' : '&bdquo;',     # double low-9 quotation mark
    u'\u2020' : '&dagger;',    # dagger
    u'\u2021' : '&Dagger;',    # double dagger
    u'\u2030' : '&permil;',    # per mille sign
    u'\u2039' : '&lsaquo;',    # single left-pointing angle quotation mark
    u'\u203A' : '&rsaquo;',    # single right-pointing angle quotation mark
    u'\u20AC' : '&euro;',      # euro sign
    }
# Entity maps by HTML version.  Each list is filled, in this order,
# with the map for text, the map for attribute values delimited by '"'
# and the map for attribute values delimited by "'"; startDocument
# unpacks the three in that same order.
_versionedEntities = {
    '3.2' : [],
    '4.0' : [],
    }

# Characters that must be replaced in text content.  Markup-significant
# characters become character references rather than themselves, and a
# carriage return is written as a numeric reference.
textEntities = {'<' : '&lt;',
                '>' : '&gt;',
                '&' : '&amp;',
                '\r' : '&#13;',
                }
# The HTML 3.2 map is built while the dictionary holds only the 3.2
# names; the 4.0 names are merged in afterwards.
# NOTE(review): this relies on EntityMap not keeping a live reference to
# the dictionary it is given -- confirm against cStreamWriter.
textEntities.update(entities_3_2)
_versionedEntities['3.2'].append(cStreamWriter.EntityMap(textEntities))
textEntities.update(entities_4_0)
textEntities = cStreamWriter.EntityMap(textEntities)
_versionedEntities['4.0'].append(textEntities)
# For HTML attribute values:
#   1. do not escape '<' (see XSLT 1.0 section 16.2)
#   2. only escape '&' if not followed by '{'
def attr_amp_escape(string, offset):
    """
    Returns the replacement text for the '&' found at string[offset].

    An ampersand that begins '&{' is returned unchanged; any other
    ampersand is returned as the character reference '&amp;'.

    string is the attribute value being written and offset the index
    of the ampersand within it.
    """
    if string.startswith('&{', offset):
        return '&'
    return '&amp;'
# Replacements for attribute values delimited by double quotes.
# '&' is decided per occurrence by attr_amp_escape; whitespace control
# characters are written as numeric references, and the delimiter
# itself as '&quot;'.  '<' is deliberately absent.
attrEntitiesQuot = {'&' : attr_amp_escape,
                    '\t' : '&#9;',
                    '\n' : '&#10;',
                    '\r' : '&#13;',
                    '"' : '&quot;',
                    }
# Build the HTML 3.2 map before the 4.0 names are merged in.
# NOTE(review): this relies on EntityMap not keeping a live reference to
# the dictionary it is given -- confirm against cStreamWriter.
attrEntitiesQuot.update(entities_3_2)
_versionedEntities['3.2'].append(cStreamWriter.EntityMap(attrEntitiesQuot))
attrEntitiesQuot.update(entities_4_0)
attrEntitiesQuot = cStreamWriter.EntityMap(attrEntitiesQuot)
_versionedEntities['4.0'].append(attrEntitiesQuot)
# Replacements for attribute values delimited by single quotes.
# Same as the double-quote table except for the delimiter, which has
# to be a numeric reference.
attrEntitiesApos = {'&' : attr_amp_escape,
                    '\t' : '&#9;',
                    '\n' : '&#10;',
                    '\r' : '&#13;',
                    "'" : '&#39;',  # no &apos; in HTML
                    }
# Build the HTML 3.2 map before the 4.0 names are merged in.
# NOTE(review): this relies on EntityMap not keeping a live reference to
# the dictionary it is given -- confirm against cStreamWriter.
attrEntitiesApos.update(entities_3_2)
_versionedEntities['3.2'].append(cStreamWriter.EntityMap(attrEntitiesApos))
attrEntitiesApos.update(entities_4_0)
attrEntitiesApos = cStreamWriter.EntityMap(attrEntitiesApos)
_versionedEntities['4.0'].append(attrEntitiesApos)

# These names were only needed to build the maps above; remove them so
# that they do not linger in the namespace.
del entities_3_2
del entities_4_0
del attr_amp_escape
|