TreeCompare.py :  » XML » 4Suite » 4Suite-XML-1.0.2 » Ft » Xml » Lib » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » XML » 4Suite 
4Suite » 4Suite XML 1.0.2 » Ft » Xml » Lib » TreeCompare.py
########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/Lib/TreeCompare.py,v 1.26 2005/03/18 23:47:16 jkloth Exp $
"""
Comparison functions for XML and HTML documents
(mainly used in the test suites)

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""

import re, sgmllib
from sgmllib import SGMLParser
from xml.dom import Node

# Replace sgmllib's tag-name and attribute patterns with versions whose name
# character classes include a colon (e.g. for prefixed names such as
# xsl:template).
# NOTE: these assignments rebind attributes on the sgmllib module itself, so
# the replacement is visible to every user of sgmllib in this process, not
# just to this file.
# NOTE(review): presumably SGMLParser looks these patterns up on the module at
# parse time -- confirm against the sgmllib version in use.
sgmllib.tagfind = re.compile('[a-zA-Z][-:_.a-zA-Z0-9]*')
sgmllib.attrfind = re.compile(r'\s*([a-zA-Z_][-:_.a-zA-Z0-9]*)(\s*=\s*'
                              r'(\'[^\']*\'|"[^"]*"|\S*))?')

from Ft.Lib.Uri import BASIC_RESOLVER,OsPathToUri
from Ft.Xml import Domlette,InputSource,XMLNS_NAMESPACE
from Ft.Xml.Lib.XmlString import XmlStrStrip,IsXmlSpace

# Take the keys of HtmlPrinter's forbiddenEndElements table as the list of
# element names for which the SGML event generator below synthesizes an end
# tag event (presumably HTML elements that never carry an end tag, such as
# <br> -- confirm in HtmlPrinter).  The module name is deleted afterwards so
# only the list remains in this namespace.
import HtmlPrinter
FORBIDDEN_END_ELEMENTS = HtmlPrinter.HtmlPrinter.forbiddenEndElements.keys()
del HtmlPrinter

# Building blocks for recognizing an XML declaration.  The names follow the
# productions of the XML 1.0 grammar (S, Eq, VersionInfo, EncodingDecl,
# SDDecl, XMLDecl).
#
# S is *one or more* whitespace characters, and Eq allows optional whitespace
# on either side of '='.  A declaration such as
#     <?xml  version = "1.0"
#           encoding="utf-8" ?>
# is therefore recognized as well as the minimal single-space form.
_S = "[\x20\x09\x0D\x0A]+"
_OptionalS = "[\x20\x09\x0D\x0A]*"
_VersionNum = "[a-zA-Z0-9_.:-]+"
_Eq = _OptionalS + "=" + _OptionalS
_VersionInfo = _S + "version" + _Eq + \
               "(?:(?:'" + _VersionNum + "')|" + '(?:"' + _VersionNum + '"))'
_EncName = "[A-Za-z][A-Za-z0-9._-]*"
_EncodingDecl = _S + "encoding" + _Eq + \
                "(?:(?:'" + _EncName + "')|" + '(?:"' + _EncName + '"))'
_SDDecl = _S + "standalone" + _Eq + \
          "(?:(?:'(?:yes|no)')|" + '(?:"(?:yes|no)"))'

# Matches an XML declaration at the start of a string.  The named groups
# VersionInfo, EncodingDecl and SDDecl hold the corresponding pseudo-attribute
# (including its leading whitespace); the latter two are None when absent.
g_xmlTest = re.compile(r"<\?xml" +
                       r"(?P<VersionInfo>%s)" % _VersionInfo +
                       r"(?P<EncodingDecl>%s)?" % _EncodingDecl +
                       r"(?P<SDDecl>%s)?" % _SDDecl +
                       _OptionalS + r"\?>")

# Matches the start of a document type declaration.
g_doctypeTest = re.compile("(<!DOCTYPE[\x20\x09\x0D\x0A])")

# Clues that a string is HTML: an HTML doctype or an <html start tag,
# in any letter case.
g_htmlTest = re.compile("(<!doctype html)|(<html)",re.IGNORECASE)

def HtmlTreeCompare(expected, compared):
    """
    cmp()-style comparison of two HTML strings: the result is true
    (non-zero) when the strings differ and false (zero) when they match.

    The work is delegated to CompareHTML() with its default options;
    this wrapper only inverts CompareHTML()'s result.
    """
    same = CompareHTML(expected, compared)
    return not same

def XmlTreeCompare(expected, compared):
    """
    Compare two XML strings with TreeCompare(), deciding from the
    expected string whether both should be parsed as external parsed
    entities (asEntity) or as complete documents.

    Returns whatever TreeCompare() returns: false when the strings
    compare equal.
    """
    # External Parsed Entities cannot have a standalone declaration or
    # DOCTYPE declaration, so either one means "parse as a document".
    # See XML 1.0 2nd, 4.3.2, Well-Formed Parsed Entities
    declaration = g_xmlTest.match(expected)
    hasStandalone = declaration and declaration.groupdict().get('SDDecl')
    asEntity = not (hasStandalone or g_doctypeTest.search(expected))
    return TreeCompare(expected, compared, asEntity=asEntity)

def NoWsTreeCompare(expected, compared):
    """
    Shorthand for TreeCompare(expected, compared, ignoreWhitespace=1);
    the result of that call is returned unchanged.
    """
    options = {'ignoreWhitespace': 1}
    return TreeCompare(expected, compared, **options)

from Ft.Xml import READ_EXTERNAL_DTD
def TreeCompare(expected, compared, ignoreWhitespace=0, baseUri=None,
                readExtDtd=READ_EXTERNAL_DTD, ignoreNsDecls=0, asEntity=False):
    """
    A cmp()-like function that compares two XML or HTML strings and
    has the side effect of reporting differences to stdout. Returns
    false if the nodes compare equal.

    XML strings are parsed into a Domlette and compared node-by-node.
    HTML strings are parsed with an SGML parser and are compared
    event-by-event. The markup type is guessed based on clues in the
    expected string: it is treated as HTML when it does not start with
    an XML declaration and contains an HTML doctype or <html tag.

    ignoreWhitespace controls whether whitespace differences in text
    nodes are ignored.

    'file:' URIs based on the current working directory are generated
    for each document ('expected' and 'compared' respectively). The
    baseUri argument is an optional absolute URI to use as the basis of
    the generated URIs, if a 'file' URI is undesirable.

    readExtDtd controls whether the external DTD subset is read
    when parsing XML. It does not affect the reading of external
    entities declared in the internal DTD subset.

    ignoreNsDecls controls whether namespace declarations are ignored
    when comparing XML documents.

    asEntity selects Domlette.EntityReader, so that both strings are
    parsed as external parsed entities rather than as documents; it
    takes precedence over readExtDtd.

    If either string fails to parse, the offending string is printed
    to stdout and the parser's exception propagates to the caller.
    """
    # See if we need to use XML or HTML
    if not g_xmlTest.match(expected) and g_htmlTest.search(expected):
        return not CompareHTML(expected, compared, ignoreWhitespace)

    if not baseUri:
        uri = OsPathToUri('expected', attemptAbsolute=True)
    else:
        uri = BASIC_RESOLVER.normalize('expected', baseUri)
    try:
        if asEntity:
            reader = Domlette.EntityReader
        elif readExtDtd:
            reader = Domlette.NonvalidatingReader
        else:
            reader = Domlette.NoExtDtdReader

        # NOTE(review): only the expected string is passed through str();
        # the compared string below is handed to the parser as-is.
        doc1 = reader.parseString(str(expected), uri)
    except:
        # Show what could not be parsed, then let the error propagate.
        print('--- Expected ---')
        print(expected)
        raise

    # The second document gets its own URI, derived from 'compared'.
    if not baseUri:
        uri = OsPathToUri('compared', attemptAbsolute=True)
    else:
        uri = BASIC_RESOLVER.normalize('compared', baseUri)
    try:
        doc2 = reader.parseString(compared, uri)
    except:
        # Show what could not be parsed, then let the error propagate.
        print('--- Compared ---')
        print(compared)
        raise

    # If an entity is also a well-formed document entity, remove any
    # top-level whitespace-only text nodes.
    if asEntity:
        _TryEntityAsDocumentEntity(doc1)
        _TryEntityAsDocumentEntity(doc2)

    result = NodeCompare(doc1, doc2,
                         ignoreWhitespace=ignoreWhitespace,
                         ignoreNsDecls=ignoreNsDecls)
    # NodeCompare() is true for "equal"; this function is cmp()-like.
    return not result


def _TryEntityAsDocumentEntity(entity):
    # If the entity only has 1 top-level element, strip top-level whitespace
    # only text nodes to match how a document entity would have been parsed.
    elements = 0
    for node in entity.childNodes:
        elements += int(node.nodeType == Node.ELEMENT_NODE)

    if elements == 1:
        # OK to strip WS-only text nodes
        nodes = [ x for x in entity.childNodes
                  if x.nodeType == Node.TEXT_NODE and IsXmlSpace(x.data) ]

        for node in nodes:
            entity.removeChild(node)
    return


def NodeCompare(node1, node2, ignoreWhitespace=0, ignoreComments=0,
                ignoreNsDecls=0):
    """
    A function that compares two XML DOM nodes by traversing their
    attributes and descendants recursively until a mismatch is found.
    It has the side effect of reporting differences to stdout. Returns
    true (1) if the nodes compare equal and false (0) otherwise.

    What is compared depends on the node type:
      - documents and document fragments: their children, pairwise
      - document types: name, publicId, systemId, internalSubset and
        the number of entities and notations
      - elements: localName, namespaceURI, attributes (expanded name
        and value) and children, pairwise
      - text and comments: data
      - processing instructions: target and data
    Nodes of any other type are considered equal once their nodeType
    matches.

    ignoreWhitespace controls whether whitespace differences in text
    nodes are ignored: text nodes for which IsXmlSpace() is true compare
    equal to each other; all other text is compared exactly.

    ignoreComments controls whether comment nodes are ignored.

    ignoreNsDecls controls whether namespace declarations are ignored.
    """
    if node1.nodeType != node2.nodeType:
        return __ReportError(node1, node2, 'nodeType')

    # -- Document Nodes --------------------------------------
    if node1.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
        if not _CompareChildNodes(node1, node2, ignoreWhitespace,
                                  ignoreComments, ignoreNsDecls):
            return 0

    # -- Document Type Nodes ---------------------------------
    elif node1.nodeType == Node.DOCUMENT_TYPE_NODE:
        if node1.name != node2.name:
            return __ReportError(node1, node2, 'name')
        if node1.publicId != node2.publicId:
            return __ReportError(node1, node2, 'publicId')
        if node1.systemId != node2.systemId:
            return __ReportError(node1, node2, 'systemId')
        if node1.internalSubset != node2.internalSubset:
            return __ReportError(node1, node2, 'internalSubset')
        # Only the number of entities/notations is checked, not their content
        if len(node1.entities) != len(node2.entities):
            return __ReportError(node1, node2, 'entities')
        if len(node1.notations) != len(node2.notations):
            return __ReportError(node1, node2, 'notations')

    # -- Element Nodes ---------------------------------------
    elif node1.nodeType == Node.ELEMENT_NODE:
        if node1.localName != node2.localName:
            return __ReportError(node1, node2, 'localName')
        if node1.namespaceURI != node2.namespaceURI:
            return __ReportError(node1, node2, 'namespaceURI')

        # Compare attributes
        attrs1 = list(node1.attributes.values())
        attrs2 = list(node2.attributes.values())
        if ignoreNsDecls:
            # Remove XML Namespace declarations
            attrs1 = [ a for a in attrs1 if a.namespaceURI != XMLNS_NAMESPACE ]
            attrs2 = [ a for a in attrs2 if a.namespaceURI != XMLNS_NAMESPACE ]
        if len(attrs1) != len(attrs2):
            return __ReportError(node1, node2, 'attributes')
        # Pair the attributes up by expanded name (namespace + local name).
        # The prefix is not significant, so ordering by qualified name could
        # pair unrelated attributes when the documents use different prefixes.
        attrs1 = _SortAttributesByExpandedName(attrs1)
        attrs2 = _SortAttributesByExpandedName(attrs2)
        for attr1, attr2 in zip(attrs1, attrs2):
            if attr1.localName != attr2.localName:
                print(node1.attributes.keys())
                print(node2.attributes.keys())
                return __ReportError(attr1, attr2, 'localName')
            if attr1.namespaceURI != attr2.namespaceURI:
                return __ReportError(attr1, attr2, 'namespaceURI')
            # Same attribute on both sides: the values must agree as well
            if attr1.value != attr2.value:
                return __ReportError(attr1, attr2, 'value')

        # Compare children
        if not _CompareChildNodes(node1, node2, ignoreWhitespace,
                                  ignoreComments, ignoreNsDecls):
            return 0

    # -- Text Nodes ------------------------------------------
    elif node1.nodeType == Node.TEXT_NODE:
        text1 = node1.data
        text2 = node2.data
        if ignoreWhitespace:
            # Collapse whitespace-only text to a common marker value
            if IsXmlSpace(text1):
                text1 = None
            if IsXmlSpace(text2):
                text2 = None
        if text1 != text2:
            return __ReportError(node1, node2, 'data')

    # -- Comment Nodes ---------------------------------------
    elif node1.nodeType == Node.COMMENT_NODE:
        if node1.data != node2.data:
            return __ReportError(node1, node2, 'data')

    # -- Processing Instruction Nodes ------------------------
    elif node1.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
        if node1.target != node2.target:
            return __ReportError(node1, node2, 'target')
        if node1.data != node2.data:
            return __ReportError(node1, node2, 'data')

    # All tests pass, they are the same
    return 1


def _CompareChildNodes(node1, node2, ignoreWhitespace, ignoreComments,
                       ignoreNsDecls):
    # Compare the children of two container nodes pairwise with NodeCompare().
    # Returns 1 if all children match, 0 (after reporting) otherwise.
    if ignoreComments:
        # Remove comment nodes
        children1 = [ c for c in node1.childNodes
                      if c.nodeType != Node.COMMENT_NODE ]
        children2 = [ c for c in node2.childNodes
                      if c.nodeType != Node.COMMENT_NODE ]
    else:
        children1 = node1.childNodes
        children2 = node2.childNodes
    # A differing number of children is itself a mismatch; checking it first
    # also guarantees that every child has a counterpart to compare against.
    if len(children1) != len(children2):
        return __ReportError(node1, node2, 'childNodes')
    for child1, child2 in zip(children1, children2):
        if not NodeCompare(child1, child2, ignoreWhitespace,
                           ignoreComments, ignoreNsDecls):
            return 0
    return 1


def _SortAttributesByExpandedName(attrs):
    # Return the attribute nodes ordered by (namespaceURI, localName).
    decorated = []
    for attr in attrs:
        # The running position breaks ties, so the attribute nodes themselves
        # are never compared by the sort.
        decorated.append((attr.namespaceURI or '', attr.localName or '',
                          len(decorated), attr))
    decorated.sort()
    return [ entry[-1] for entry in decorated ]


def __ReportError(node1, node2, attribute):
    # Print, for both nodes, the path from the root, the node itself and the
    # value of the named attribute that differs.  Always returns 0 so callers
    # can write "return __ReportError(...)".
    import pprint
    for label, node in (('--- expected ---', node1),
                        ('--- compared ---', node2)):
        print(label)
        __PrintParentage(node)
        print('node: %s' % repr(node))
        print('node.%s:' % attribute)
        pprint.pprint(getattr(node, attribute))
    return 0

def __PrintParentage(node):
    # Print the nodeName of each ancestor of the node and of the node itself,
    # outermost first, indenting two spaces per level.
    lineage = [node]
    if node.nodeType == Node.ATTRIBUTE_NODE:
        # Attributes hang off their element, not off a parentNode
        parent = node.ownerElement
    else:
        parent = node.parentNode
    # Test against None explicitly rather than relying on node truthiness
    while parent is not None:
        lineage.append(parent)
        parent = parent.parentNode
    lineage.reverse()
    indent = ''
    for ancestor in lineage:
        print('%s%s' % (indent, ancestor.nodeName))
        indent = indent + '  '



class SGMLParserEventGenerator(SGMLParser):
    """
    An HTML parser that meets our needs better than Python's
    htmllib.HTMLParser, and that works with Python 2.1.

    Instead of building a tree, it records what it sees as a flat list
    of event tuples in self.events; the first item of each tuple is one
    of the *_EVENT constants below, followed by the event's data
    (text, comment text, tag name [+ attribute dictionary], or
    reference name).

    Used by CompareHTML().
    """
    TEXT_EVENT = 1
    COMMENT_EVENT = 2
    START_TAG_EVENT = 3
    END_TAG_EVENT = 4
    ENTITYREF_EVENT = 6
    CHARREF_EVENT = 7

    def __init__(self, verbose=0):
        # Buffer for character data that has not yet been turned into an event
        self.testdata = ""
        SGMLParser.__init__(self, verbose)
        self.events = []
        # True until the first start tag is seen; character data flushed
        # while it is set is discarded instead of being recorded.
        self.ignoreable_ws = 1

    def handle_data(self, data):
        # Accumulate only; the text event is produced by flush().
        self.testdata += data

    def flush(self):
        # Turn the buffered character data, if any, into one text event.
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        if self.ignoreable_ws:
            return
        self.events.append((self.TEXT_EVENT, pending))

    def handle_comment(self, data):
        self.flush()
        self.events.append((self.COMMENT_EVENT, data))

    def unknown_starttag(self, tagname, attrs):
        self.flush()
        # From the first start tag on, character data is recorded
        self.ignoreable_ws = 0

        # Convert list of tuples to dictionary
        attributes = dict(attrs)
        self.events.append((self.START_TAG_EVENT, tagname, attributes))
        # These elements get a matching end tag event right away
        if tagname.lower() in FORBIDDEN_END_ELEMENTS:
            self.events.append((self.END_TAG_EVENT, tagname))
        return

    def unknown_endtag(self, tag):
        self.flush()
        self.events.append((self.END_TAG_EVENT, tag))

    def unknown_entityref(self, ref):
        self.flush()
        self.events.append((self.ENTITYREF_EVENT, ref))

    def unknown_charref(self, ref):
        self.flush()
        self.events.append((self.CHARREF_EVENT, ref))

    def close(self):
        SGMLParser.close(self)
        # Emit whatever character data is still buffered
        self.flush()

# Matches an XML-style empty-element tag such as <p/>, <hr /> or
# <img src="a" alt="b"/>: a tag name, any number of attributes (each with an
# optional quoted or unquoted value), optional whitespace and the closing '/>'.
# Group 1 captures the tag name plus its attributes, without the trailing
# whitespace and slash, so that sub(r'<\1>', ...) rewrites the tag as an
# ordinary start tag.
g_xmlEmptyTagPattern = re.compile(
    r'<([a-zA-Z][-:_.a-zA-Z0-9]*'
    r'(?:\s+[a-zA-Z_][-:_.a-zA-Z0-9]*'
    r'(?:\s*=\s*(?:\'[^\']*\'|"[^"]*"|[^\s>]*))?)*'
    r')\s*/>')

def CompareHTML(html1, html2, ignoreWhitespace=0):
    """
    Compare two HTML strings by parsing each with
    SGMLParserEventGenerator and comparing the recorded events pairwise
    until a mismatch is found. It has the side effect of reporting
    the first difference to stdout.

    Returns 1 if the two event streams match and 0 otherwise (note
    that this is the opposite of cmp(); HtmlTreeCompare() and
    TreeCompare() invert it).

    ignoreWhitespace controls whether whitespace differences in text
    events are ignored: when true, text is compared after XmlStrStrip();
    when false, text must match exactly.  Comment text is always
    compared with surrounding whitespace stripped.

    A document that has events left over after the other one ends is
    reported as a mismatch.
    """
    # SGMLParser goes awry when <emptyelement/> tags are encountered.
    # Obviously this is a very kludgy 'solution'
    html1 = g_xmlEmptyTagPattern.sub(r'<\1>',html1)
    html2 = g_xmlEmptyTagPattern.sub(r'<\1>',html2)

    p1 = SGMLParserEventGenerator()
    p1.feed(html1)
    p1.close()

    p2 = SGMLParserEventGenerator()
    p2.feed(html2)
    p2.close()

    events1 = p1.events
    events2 = p2.events

    stack = []  #Not 100% accurate, but close enough
    for cur1, cur2 in zip(events1, events2):
        if cur1[0] != cur2[0]:
            #different events
            return __ReportEventError(cur1,cur2,stack,'different events')
        event = cur1[0]
        if event == SGMLParserEventGenerator.TEXT_EVENT:
            # Compare the text of each
            d1 = cur1[1]
            d2 = cur2[1]
            if ignoreWhitespace:
                d1 = XmlStrStrip(d1)
                d2 = XmlStrStrip(d2)
            if d1 != d2:
                return __ReportEventError(cur1,cur2,stack,'data')
        elif event == SGMLParserEventGenerator.COMMENT_EVENT:
            d1 = cur1[1]
            d2 = cur2[1]
            if d1.strip() != d2.strip():
                return __ReportEventError(cur1,cur2,stack,'comment data')
        elif event == SGMLParserEventGenerator.START_TAG_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1,cur2,stack,'start tag name')
            stack.append(cur1[1])  #Save for a nice print out

            att1 = cur1[2]
            att2 = cur2[2]

            if len(att1) != len(att2):
                return __ReportEventError(cur1,cur2,stack,'number of attributes')

            for name, value in att1.items():
                # -1 can never equal a parsed attribute value, so a missing
                # attribute is reported like a differing one.
                if att2.get(name, -1) != value:
                    return __ReportEventError(cur1,cur2,stack,'attribute value %s' % name)
        elif event == SGMLParserEventGenerator.END_TAG_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1,cur2,stack,'end tag name')
            # Unwind to the element being closed (implicitly closing anything
            # opened inside it), then drop that element too.  The stack is
            # only used for diagnostics, so an end tag without a matching
            # start tag simply leaves it empty.
            while stack and stack[-1] != cur1[1]:  #Remove it
                del stack[-1]
            if stack:
                del stack[-1]
        elif event == SGMLParserEventGenerator.ENTITYREF_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1,cur2,stack,'entity ref')
        elif event == SGMLParserEventGenerator.CHARREF_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1,cur2,stack,'char ref')
        else:
            # Not produced by SGMLParserEventGenerator as written
            raise ValueError('unknown event: %r' % (cur1,))

    # zip() stops at the shorter stream; leftover events on either side mean
    # the documents differ.
    if len(events1) != len(events2):
        common = min(len(events1), len(events2))
        # An empty tuple stands in for the side that has run out of events
        extra1 = ()
        extra2 = ()
        if len(events1) > common:
            extra1 = events1[common]
        else:
            extra2 = events2[common]
        return __ReportEventError(extra1,extra2,stack,'unmatched trailing event')
    return 1

def __ReportEventError(event1, event2, stack, attribute):
    # Print the current element stack, then for each side a line made of the
    # description of what differs and the event's data (everything after the
    # event type).  Always returns 0 so callers can return the result directly.
    __PrintStack(stack)
    for label, event in (('--- Expected ---', event1),
                         ('--- Compared ---', event2)):
        print(label)
        print('%s %s' % (attribute, repr(event[1:])))
    return 0

def __PrintStack(stack):
    # Print the names in the stack one per line, outermost first, indenting
    # each level two spaces further than the previous one.
    depth = 0
    for name in stack:
        print("%s%s" % ('  ' * depth, name))
        depth += 1


##        if isHtml:
##            # HTML DOM should already capitalize all tagNames
##            if node1.tagName != node2.tagName:
##                return __ReportError(node1, node2, 'tagName')
##            # Elements where whitespace is significant
##            if node1.tagName in ['SCRIPT', 'STYLE', 'PRE', 'TEXTAREA']:
##                ignoreWhitespace = 0


##            if isHtml:
##                # HTML DOMs should force upper case already
##                # FIXME: PyXML 0.7 changed HTML attributes to be NS so they
##                # are no longer forced to uppercase!
##                if attr1.name.upper() != attr2.name.upper():
##                    return __ReportError(attr1, attr2, 'name')


##        if ignoreWhitespace or isHtml:
##            if not XmlStrStrip(text1):
##                text1 = None
##            if not XmlStrStrip(text2):
##                text2 = None
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.