"""This script duplicates adjacency_list.py, but optimizes the loading
of XML nodes to be based on a "flattened" datamodel. Any number of XML documents,
each of arbitrary complexity, can be loaded in their entirety via a single query
which joins on only three tables.
"""
################################# PART I - Imports/Configuration ###########################################
from sqlalchemy import (MetaData,Table,Column,Integer,String,ForeignKey
Unicode, and_)
from sqlalchemy.orm import mapper,relationship,create_session,lazyload
import sys, os, StringIO, re
from xml.etree import ElementTree
meta = MetaData()
meta.bind = 'sqlite://'
################################# PART II - Table Metadata ###########################################
# stores a top level record of an XML document.
documents = Table('documents', meta,
Column('document_id', Integer, primary_key=True),
Column('filename', String(30), unique=True),
)
# stores XML nodes in an adjacency list model. This corresponds to
# Element and SubElement objects.
elements = Table('elements', meta,
Column('element_id', Integer, primary_key=True),
Column('parent_id', Integer, ForeignKey('elements.element_id')),
Column('document_id', Integer, ForeignKey('documents.document_id')),
Column('tag', Unicode(30), nullable=False),
Column('text', Unicode),
Column('tail', Unicode)
)
# stores attributes. This corresponds to the dictionary of attributes
# stored by an Element or SubElement.
attributes = Table('attributes', meta,
Column('element_id', Integer, ForeignKey('elements.element_id'), primary_key=True),
Column('name', Unicode(100), nullable=False, primary_key=True),
Column('value', Unicode(255)))
meta.create_all()
#################################### PART III - Model #############################################
# our document class. contains a string name,
# and the ElementTree root element.
class Document(object):
def __init__(self, name, element):
self.filename = name
self.element = element
def __str__(self):
buf = StringIO.StringIO()
self.element.write(buf)
return buf.getvalue()
#################################### PART IV - Persistence Mapping ###################################
# Node class. a non-public class which will represent
# the DB-persisted Element/SubElement object. We cannot create mappers for
# ElementTree elements directly because they are at the very least not new-style
# classes, and also may be backed by native implementations.
# so here we construct an adapter.
class _Node(object):
pass
# Attribute class. also internal, this will represent the key/value attributes stored for
# a particular Node.
class _Attribute(object):
def __init__(self, name, value):
self.name = name
self.value = value
# setup mappers. Document will eagerly load a list of _Node objects.
# they will be ordered in primary key/insert order, so that we can reconstruct
# an ElementTree structure from the list.
mapper(Document, documents, properties={
'_nodes':relationship(_Node, lazy='joined', cascade="all, delete-orphan")
})
# the _Node objects change the way they load so that a list of _Nodes will organize
# themselves hierarchically using the ElementTreeMarshal. this depends on the ordering of
# nodes being hierarchical as well; relationship() always applies at least ROWID/primary key
# ordering to rows which will suffice.
mapper(_Node, elements, properties={
'children':relationship(_Node, lazy=None), # doesnt load; used only for the save relationship
'attributes':relationship(_Attribute, lazy='joined', cascade="all, delete-orphan"), # eagerly load attributes
})
mapper(_Attribute, attributes)
# define marshalling functions that convert from _Node/_Attribute to/from ElementTree objects.
# this will set the ElementTree element as "document._element", and append the root _Node
# object to the "_nodes" mapped collection.
class ElementTreeMarshal(object):
def __get__(self, document, owner):
if document is None:
return self
if hasattr(document, '_element'):
return document._element
nodes = {}
root = None
for node in document._nodes:
if node.parent_id is not None:
parent = nodes[node.parent_id]
elem = ElementTree.SubElement(parent, node.tag)
nodes[node.element_id] = elem
else:
parent = None
elem = root = ElementTree.Element(node.tag)
nodes[node.element_id] = root
for attr in node.attributes:
elem.attrib[attr.name] = attr.value
elem.text = node.text
elem.tail = node.tail
document._element = ElementTree.ElementTree(root)
return document._element
def __set__(self, document, element):
def traverse(node):
n = _Node()
n.tag = unicode(node.tag)
n.text = unicode(node.text)
n.tail = unicode(node.tail)
document._nodes.append(n)
n.children = [traverse(n2) for n2 in node]
n.attributes = [_Attribute(unicode(k), unicode(v)) for k, v in node.attrib.iteritems()]
return n
traverse(element.getroot())
document._element = element
def __delete__(self, document):
del document._element
document._nodes = []
# override Document's "element" attribute with the marshaller.
Document.element = ElementTreeMarshal()
########################################### PART V - Basic Persistence Example ############################
line = "\n--------------------------------------------------------"
# save to DB
session = create_session()
# get ElementTree documents
for file in ('test.xml', 'test2.xml', 'test3.xml'):
filename = os.path.join(os.path.dirname(__file__), file)
doc = ElementTree.parse(filename)
session.add(Document(file, doc))
print "\nSaving three documents...", line
session.flush()
print "Done."
# clear session (to illustrate a full load), restore
session.expunge_all()
print "\nFull text of document 'text.xml':", line
document = session.query(Document).filter_by(filename="test.xml").first()
print document
############################################ PART VI - Searching for Paths #######################################
# manually search for a document which contains "/somefile/header/field1:hi"
print "\nManual search for /somefile/header/field1=='hi':", line
d = session.query(Document).join('_nodes', aliased=True).filter(and_(_Node.parent_id==None, _Node.tag==u'somefile')).\
join('children', aliased=True, from_joinpoint=True).filter(_Node.tag==u'header').\
join('children', aliased=True, from_joinpoint=True).filter(and_(_Node.tag==u'field1', _Node.text==u'hi')).\
one()
print d
# generalize the above approach into an extremely impoverished xpath function:
def find_document(path, compareto):
j = documents
prev_elements = None
query = session.query(Document)
first = True
for i, match in enumerate(re.finditer(r'/([\w_]+)(?:\[@([\w_]+)(?:=(.*))?\])?', path)):
(token, attrname, attrvalue) = match.group(1, 2, 3)
if first:
query = query.join('_nodes', aliased=True).filter(_Node.parent_id==None)
first = False
else:
query = query.join('children', aliased=True, from_joinpoint=True)
query = query.filter(_Node.tag==token)
if attrname:
query = query.join('attributes', aliased=True, from_joinpoint=True)
if attrvalue:
query = query.filter(and_(_Attribute.name==attrname, _Attribute.value==attrvalue))
else:
query = query.filter(_Attribute.name==attrname)
return query.options(lazyload('_nodes')).filter(_Node.text==compareto).all()
for path, compareto in (
(u'/somefile/header/field1', u'hi'),
(u'/somefile/field1', u'hi'),
(u'/somefile/header/field2', u'there'),
(u'/somefile/header/field2[@attr=foo]', u'there')
):
print "\nDocuments containing '%s=%s':" % (path, compareto), line
print [d.filename for d in find_document(path, compareto)]
|