Source code for press.parsers.collection
from xml import sax
from press.models import CollectionMetadata, PressElement
from .common import make_cnx_xpath, make_elm_tree, parse_common_properties
[docs]class CollectionXmlHandler(sax.ContentHandler):
def __init__(self, root):
self.current_node = root
self.next_node = None
[docs] def startElementNS(self, name, qname, attrs):
uri, localname = name
# TODO: we could improve this by passing in just the URI and have the
# model map it to a namespace.
self.next_node = PressElement(localname,
self._attrs_no_uri(attrs))
self.current_node.add_child(self.next_node)
self.current_node = self.next_node
[docs] def characters(self, content):
# we have to create a new node so that the hash is re-calculated,
# see docs for __hash__ method.
new_node = self.current_node.insert_text(content)
new_node.parent = self.current_node.parent
new_node.children = self.current_node.children
# the parent needs to contain the new child element for the old one
self.current_node.parent.children.pop()
self.current_node.parent.children.append(new_node)
# and finally...
self.current_node = new_node
[docs] def endElementNS(self, name, qname):
self.current_node = self.current_node.parent
def _attrs_no_uri(self, attrs):
return {name: value for (uri, name), value in attrs.items()}
[docs]def parse_collxml(input_collxml):
"""
Given a collxml document, parses the document to a python object
where collections and sub-collections (both branching points) contain
subcollections and modules (leaf nodes).
"""
tree_root = PressElement('root')
parser = sax.make_parser()
parser.setFeature(sax.handler.feature_namespaces, 1)
parser.setContentHandler(CollectionXmlHandler(tree_root))
# adds the ability to parse an object that comes from the database
if isinstance(input_collxml, memoryview):
import io
input_collxml = io.BytesIO(input_collxml)
parser.parse(input_collxml) # parses a file-like object
return tree_root