"""Facility to use the Expat parser to load a minidom instance
This avoids all the overhead of SAX and pulldom to gain performance.
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations. This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance. The latter is the cause of some fairly
# cryptic code; the main performance hacks are:
# - .character_data_handler() has an extra case in which continuing
# data is appended to an existing Text node; this can be a
# speedup since pyexpat can break up character data into multiple
# callbacks even though we set the buffer_text attribute on the
# parser. This also gives us the advantage that we don't need a
# separate normalization pass.
# - Determining that a node exists is done using an identity comparison
# with None rather than a truth test; this avoids searching for and
# calling any methods on the node object if it exists. (A rather
# nice speedup is achieved this way as well!)
from xml
.dom
import xmlbuilder
, minidom
, Node
from xml
.dom
import EMPTY_NAMESPACE
, EMPTY_PREFIX
, XMLNS_NAMESPACE
from xml
.parsers
import expat
from xml
.dom
.minidom
import _append_child
, _set_attribute_node
from xml
.dom
.NodeFilter
import NodeFilter
from xml
.dom
.minicompat
import *
TEXT_NODE
= Node
.TEXT_NODE
CDATA_SECTION_NODE
= Node
.CDATA_SECTION_NODE
DOCUMENT_NODE
= Node
.DOCUMENT_NODE
FILTER_ACCEPT
= xmlbuilder
.DOMBuilderFilter
.FILTER_ACCEPT
FILTER_REJECT
= xmlbuilder
.DOMBuilderFilter
.FILTER_REJECT
FILTER_SKIP
= xmlbuilder
.DOMBuilderFilter
.FILTER_SKIP
FILTER_INTERRUPT
= xmlbuilder
.DOMBuilderFilter
.FILTER_INTERRUPT
theDOMImplementation
= minidom
.getDOMImplementation()
# Expat typename -> TypeInfo
"CDATA": minidom
.TypeInfo(None, "cdata"),
"ENUM": minidom
.TypeInfo(None, "enumeration"),
"ENTITY": minidom
.TypeInfo(None, "entity"),
"ENTITIES": minidom
.TypeInfo(None, "entities"),
"ID": minidom
.TypeInfo(None, "id"),
"IDREF": minidom
.TypeInfo(None, "idref"),
"IDREFS": minidom
.TypeInfo(None, "idrefs"),
"NMTOKEN": minidom
.TypeInfo(None, "nmtoken"),
"NMTOKENS": minidom
.TypeInfo(None, "nmtokens"),
class ElementInfo(NewStyle
):
__slots__
= '_attr_info', '_model', 'tagName'
def __init__(self
, tagName
, model
=None):
return self
._attr
_info
, self
._model
, self
.tagName
def __setstate__(self, state):
    """Restore the instance from a pickled state tuple.

    ``state`` must unpack into exactly three items, in the order
    ``(_attr_info, _model, tagName)``.
    """
    attr_info, model, tag_name = state
    self._attr_info = attr_info
    self._model = model
    self.tagName = tag_name
def getAttributeType(self
, aname
):
for info
in self
._attr
_info
:
return _typeinfo_map
["ENUM"]
return _typeinfo_map
[info
[-2]]
def getAttributeTypeNS(self
, namespaceURI
, localName
):
def isElementContent(self
):
return type not in (expat
.model
.XML_CTYPE_ANY
,
expat
.model
.XML_CTYPE_MIXED
)
return self
._model
[0] == expat
.model
.XML_CTYPE_EMPTY
for info
in self
._attr
_info
:
def isIdNS(self, euri, ename, auri, aname):
    """Namespace-aware variant of isId().

    The attribute is looked up through isId() under the key
    ``(auri, aname)``; ``euri`` and ``ename`` are not consulted.
    """
    # not sure this is meaningful
    attribute_key = (auri, aname)
    return self.isId(attribute_key)
return builder
._intern
_setdefault
(s
, s
)
def _parse_ns_name(builder
, name
):
intern = builder
._intern
_setdefault
uri
, localname
, prefix
= parts
prefix
= intern(prefix
, prefix
)
qname
= "%s:%s" % (prefix
, localname
)
qname
= intern(qname
, qname
)
localname
= intern(localname
, localname
)
qname
= localname
= intern(localname
, localname
)
return intern(uri
, uri
), localname
, prefix
, qname
"""Document builder that uses Expat to build a ParsedXML.DOM document
def __init__(self
, options
=None):
options
= xmlbuilder
.Options()
if self
._options
.filter is not None:
self
._filter
= FilterVisibilityController(self
._options
.filter)
# This *really* doesn't do anything in this case, so
# override it with something fast & minimal.
self
._finish
_start
_element
= id
"""Create a new parser object."""
return expat
.ParserCreate()
"""Return the parser object, creating a new one if needed."""
self
._parser
= self
.createParser()
self
._intern
_setdefault
= self
._parser
.intern.setdefault
self
._parser
.buffer_text
= True
self
._parser
.ordered_attributes
= True
self
._parser
.specified_attributes
= True
self
.install(self
._parser
)
"""Free all data structures used during DOM construction."""
self
.document
= theDOMImplementation
.createDocument(
EMPTY_NAMESPACE
, None, None)
self
.curNode
= self
.document
self
._elem
_info
= self
.document
._elem
_info
def install(self, parser):
    """Install the callbacks needed to build the DOM into the parser.

    The entity, comment and CDATA-section callbacks are registered only
    when the matching attribute of ``self._options`` is true; all other
    callbacks are registered unconditionally.
    """
    # This creates circular references!
    parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    parser.StartElementHandler = self.first_element_handler
    parser.EndElementHandler = self.end_element_handler
    parser.ProcessingInstructionHandler = self.pi_handler
    if self._options.entities:
        parser.EntityDeclHandler = self.entity_decl_handler
    parser.NotationDeclHandler = self.notation_decl_handler
    if self._options.comments:
        parser.CommentHandler = self.comment_handler
    if self._options.cdata_sections:
        parser.StartCdataSectionHandler = self.start_cdata_section_handler
        parser.EndCdataSectionHandler = self.end_cdata_section_handler
        parser.CharacterDataHandler = self.character_data_handler_cdata
    else:
        # The plain handler must not overwrite the CDATA-aware one; it is
        # only the fallback when CDATA sections are not being tracked.
        parser.CharacterDataHandler = self.character_data_handler
    parser.ExternalEntityRefHandler = self.external_entity_ref_handler
    parser.XmlDeclHandler = self.xml_decl_handler
    parser.ElementDeclHandler = self.element_decl_handler
    parser.AttlistDeclHandler = self.attlist_decl_handler
def parseFile(self
, file):
"""Parse a document from a file object, returning the document
parser
= self
.getParser()
buffer = file.read(16*1024)
if first_buffer
and self
.document
.documentElement
:
self
._setup
_subset
(buffer)
def parseString(self
, string
):
"""Parse a document from a string, returning the document node."""
parser
= self
.getParser()
parser
.Parse(string
, True)
self
._setup
_subset
(string
)
def _setup_subset(self, buffer):
    """Load the internal subset if there might be one.

    Nothing happens unless the document already has a doctype node.
    Otherwise ``buffer`` is run through an InternalSubsetExtractor and
    the extracted subset is stored on ``doctype.internalSubset``.
    """
    if not self.document.doctype:
        return
    subset_reader = InternalSubsetExtractor()
    subset_reader.parseString(buffer)
    self.document.doctype.internalSubset = subset_reader.getSubset()
def start_doctype_decl_handler(self
, doctypeName
, systemId
, publicId
,
doctype
= self
.document
.implementation
.createDocumentType(
doctypeName
, publicId
, systemId
)
doctype
.ownerDocument
= self
.document
self
.document
.childNodes
.append(doctype
)
self
.document
.doctype
= doctype
if self
._filter
and self
._filter
.acceptNode(doctype
) == FILTER_REJECT
:
self
.document
.doctype
= None
del self
.document
.childNodes
[-1]
self
._parser
.EntityDeclHandler
= None
self
._parser
.NotationDeclHandler
= None
doctype
.entities
._seq
= []
doctype
.notations
._seq
= []
self
._parser
.CommentHandler
= None
self
._parser
.ProcessingInstructionHandler
= None
self
._parser
.EndDoctypeDeclHandler
= self
.end_doctype_decl_handler
def end_doctype_decl_handler(self):
    """Re-arm the parser callbacks once the doctype declaration has ended.

    The comment callback is restored only when the comments option is on;
    the processing-instruction callback is always restored.  If there is
    neither element type information nor a filter,
    ``_finish_end_element`` is replaced by the built-in ``id``.
    """
    parser = self._parser
    if self._options.comments:
        parser.CommentHandler = self.comment_handler
    parser.ProcessingInstructionHandler = self.pi_handler
    if self._elem_info or self._filter:
        return
    self._finish_end_element = id
def pi_handler(self, target, data):
    """Add a processing instruction to the current node.

    The new node is appended first and removed again if a filter is
    installed and its acceptNode() returns FILTER_REJECT.
    """
    instruction = self.document.createProcessingInstruction(target, data)
    _append_child(self.curNode, instruction)
    controller = self._filter
    if not controller:
        return
    if controller.acceptNode(instruction) == FILTER_REJECT:
        self.curNode.removeChild(instruction)
def character_data_handler_cdata(self
, data
):
childNodes
= self
.curNode
.childNodes
if ( self
._cdata
_continue
and childNodes
[-1].nodeType
== CDATA_SECTION_NODE
):
childNodes
[-1].appendData(data
)
node
= self
.document
.createCDATASection(data
)
self
._cdata
_continue
= True
elif childNodes
and childNodes
[-1].nodeType
== TEXT_NODE
:
d
['data'] = d
['nodeValue'] = value
d
['data'] = d
['nodeValue'] = data
d
['ownerDocument'] = self
.document
_append_child(self
.curNode
, node
)
def character_data_handler(self
, data
):
childNodes
= self
.curNode
.childNodes
if childNodes
and childNodes
[-1].nodeType
== TEXT_NODE
:
d
['data'] = d
['nodeValue'] = node
.data
+ data
d
['data'] = d
['nodeValue'] = node
.data
+ data
d
['ownerDocument'] = self
.document
_append_child(self
.curNode
, node
)
def entity_decl_handler(self
, entityName
, is_parameter_entity
, value
,
base
, systemId
, publicId
, notationName
):
# we don't care about parameter entities for the DOM
if not self
._options
.entities
:
node
= self
.document
._create
_entity
(entityName
, publicId
,
# node *should* be readonly, but we'll cheat
child
= self
.document
.createTextNode(value
)
node
.childNodes
.append(child
)
self
.document
.doctype
.entities
._seq
.append(node
)
if self
._filter
and self
._filter
.acceptNode(node
) == FILTER_REJECT
:
del self
.document
.doctype
.entities
._seq
[-1]
def notation_decl_handler(self, notationName, base, systemId, publicId):
    """Record a notation declaration on the document type node.

    A Notation node is created through the document's _create_notation()
    and appended to the doctype's notation list.  It is removed again
    only if a filter is installed and rejects the node.  ``base`` is part
    of the callback signature but is not used here.
    """
    node = self.document._create_notation(notationName, publicId, systemId)
    notations = self.document.doctype.notations._seq
    notations.append(node)
    # Drop the node on FILTER_REJECT (not FILTER_ACCEPT), consistent with
    # the entity, comment and processing-instruction handlers; comparing
    # against FILTER_ACCEPT discarded exactly the notations to be kept.
    if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
        del notations[-1]
def comment_handler(self, data):
    """Add a comment node holding ``data`` to the current node.

    The comment is appended first and removed again if a filter is
    installed and its acceptNode() returns FILTER_REJECT.
    """
    comment = self.document.createComment(data)
    _append_child(self.curNode, comment)
    controller = self._filter
    if not controller:
        return
    if controller.acceptNode(comment) == FILTER_REJECT:
        self.curNode.removeChild(comment)
def start_cdata_section_handler(self
):
self
._cdata
_continue
= False
def end_cdata_section_handler(self
):
self
._cdata
_continue
= False
def external_entity_ref_handler(self
, context
, base
, systemId
, publicId
):
def first_element_handler(self, name, attributes):
    """Handle the first start tag, then hand over to start_element_handler.

    If no filter is set and no element type information was collected,
    ``_finish_end_element`` is replaced by the built-in ``id``.  The
    parser's StartElementHandler is then switched to
    start_element_handler, which is also called for this first element.
    """
    nothing_to_finish = self._filter is None and not self._elem_info
    if nothing_to_finish:
        self._finish_end_element = id
    regular_handler = self.start_element_handler
    self.getParser().StartElementHandler = regular_handler
    self.start_element_handler(name, attributes)
def start_element_handler(self
, name
, attributes
):
node
= self
.document
.createElement(name
)
_append_child(self
.curNode
, node
)
for i
in range(0, len(attributes
), 2):
a
= minidom
.Attr(attributes
[i
], EMPTY_NAMESPACE
,
d
= a
.childNodes
[0].__dict
__
d
['data'] = d
['nodeValue'] = value
d
['value'] = d
['nodeValue'] = value
d
['ownerDocument'] = self
.document
_set_attribute_node(node
, a
)
if node
is not self
.document
.documentElement
:
self
._finish
_start
_element
(node
)
def _finish_start_element(self
, node
):
# To be general, we'd have to call isSameNode(), but this
# is sufficient for minidom:
if node
is self
.document
.documentElement
:
filt
= self
._filter
.startContainer(node
)
if filt
== FILTER_REJECT
:
# ignore this node & all descendents
elif filt
== FILTER_SKIP
:
# ignore this node, but make its children become
# children of the parent node
self
.curNode
= node
.parentNode
node
.parentNode
.removeChild(node
)
# If this ever changes, Namespaces.end_element_handler() needs to
def end_element_handler(self
, name
):
self
.curNode
= curNode
.parentNode
self
._finish
_end
_element
(curNode
)
def _finish_end_element(self
, curNode
):
info
= self
._elem
_info
.get(curNode
.tagName
)
self
._handle
_white
_text
_nodes
(curNode
, info
)
if curNode
is self
.document
.documentElement
:
if self
._filter
.acceptNode(curNode
) == FILTER_REJECT
:
self
.curNode
.removeChild(curNode
)
def _handle_white_text_nodes(self
, node
, info
):
if (self
._options
.whitespace_in_element_content
or not info
.isElementContent()):
# We have element type information and should remove ignorable
# whitespace; identify for text nodes which contain only
for child
in node
.childNodes
:
if child
.nodeType
== TEXT_NODE
and not child
.data
.strip():
# Remove ignorable whitespace from the tree.
def element_decl_handler(self
, name
, model
):
info
= self
._elem
_info
.get(name
)
self
._elem
_info
[name
] = ElementInfo(name
, model
)
assert info
._model
is None
def attlist_decl_handler(self
, elem
, name
, type, default
, required
):
info
= self
._elem
_info
.get(elem
)
self
._elem
_info
[elem
] = info
[None, name
, None, None, default
, 0, type, required
])
def xml_decl_handler(self
, version
, encoding
, standalone
):
self
.document
.version
= version
self
.document
.encoding
= encoding
# This is still a little ugly, thanks to the pyexpat API. ;-(
self
.document
.standalone
= True
self
.document
.standalone
= False
# Don't include FILTER_INTERRUPT, since that's checked separately
_ALLOWED_FILTER_RETURNS
= (FILTER_ACCEPT
, FILTER_REJECT
, FILTER_SKIP
)
class FilterVisibilityController(NewStyle
):
"""Wrapper around a DOMBuilderFilter which implements the checks
to make the whatToShow filter attribute work."""
def __init__(self
, filter):
def startContainer(self
, node
):
mask
= self
._nodetype
_mask
[node
.nodeType
]
if self
.filter.whatToShow
& mask
:
val
= self
.filter.startContainer(node
)
if val
== FILTER_INTERRUPT
:
if val
not in _ALLOWED_FILTER_RETURNS
:
"startContainer() returned illegal value: " + repr(val
)
def acceptNode(self
, node
):
mask
= self
._nodetype
_mask
[node
.nodeType
]
if self
.filter.whatToShow
& mask
:
val
= self
.filter.acceptNode(node
)
if val
== FILTER_INTERRUPT
:
# move all child nodes to the parent, and remove this node
for child
in node
.childNodes
[:]:
parent
.appendChild(child
)
# node is handled by the caller
if val
not in _ALLOWED_FILTER_RETURNS
:
"acceptNode() returned illegal value: " + repr(val
)
Node
.ELEMENT_NODE
: NodeFilter
.SHOW_ELEMENT
,
Node
.ATTRIBUTE_NODE
: NodeFilter
.SHOW_ATTRIBUTE
,
Node
.TEXT_NODE
: NodeFilter
.SHOW_TEXT
,
Node
.CDATA_SECTION_NODE
: NodeFilter
.SHOW_CDATA_SECTION
,
Node
.ENTITY_REFERENCE_NODE
: NodeFilter
.SHOW_ENTITY_REFERENCE
,
Node
.ENTITY_NODE
: NodeFilter
.SHOW_ENTITY
,
Node
.PROCESSING_INSTRUCTION_NODE
: NodeFilter
.SHOW_PROCESSING_INSTRUCTION
,
Node
.COMMENT_NODE
: NodeFilter
.SHOW_COMMENT
,
Node
.DOCUMENT_NODE
: NodeFilter
.SHOW_DOCUMENT
,
Node
.DOCUMENT_TYPE_NODE
: NodeFilter
.SHOW_DOCUMENT_TYPE
,
Node
.DOCUMENT_FRAGMENT_NODE
: NodeFilter
.SHOW_DOCUMENT_FRAGMENT
,
Node
.NOTATION_NODE
: NodeFilter
.SHOW_NOTATION
,
class FilterCrutch(NewStyle
):
__slots__
= '_builder', '_level', '_old_start', '_old_end'
def __init__(self
, builder
):
self
._old
_start
= parser
.StartElementHandler
self
._old
_end
= parser
.EndElementHandler
parser
.StartElementHandler
= self
.start_element_handler
parser
.EndElementHandler
= self
.end_element_handler
class Rejecter(FilterCrutch
):
def __init__(self
, builder
):
FilterCrutch
.__init
__(self
, builder
)
for name
in ("ProcessingInstructionHandler",
"StartCdataSectionHandler",
"EndCdataSectionHandler",
"ExternalEntityRefHandler",
setattr(parser
, name
, None)
def start_element_handler(self, *args):
    """Note that one more nested element has been opened.

    The callback arguments are ignored; only the ``_level`` counter is
    incremented.
    """
    self._level += 1
def end_element_handler(self
, *args
):
# restore the old handlers
parser
= self
._builder
._parser
self
._builder
.install(parser
)
parser
.StartElementHandler
= self
._old
_start
parser
.EndElementHandler
= self
._old
_end
self
._level
= self
._level
- 1
class Skipper(FilterCrutch
):
def start_element_handler(self
, *args
):
node
= self
._builder
.curNode
if self
._builder
.curNode
is not node
:
self
._level
= self
._level
+ 1
def end_element_handler(self
, *args
):
# We're popping back out of the node we're skipping, so we
# shouldn't need to do anything but reset the handlers.
self
._builder
._parser
.StartElementHandler
= self
._old
_start
self
._builder
._parser
.EndElementHandler
= self
._old
_end
self
._level
= self
._level
- 1
# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID
= \
"http://xml.python.org/entities/fragment-builder/internal"
_FRAGMENT_BUILDER_TEMPLATE
= (
<!ENTITY fragment-builder-internal
>&fragment-builder-internal;</wrapper>'''
% _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID
)
class FragmentBuilder(ExpatBuilder
):
"""Builder which constructs document fragments given XML source
The context node is expected to provide information about the
namespace declarations which are in scope at the start of the
def __init__(self
, context
, options
=None):
if context
.nodeType
== DOCUMENT_NODE
:
self
.originalDocument
= context
self
.originalDocument
= context
.ownerDocument
ExpatBuilder
.__init
__(self
, options
)
def parseFile(self, file):
    """Parse a document fragment from a file object.

    The whole file is read in one go and passed to parseString(), whose
    result is returned.
    """
    source = file.read()
    return self.parseString(source)
def parseString(self
, string
):
"""Parse a document fragment from a string, returning the
parser
= self
.getParser()
doctype
= self
.originalDocument
.doctype
subset
= doctype
.internalSubset
or self
._getDeclarations
()
ident
= ('PUBLIC "%s" "%s"'
% (doctype
.publicId
, doctype
.systemId
))
ident
= 'SYSTEM "%s"' % doctype
.systemId
nsattrs
= self
._getNSattrs
() # get ns decls from node's ancestors
document
= _FRAGMENT_BUILDER_TEMPLATE
% (ident
, subset
, nsattrs
)
parser
.Parse(document
, 1)
def _getDeclarations(self
):
"""Re-create the internal subset from the DocumentType node.
This is only needed if we don't already have the
internalSubset as a string.
doctype
= self
.context
.ownerDocument
.doctype
for i
in range(doctype
.notations
.length
):
notation
= doctype
.notations
.item(i
)
s
= "%s<!NOTATION %s" % (s
, notation
.nodeName
)
s
= '%s PUBLIC "%s"\n "%s">' \
% (s
, notation
.publicId
, notation
.systemId
)
s
= '%s SYSTEM "%s">' % (s
, notation
.systemId
)
for i
in range(doctype
.entities
.length
):
entity
= doctype
.entities
.item(i
)
s
= "%s<!ENTITY %s" % (s
, entity
.nodeName
)
s
= '%s PUBLIC "%s"\n "%s"' \
% (s
, entity
.publicId
, entity
.systemId
)
s
= '%s SYSTEM "%s"' % (s
, entity
.systemId
)
s
= '%s "%s"' % (s
, entity
.firstChild
.data
)
s
= "%s NOTATION %s" % (s
, entity
.notationName
)
def external_entity_ref_handler(self
, context
, base
, systemId
, publicId
):
if systemId
== _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID
:
# this entref is the one that we made to put the subtree
# in; all of our given input is parsed in here.
old_document
= self
.document
old_cur_node
= self
.curNode
parser
= self
._parser
.ExternalEntityParserCreate(context
)
# put the real document back, parse into the fragment to return
self
.document
= self
.originalDocument
self
.fragment
= self
.document
.createDocumentFragment()
self
.curNode
= self
.fragment
parser
.Parse(self
._source
, 1)
self
.curNode
= old_cur_node
self
.document
= old_document
return ExpatBuilder
.external_entity_ref_handler(
self
, context
, base
, systemId
, publicId
)
"""Mix-in class for builders; adds support for namespaces."""
def _initNamespaces(self):
    """Start with an empty list of pending namespace declarations."""
    # Holds (prefix, uri) pairs.  Namespace attrs are built from these
    # and added to the element's attrs.
    self._ns_ordered_prefixes = list()
"""Create a new namespace-handling parser."""
parser
= expat
.ParserCreate(namespace_separator
=" ")
parser
.namespace_prefixes
= True
def install(self, parser):
    """Insert the namespace-handlers onto the parser.

    All ExpatBuilder callbacks are installed first; the namespace
    declaration callback is added only when the
    ``namespace_declarations`` option is true.
    """
    ExpatBuilder.install(self, parser)
    if not self._options.namespace_declarations:
        return
    parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler
def start_namespace_decl_handler(self, prefix, uri):
    """Push this namespace declaration on our storage.

    The declaration is kept as a ``(prefix, uri)`` pair at the end of
    ``_ns_ordered_prefixes``.
    """
    declaration = (prefix, uri)
    self._ns_ordered_prefixes.append(declaration)
def start_element_handler(self
, name
, attributes
):
uri
, localname
, prefix
, qname
= _parse_ns_name(self
, name
)
node
= minidom
.Element(qname
, uri
, prefix
, localname
)
node
.ownerDocument
= self
.document
_append_child(self
.curNode
, node
)
if self
._ns
_ordered
_prefixes
:
for prefix
, uri
in self
._ns
_ordered
_prefixes
:
a
= minidom
.Attr(_intern(self
, 'xmlns:' + prefix
),
XMLNS_NAMESPACE
, prefix
, "xmlns")
a
= minidom
.Attr("xmlns", XMLNS_NAMESPACE
,
d
= a
.childNodes
[0].__dict
__
d
['data'] = d
['nodeValue'] = uri
d
['value'] = d
['nodeValue'] = uri
d
['ownerDocument'] = self
.document
_set_attribute_node(node
, a
)
del self
._ns
_ordered
_prefixes
[:]
for i
in range(0, len(attributes
), 2):
uri
, localname
, prefix
, qname
= _parse_ns_name(self
, aname
)
a
= minidom
.Attr(qname
, uri
, localname
, prefix
)
_attrsNS
[(uri
, localname
)] = a
a
= minidom
.Attr(aname
, EMPTY_NAMESPACE
,
_attrsNS
[(EMPTY_NAMESPACE
, aname
)] = a
d
= a
.childNodes
[0].__dict
__
d
['data'] = d
['nodeValue'] = value
d
['ownerDocument'] = self
.document
d
['value'] = d
['nodeValue'] = value
# This only adds some asserts to the original
# end_element_handler(), so we only define this when -O is not
# used. If changing one, be sure to check the other to see if
# it needs to be changed as well.
def end_element_handler(self
, name
):
uri
, localname
, prefix
, qname
= _parse_ns_name(self
, name
)
assert (curNode
.namespaceURI
== uri
and curNode
.localName
== localname
and curNode
.prefix
== prefix
), \
"element stack messed up! (namespace)"
assert curNode
.nodeName
== name
, \
"element stack messed up - bad nodeName"
assert curNode
.namespaceURI
== EMPTY_NAMESPACE
, \
"element stack messed up - bad namespaceURI"
self
.curNode
= curNode
.parentNode
self
._finish
_end
_element
(curNode
)
class ExpatBuilderNS(Namespaces
, ExpatBuilder
):
"""Document builder that supports namespaces."""
class FragmentBuilderNS(Namespaces
, FragmentBuilder
):
"""Fragment builder that supports namespaces."""
FragmentBuilder
.reset(self
)
"""Return string of namespace attributes from this element and
# XXX This needs to be re-written to walk the ancestors of the
# context to build up the namespace information from
# declarations, elements, and attributes found in context.
# Otherwise we have to store a bunch more data on the DOM
# (though that *might* be more reliable -- not clear).
if hasattr(context
, '_ns_prefix_uri'):
for prefix
, uri
in context
._ns
_prefix
_uri
.items():
# add every new NS decl from context to L and attrs string
declname
= "xmlns:" + prefix
attrs
= "%s\n %s='%s'" % (attrs
, declname
, uri
)
attrs
= " %s='%s'" % (declname
, uri
)
context
= context
.parentNode
class ParseEscape(Exception):
"""Exception raised to short-circuit parsing in InternalSubsetExtractor."""
class InternalSubsetExtractor(ExpatBuilder
):
"""XML processor which can rip out the internal document type subset."""
"""Return the internal subset as a string."""
def parseFile(self
, file):
ExpatBuilder
.parseFile(self
, file)
def parseString(self
, string
):
ExpatBuilder
.parseString(self
, string
)
def install(self, parser):
    """Register the only two callbacks this builder sets on the parser.

    Only the doctype-start and element-start callbacks are installed.
    """
    on_doctype = self.start_doctype_decl_handler
    parser.StartDoctypeDeclHandler = on_doctype
    on_element = self.start_element_handler
    parser.StartElementHandler = on_element
def start_doctype_decl_handler(self
, name
, publicId
, systemId
,
parser
= self
.getParser()
parser
.DefaultHandler
= self
.subset
.append
parser
.EndDoctypeDeclHandler
= self
.end_doctype_decl_handler
def end_doctype_decl_handler(self
):
s
= ''.join(self
.subset
).replace('\r\n', '\n').replace('\r', '\n')
def start_element_handler(self
, name
, attrs
):
def parse(file, namespaces=1):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.  A file name
    is opened in binary mode and closed again before returning; an open
    file object is left open.  When 'namespaces' is true (the default) a
    namespace-aware builder is used, otherwise a plain one.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()

    if isinstance(file, StringTypes):
        # We opened it, so we close it -- even if parsing fails.
        with open(file, 'rb') as fp:
            return builder.parseFile(fp)
    return builder.parseFile(file)
def parseString(string, namespaces=1):
    """Parse a document from a string, returning the resulting
    Document node.

    When 'namespaces' is true (the default) a namespace-aware builder is
    used, otherwise a plain one.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()
    return builder.parseString(string)
def parseFragment(file, context, namespaces=1):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.  A file name
    is opened in binary mode and closed again before returning.  When
    'namespaces' is true (the default) a namespace-aware fragment builder
    is used, otherwise a plain one.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if isinstance(file, StringTypes):
        # We opened it, so we close it -- even if parsing fails.
        with open(file, 'rb') as fp:
            return builder.parseFile(fp)
    return builder.parseFile(file)
def parseFragmentString(string, context, namespaces=1):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.

    When 'namespaces' is true (the default) a namespace-aware fragment
    builder is used, otherwise a plain one.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)
    return builder.parseString(string)
def makeBuilder(options):
    """Create a builder based on an Options object.

    A namespace-aware builder is returned when ``options.namespaces`` is
    true, otherwise a plain ExpatBuilder.
    """
    if options.namespaces:
        return ExpatBuilderNS(options)
    return ExpatBuilder(options)