"""A parser for XML, using the derived class as static DTD."""
# Author: Sjoerd Mullender.
# Emit a deprecation notice when the module is imported.
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.", DeprecationWarning)
class Error(RuntimeError):
    """Exception raised by the XML parser in this module."""
# Regular expressions used for parsing
# Pattern fragments (plain strings) reused to build the compiled patterns.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string

# Any character other than tab, CR, LF, ' '..'~' (\040-\176) and \240-\377.
illegal = re.compile('[^\t\r\n -\176\240-\377]')    # illegal chars in content

# One of the characters ']', '&' or '<'.
interesting = re.compile('[]&<]')

# A named, decimal or hexadecimal reference, plus the single character
# that follows it (anything that cannot continue an XML name).
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')

# A named entity reference; group 'name' holds the entity name.  The match
# also consumes the one character following the name.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')

# A character reference; group 'char' holds the digits (with a leading 'x'
# for the hexadecimal form) *including* the one terminating character.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')

# White space running up to the end of the string.
space = re.compile(_S + '$')

# A single line feed.
newline = re.compile('\n')
_S
+ '(?P<name>' + _Name
+ ')'
'(' + _opS
+ '=' + _opS
+
'(?P<value>'+_QStr
+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
# '<' immediately followed by an XML name: the opening of a start tag.
starttagopen = re.compile('<' + _Name)

# Optional white space, an optional '/' (group 'slash') and the closing '>'.
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch
= re
.compile('<(?P<tagname>'+_Name
+')'
'(?P<attrs>(?:'+attrfind
.pattern
+')*)'+
# The two characters that open an end tag.
endtagopen = re.compile('</')

# Optional white space followed by '>'.
endbracket = re.compile(_opS + '>')

# Everything up to and including the next '>' that is not inside a
# quoted string.
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')

# A bare XML name.
tagfind = re.compile(_Name)
# Literal opening delimiter of a CDATA section: '<![CDATA['.
cdataopen = re.compile(r'<!\[CDATA\[')

# Literal closing delimiter of a CDATA section: ']]>'.
cdataclose = re.compile(r'\]\]>')
# _ExternalId matches one of the following:
#       SYSTEM SystemLiteral
#       PUBLIC PubidLiteral SystemLiteral
# _SystemLiteral and _PublicLiteral are templates: the '%s' is filled in
# with the name of the named group that captures the literal.
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = ('(?:SYSTEM|'
               'PUBLIC' + _S + _PublicLiteral % 'pubid' +
               ')' + _S + _SystemLiteral % 'syslit')

# '<!DOCTYPE', the document type name (group 'name'), an optional external
# id (groups 'pubid' and 'syslit') and optional trailing white space.
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
                     '(?:' + _S + _ExternalId + ')?' + _opS)
xmldecl
= re
.compile('<\?xml'+_S
+
'version'+_opS
+'='+_opS
+'(?P<version>'+_QStr
+')'+
'(?:'+_S
+'encoding'+_opS
+'='+_opS
+
"(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
'"[A-Za-z][-A-Za-z0-9._]*"))?'
'(?:'+_S
+'standalone'+_opS
+'='+_opS
+
'(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
# '<?' followed by the target name (group 'proc') and optional white space.
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)

# Optional white space followed by '?>'.
procclose = re.compile(_opS + r'\?>')
# Comment delimiters, and the '--' sequence that is searched for between them.
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
attrtrans
= string
.maketrans(' \r\n\t', ' ')
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"

# A complete string that is exactly one colon-free name.
ncname = re.compile(_NCName + '$')

# A possibly prefixed name: groups 'prefix' (None when absent) and 'local'.
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?'  # optional prefix
                   '(?P<local>' + _NCName + ')$')

# An 'xmlns' or 'xmlns:<ncname>' attribute name; group 'ncname' is None
# for the bare 'xmlns' form.
xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively. The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).
attributes
= {} # default, to be overridden
elements
= {} # default, to be overridden
# parsing options, settable using keyword args in __init__
__accept_unquoted_attributes
= 0
__accept_missing_endtag_name
= 0
__translate_attribute_references
= 1
# Interface -- initialize and reset this instance
def __init__(self
, **kw
):
if 'accept_unquoted_attributes' in kw
:
self
.__accept
_unquoted
_attributes
= kw
['accept_unquoted_attributes']
if 'accept_missing_endtag_name' in kw
:
self
.__accept
_missing
_endtag
_name
= kw
['accept_missing_endtag_name']
self
.__map
_case
= kw
['map_case']
self
.__accept
_utf
8 = kw
['accept_utf8']
if 'translate_attribute_references' in kw
:
self
.__translate
_attribute
_references
= kw
['translate_attribute_references']
self
.__fixdict
(self
.__dict
__)
self
.__fixclass
(self
.__class
__)
def __fixclass(self
, kl
):
self
.__fixdict
(kl
.__dict
__)
def __fixdict(self
, dict):
start
, end
= self
.elements
.get(tag
, (None, None))
self
.elements
[tag
] = getattr(self
, key
), end
start
, end
= self
.elements
.get(tag
, (None, None))
self
.elements
[tag
] = start
, getattr(self
, key
)
# Interface -- reset this instance. Loses all unprocessed data
self
.__seen
_doctype
= None
self
.__use
_namespaces
= 0
self
.__namespaces
= {'xml':None} # xml is implicitly declared
# backward compatibility hack: if elements not overridden,
if self
.elements
is XMLParser
.elements
:
# For derived classes only -- enter literal mode (CDATA) till EOF
self
.nomoretags
= self
.literal
= 1
# For derived classes only -- enter literal mode (CDATA)
def setliteral(self
, *args
):
# Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you
# want (may include '\n'). (This just saves the text, all the
# processing is done by goahead().)
self
.rawdata
= self
.rawdata
+ data
# Interface -- handle the remaining data
# remove self.elements so that we don't leak
# Interface -- translate references
def translate_references(self
, data
, all
= 1):
if not self
.__translate
_attribute
_references
:
res
= amp
.search(data
, i
)
self
.syntax_error("bogus `&'")
str = chr(int(str[2:], 16))
self
.syntax_error("`;' missing after char reference")
if str in self
.entitydefs
:
str = self
.entitydefs
[str]
self
.syntax_error("bogus `&'")
i
= s
+ 1 # just past the &
self
.syntax_error("reference to unknown entity `&%s;'" % str)
self
.syntax_error("bogus `&'")
i
= s
+ 1 # just past the &
# when we get here, str contains the translated text and i points
# to the end of the string that is to be replaced
data
= data
[:s
] + str + data
[i
:]
# Interface - return a dictionary of all namespaces currently valid
for t
, d
, nst
in self
.stack
:
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
self
.lineno
= self
.lineno
+ data
.count('\n')
res
= interesting
.search(rawdata
, i
)
if self
.__at
_start
and space
.match(data
) is None:
self
.syntax_error('illegal data at start of file')
if not self
.stack
and space
.match(data
) is None:
self
.syntax_error('data not in content')
if not self
.__accept
_utf
8 and illegal
.search(data
):
self
.syntax_error('illegal character in content')
self
.lineno
= self
.lineno
+ data
.count('\n')
if starttagopen
.match(rawdata
, i
):
self
.lineno
= self
.lineno
+ data
.count('\n')
k
= self
.parse_starttag(i
)
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
if endtagopen
.match(rawdata
, i
):
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
if commentopen
.match(rawdata
, i
):
self
.lineno
= self
.lineno
+ data
.count('\n')
k
= self
.parse_comment(i
)
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
if cdataopen
.match(rawdata
, i
):
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
res
= xmldecl
.match(rawdata
, i
)
self
.syntax_error("<?xml?> declaration not at start of document")
version
, encoding
, standalone
= res
.group('version',
if version
[1:-1] != '1.0':
raise Error('only XML version 1.0 supported')
if encoding
: encoding
= encoding
[1:-1]
if standalone
: standalone
= standalone
[1:-1]
self
.handle_xml(encoding
, standalone
)
res
= procopen
.match(rawdata
, i
)
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
res
= doctype
.match(rawdata
, i
)
self
.lineno
= self
.lineno
+ data
.count('\n')
self
.syntax_error('multiple DOCTYPE elements')
self
.syntax_error('DOCTYPE not at beginning of document')
k
= self
.parse_doctype(res
)
self
.__seen
_doctype
= res
.group('name')
self
.__seen
_doctype
= self
.__seen
_doctype
.lower()
self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
res
= charref
.match(rawdata
, i
)
self
.syntax_error("`;' missing in charref")
self
.syntax_error('data not in content')
self
.handle_charref(res
.group('char')[:-1])
self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
res
= entityref
.match(rawdata
, i
)
self
.syntax_error("`;' missing in entityref")
if name
in self
.entitydefs
:
self
.rawdata
= rawdata
= rawdata
[:res
.start(0)] + self
.entitydefs
[name
] + rawdata
[i
:]
self
.unknown_entityref(name
)
self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
if cdataclose
.match(rawdata
, i
):
self
.syntax_error("bogus `]]>'")
self
.handle_data(rawdata
[i
])
raise Error('neither < nor & ??')
# We get here only if incomplete matches but
self
.syntax_error("bogus `%s'" % data
)
if not self
.__accept
_utf
8 and illegal
.search(data
):
self
.syntax_error('illegal character in content')
self
.lineno
= self
.lineno
+ data
.count('\n')
self
.rawdata
= rawdata
[i
+1:]
self
.rawdata
= rawdata
[i
:]
if not self
.__seen
_starttag
:
self
.syntax_error('no elements in file')
self
.syntax_error('missing end tags')
self
.finish_endtag(self
.stack
[-1][0])
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self
, i
):
if rawdata
[i
:i
+4] != '<!--':
raise Error('unexpected call to handle_comment')
res
= commentclose
.search(rawdata
, i
+4)
if doubledash
.search(rawdata
, i
+4, res
.start(0)):
self
.syntax_error("`--' inside comment")
if rawdata
[res
.start(0)-1] == '-':
self
.syntax_error('comment cannot end in three dashes')
if not self
.__accept
_utf
8 and \
illegal
.search(rawdata
, i
+4, res
.start(0)):
self
.syntax_error('illegal character in comment')
self
.handle_comment(rawdata
[i
+4: res
.start(0)])
# Internal -- handle DOCTYPE tag, return length or -1 if not terminated
def parse_doctype(self
, res
):
pubid
, syslit
= res
.group('pubid', 'syslit')
pubid
= pubid
[1:-1] # remove quotes
pubid
= ' '.join(pubid
.split()) # normalize
if syslit
is not None: syslit
= syslit
[1:-1] # remove quotes
elif not dq
and c
== "'":
elif level
<= 0 and c
== ']':
res
= endbracket
.match(rawdata
, k
+1)
self
.handle_doctype(name
, pubid
, syslit
, rawdata
[j
+1:k
])
self
.syntax_error("bogus `>' in DOCTYPE")
res
= endbracketfind
.match(rawdata
, k
)
if endbracket
.match(rawdata
, k
) is None:
self
.syntax_error('garbage in DOCTYPE')
self
.handle_doctype(name
, pubid
, syslit
, None)
# Internal -- handle CDATA tag, return length or -1 if not terminated
def parse_cdata(self
, i
):
if rawdata
[i
:i
+9] != '<![CDATA[':
raise Error('unexpected call to parse_cdata')
res
= cdataclose
.search(rawdata
, i
+9)
if not self
.__accept
_utf
8 and \
illegal
.search(rawdata
, i
+9, res
.start(0)):
self
.syntax_error('illegal character in CDATA')
self
.syntax_error('CDATA not in content')
self
.handle_cdata(rawdata
[i
+9:res
.start(0)])
__xml_namespace_attributes
= {'ns':None, 'src':None, 'prefix':None}
# Internal -- handle a processing instruction tag
end
= procclose
.search(rawdata
, i
)
if not self
.__accept
_utf
8 and illegal
.search(rawdata
, i
+2, j
):
self
.syntax_error('illegal character in processing instruction')
res
= tagfind
.match(rawdata
, i
+2)
raise Error('unexpected call to parse_proc')
if name
== 'xml:namespace':
self
.syntax_error('old-fashioned namespace declaration')
self
.__use
_namespaces
= -1
# this must come after the <?xml?> declaration (if any)
# and before the <!DOCTYPE> (if any).
if self
.__seen
_doctype
or self
.__seen
_starttag
:
self
.syntax_error('xml:namespace declaration too late in document')
attrdict
, namespace
, k
= self
.parse_attributes(name
, k
, j
)
self
.syntax_error('namespace declaration inside namespace declaration')
for attrname
in attrdict
.keys():
if not attrname
in self
.__xml
_namespace
_attributes
:
self
.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname
)
if not 'ns' in attrdict
or not 'prefix' in attrdict
:
self
.syntax_error('xml:namespace without required attributes')
prefix
= attrdict
.get('prefix')
if ncname
.match(prefix
) is None:
self
.syntax_error('xml:namespace illegal prefix value')
if prefix
in self
.__namespaces
:
self
.syntax_error('xml:namespace prefix not unique')
self
.__namespaces
[prefix
] = attrdict
['ns']
if name
.lower() == 'xml':
self
.syntax_error('illegal processing instruction target name')
self
.handle_proc(name
, rawdata
[k
:j
])
# Internal -- parse attributes between i and j
def parse_attributes(self
, tag
, i
, j
):
res
= attrfind
.match(rawdata
, i
)
attrname
, attrvalue
= res
.group('name', 'value')
attrname
= attrname
.lower()
self
.syntax_error("no value specified for attribute `%s'" % attrname
)
elif attrvalue
[:1] == "'" == attrvalue
[-1:] or \
attrvalue
[:1] == '"' == attrvalue
[-1:]:
attrvalue
= attrvalue
[1:-1]
elif not self
.__accept
_unquoted
_attributes
:
self
.syntax_error("attribute `%s' value not quoted" % attrname
)
res
= xmlns
.match(attrname
)
ncname
= res
.group('ncname')
namespace
[ncname
or ''] = attrvalue
or None
if not self
.__use
_namespaces
:
self
.__use
_namespaces
= len(self
.stack
)+1
self
.syntax_error("`<' illegal in attribute value")
self
.syntax_error("attribute `%s' specified twice" % attrname
)
attrvalue
= attrvalue
.translate(attrtrans
)
attrdict
[attrname
] = self
.translate_references(attrvalue
)
return attrdict
, namespace
, i
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self
, i
):
# i points to start of tag
end
= endbracketfind
.match(rawdata
, i
+1)
tag
= starttagmatch
.match(rawdata
, i
)
if tag
is None or tag
.end(0) != end
.end(0):
self
.syntax_error('garbage in starttag')
nstag
= tagname
= tag
.group('tagname')
nstag
= tagname
= nstag
.lower()
if not self
.__seen
_starttag
and self
.__seen
_doctype
and \
tagname
!= self
.__seen
_doctype
:
self
.syntax_error('starttag does not match DOCTYPE')
if self
.__seen
_starttag
and not self
.stack
:
self
.syntax_error('multiple elements on top level')
attrdict
, nsdict
, k
= self
.parse_attributes(tagname
, k
, j
)
self
.stack
.append((tagname
, nsdict
, nstag
))
if self
.__use
_namespaces
:
res
= qname
.match(tagname
)
prefix
, nstag
= res
.group('prefix', 'local')
for t
, d
, nst
in self
.stack
:
if ns
is None and prefix
!= '':
ns
= self
.__namespaces
.get(prefix
)
nstag
= prefix
+ ':' + nstag
# undo split
self
.stack
[-1] = tagname
, nsdict
, nstag
# translate namespace of attributes
attrnamemap
= {} # map from new name to old name (used for error reporting)
for key
in attrdict
.keys():
if self
.__use
_namespaces
:
for key
, val
in attrdict
.items():
aprefix
, key
= res
.group('prefix', 'local')
for t
, d
, nst
in self
.stack
:
ans
= self
.__namespaces
.get(aprefix
)
key
= aprefix
+ ':' + key
attributes
= self
.attributes
.get(nstag
)
if attributes
is not None:
for key
in attrdict
.keys():
if not key
in attributes
:
self
.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap
[key
], tagname
))
for key
, val
in attributes
.items():
if val
is not None and not key
in attrdict
:
method
= self
.elements
.get(nstag
, (None, None))[0]
self
.finish_starttag(nstag
, attrdict
, method
)
if tag
.group('slash') == '/':
self
.finish_endtag(tagname
)
# Internal -- parse endtag
def parse_endtag(self
, i
):
end
= endbracketfind
.match(rawdata
, i
+1)
res
= tagfind
.match(rawdata
, i
+2)
self
.handle_data(rawdata
[i
])
if not self
.__accept
_missing
_endtag
_name
:
self
.syntax_error('no name specified in end tag')
if not self
.stack
or tag
!= self
.stack
[-1][0]:
self
.handle_data(rawdata
[i
])
if endbracket
.match(rawdata
, k
) is None:
self
.syntax_error('garbage in end tag')
# Internal -- finish processing of start tag
def finish_starttag(self
, tagname
, attrdict
, method
):
self
.handle_starttag(tagname
, method
, attrdict
)
self
.unknown_starttag(tagname
, attrdict
)
# Internal -- finish processing of end tag
def finish_endtag(self
, tag
):
self
.syntax_error('name-less end tag')
found
= len(self
.stack
) - 1
for i
in range(len(self
.stack
)):
if tag
== self
.stack
[i
][0]:
self
.syntax_error('unopened end tag')
while len(self
.stack
) > found
:
if found
< len(self
.stack
) - 1:
self
.syntax_error('missing close tag for %s' % self
.stack
[-1][2])
nstag
= self
.stack
[-1][2]
method
= self
.elements
.get(nstag
, (None, None))[1]
self
.handle_endtag(nstag
, method
)
self
.unknown_endtag(nstag
)
if self
.__use
_namespaces
== len(self
.stack
):
self
.__use
_namespaces
= 0
# Overridable -- handle xml processing instruction
def handle_xml(self
, encoding
, standalone
):
# Overridable -- handle DOCTYPE
def handle_doctype(self
, tag
, pubid
, syslit
, data
):
# Overridable -- handle start tag
def handle_starttag(self
, tag
, method
, attrs
):
# Overridable -- handle end tag
def handle_endtag(self
, tag
, method
):
# Example -- handle character reference, no need to override
def handle_charref(self
, name
):
self
.unknown_charref(name
)
self
.unknown_charref(name
)
# Definition of entities -- derived classes may override
entitydefs
= {'lt': '<', # must use charref
'amp': '&', # must use charref
# Example -- handle data, should be overridden
def handle_data(self
, data
):
# Example -- handle cdata, could be overridden
def handle_cdata(self
, data
):
# Example -- handle comment, could be overridden
def handle_comment(self
, data
):
# Example -- handle processing instructions, could be overridden
def handle_proc(self
, name
, data
):
# Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Report a syntax error by raising Error.

    The raised exception's text combines the current line number
    (self.lineno) with *message*.  Subclasses can override this method
    to deal with syntax errors in another way.
    """
    raise Error('Syntax error at line %d: %s' % (self.lineno, message))
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Default handler for an unknown start tag: do nothing."""
    return None
def unknown_endtag(self, tag):
    """Default handler for an unknown end tag: do nothing."""
    return None
def unknown_charref(self, ref):
    """Default handler for an unknown character reference: do nothing."""
    return None
def unknown_entityref(self, name):
    """Default handler for an unknown entity reference.

    Reports the reference through self.syntax_error(), naming the
    entity as `&name;' in the message.
    """
    self.syntax_error("reference to unknown entity `&%s;'" % name)
class TestXMLParser(XMLParser
):
def __init__(self
, **kw
):
XMLParser
.__init
__(self
, **kw
)
def handle_xml(self
, encoding
, standalone
):
print 'xml: encoding =',encoding
,'standalone =',standalone
def handle_doctype(self
, tag
, pubid
, syslit
, data
):
print 'DOCTYPE:',tag
, repr(data
)
def handle_data(self
, data
):
self
.testdata
= self
.testdata
+ data
if len(repr(self
.testdata
)) >= 70:
print 'data:', repr(data
)
def handle_cdata(self
, data
):
print 'cdata:', repr(data
)
def handle_proc(self
, name
, data
):
print 'processing:',name
,repr(data
)
def handle_comment(self
, data
):
r
= r
[:32] + '...' + r
[-32:]
def syntax_error(self
, message
):
print 'error at line %d:' % self
.lineno
, message
def unknown_starttag(self
, tag
, attrs
):
print 'start tag: <' + tag
+ '>'
print 'start tag: <' + tag
,
for name
, value
in attrs
.items():
print name
+ '=' + '"' + value
+ '"',
def unknown_endtag(self
, tag
):
print 'end tag: </' + tag
+ '>'
def unknown_entityref(self
, ref
):
print '*** unknown entity ref: &' + ref
+ ';'
def unknown_charref(self
, ref
):
print '*** unknown char ref: &#' + ref
+ ';'
opts
, args
= getopt
.getopt(args
, 'st')
print 'total time: %g' % (t1
-t0
)
print 'total time: %g' % (t1
-t0
)
if __name__
== '__main__':