"""Shared support for scanning document type declarations in HTML and XHTML.
This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well). It has no documented
public API and should not be used directly.
"""
import re

# Matches a declaration name token followed by optional whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches one single- or double-quoted string literal followed by optional
# whitespace; group 1 is the literal including its quotes.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Locates the "--", optional whitespace, ">" sequence that ends a comment.
_commentclose = re.compile(r'--\s*>')
# Locates the "]]>" terminator of a standard marked section (whitespace is
# tolerated between the three characters).
_markedsectionclose = re.compile(r']\s*]\s*>')
# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
# Locates the shorter "]>" terminator used by the MS-Word extensions.
_msmarkedsectionclose = re.compile(r']\s*>')
"""Parser base class which provides some common support methods used
by the SGML/HTML and XHTML parsers."""
# NOTE(review): the enclosing `class` statement, the method header that owns
# this guard, and the opening of the call that wraps the message string below
# are not visible in this view -- confirm against the full file before relying
# on this fragment.
# The condition is true only when the instance's class is exactly ParserBase,
# i.e. the base class was instantiated directly instead of through a subclass.
if self.__class__ is ParserBase:
"markupbase
.ParserBase must be subclassed
")
def error(self, message):
    """Report a parse error; concrete parsers must supply this method.

    Args:
        message: Description of the problem. The base implementation does
            not use it.

    Raises:
        NotImplementedError: Always, because the base class has no error
            policy of its own.
    """
    raise NotImplementedError(
        "subclasses of ParserBase must override error()")
"""Return current line number and offset."""
# NOTE(review): the `def` line that owns the docstring above and this return
# is not visible in this view -- confirm the method name and signature in the
# full file.
# Hands back the two tracked position attributes as a (lineno, offset) tuple.
return self.lineno, self.offset
# Internal -- update line number and offset. This should be
# called for each piece of data exactly once, in order -- in other
# words the concatenation of all the input strings to this
# function should be exactly the entire input.
def updatepos(self, i, j):
    """Advance the tracked line number and offset over ``rawdata[i:j]``.

    Args:
        i: Index in ``self.rawdata`` where the consumed span starts.
        j: Index in ``self.rawdata`` just past the consumed span.

    Returns:
        ``j``, unchanged.
    """
    if i >= j:
        # Empty (or inverted) span: nothing was consumed.
        return j
    rawdata = self.rawdata
    nlines = rawdata.count("\n", i, j)
    if nlines:
        self.lineno += nlines
        pos = rawdata.rindex("\n", i, j)  # Should not fail
        # The offset restarts right after the last newline in the span.
        self.offset = j - (pos + 1)
    else:
        # Still on the same line: just move the offset forward.
        self.offset += j - i
    return j
# Internal -- parse declaration (for use by subclasses).
def parse_declaration(self, i):
    """Parse the ``<!...>`` construct that starts at ``self.rawdata[i]``.

    Comments and marked sections are delegated to ``parse_comment`` and
    ``parse_marked_section``. Any other declaration is scanned token by
    token up to its closing ``>`` and then reported through
    ``handle_decl`` (for DOCTYPE) or ``unknown_decl`` (everything else).

    Args:
        i: Index in ``self.rawdata`` of the ``<`` that opens the construct.

    Returns:
        The index just past the construct, or a negative value when the
        buffer ends before the construct is complete.
    """
    # This is some sort of declaration; in "HTML as
    # deployed," this should only be the document type
    # declaration ("<!DOCTYPE html...>").
    # ISO 8879:1986, however, has more complex
    # declaration syntax for elements in <!...>, including:
    # --comment--
    # [marked section]
    # name in the following list: ENTITY, DOCTYPE, ELEMENT,
    # ATTLIST, NOTATION, SHORTREF, USEMAP,
    # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
    rawdata = self.rawdata
    j = i + 2
    assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
    if rawdata[j:j+1] in ("-", ""):
        # Start of comment followed by buffer boundary,
        # or just a buffer boundary.
        return -1
    # A simple, practical version could look like: ((name|stringlit) S*) + '>'
    n = len(rawdata)
    # Compare a two-character slice: a one-character slice can never equal
    # '--'. (The guard above returns first whenever the next character is
    # "-", so this branch only becomes reachable if that guard is relaxed.)
    if rawdata[j:j+2] == '--': #comment
        # Locate --.*-- as the body of the comment
        return self.parse_comment(i)
    elif rawdata[j] == '[': #marked section
        # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
        # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
        # Note that this is extended by Microsoft Office "Save as Web" function
        # to include [if...] and [endif].
        return self.parse_marked_section(i)
    else: #all other declaration elements
        decltype, j = self._scan_name(j, i)
    if j < 0:
        # The declaration name ran into the end of the buffer.
        return j
    if decltype == "doctype":
        self._decl_otherchars = ''
    while j < n:
        c = rawdata[j]
        if c == ">":
            # end of declaration syntax
            data = rawdata[i+2:j]
            if decltype == "doctype":
                self.handle_decl(data)
            else:
                self.unknown_decl(data)
            return j + 1
        if c in "\"'":
            m = _declstringlit_match(rawdata, j)
            if not m:
                return -1  # incomplete string literal
            j = m.end()
        elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
            name, j = self._scan_name(j, i)
        elif c in self._decl_otherchars:
            j = j + 1
        elif c == "[":
            # this could be handled in a separate doctype parser
            if decltype == "doctype":
                j = self._parse_doctype_subset(j + 1, i)
            elif decltype in ("attlist", "linktype", "link", "element"):
                # must tolerate []'d groups in a content model in an element declaration
                # also in data attribute specifications of attlist declaration
                # also link type declaration subsets in linktype declarations
                # also link attribute specification lists in link declarations
                self.error("unsupported '[' char in %s declaration" % decltype)
            else:
                self.error("unexpected '[' char in declaration")
        else:
            self.error(
                "unexpected %r char in declaration" % rawdata[j])
        if j < 0:
            # A nested scanner reported an incomplete buffer.
            return j
    return -1  # incomplete: buffer ended before '>'
# Internal -- parse a marked section
# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
def parse_marked_section(self, i, report=1):
    """Parse the marked section ``<![keyword ... ]]>`` starting at ``i``.

    Standard status keywords end at ``]]>``; the MS-Office keywords
    (``if``, ``else``, ``endif``) end at ``]>``.

    Args:
        i: Index in ``self.rawdata`` of the ``<`` that opens the section.
        report: When true, pass the section text (everything after ``<![``
            up to the terminator) to ``unknown_decl``.

    Returns:
        The index just past the terminator, or a negative value when the
        keyword or the terminator is not yet in the buffer.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    sectName, j = self._scan_name(i + 3, i)
    if j < 0:
        return j
    if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
        # look for standard ]]> ending
        match = _markedsectionclose.search(rawdata, i + 3)
    elif sectName in ("if", "else", "endif"):
        # look for MS Office ]> ending
        match = _msmarkedsectionclose.search(rawdata, i + 3)
    else:
        self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
    if not match:
        return -1  # terminator not in the buffer yet
    if report:
        j = match.start(0)
        self.unknown_decl(rawdata[i+3: j])
    return match.end(0)
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
    """Parse the comment ``<!-- ... -->`` starting at ``self.rawdata[i]``.

    Args:
        i: Index in ``self.rawdata`` of the ``<`` that opens the comment.
        report: When true, pass the comment text (between ``<!--`` and the
            closing ``--``) to ``handle_comment``.

    Returns:
        The index just past the closing ``>``, or -1 if the comment is not
        terminated within the buffer.
    """
    rawdata = self.rawdata
    if rawdata[i:i+4] != '<!--':
        self.error('unexpected call to parse_comment()')
    match = _commentclose.search(rawdata, i + 4)
    if not match:
        return -1  # unterminated so far
    if report:
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
    return match.end(0)
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
# returning the index just past any whitespace following the trailing ']'.
def _parse_doctype_subset(self, i, declstartpos):
    """Scan the internal subset of a DOCTYPE declaration.

    Walks the subset character by character, delegating comments to
    ``parse_comment`` and ``<!ATTLIST``/``<!ELEMENT``/``<!ENTITY``/
    ``<!NOTATION`` declarations to the matching ``_parse_doctype_*``
    method, and skipping parameter-entity references and whitespace.

    Args:
        i: Index in ``self.rawdata`` just past the ``[`` that opens the
            subset.
        declstartpos: Index where the enclosing declaration starts; used
            for position updates before an error is reported.

    Returns:
        The index of the ``>`` that follows the closing ``]`` (after any
        whitespace), or a negative value if the buffer ends first.
    """
    rawdata = self.rawdata
    n = len(rawdata)
    j = i
    while j < n:
        c = rawdata[j]
        if c == "<":
            s = rawdata[j:j+2]
            if s == "<":
                # end of buffer; incomplete
                return -1
            if s != "<!":
                self.updatepos(declstartpos, j + 1)
                self.error("unexpected char in internal subset (in %r)" % s)
            if (j + 2) == n:
                # end of buffer; incomplete
                return -1
            if (j + 4) > n:
                # end of buffer; incomplete
                return -1
            if rawdata[j:j+4] == "<!--":
                j = self.parse_comment(j, report=0)
                if j < 0:
                    return j
                continue
            name, j = self._scan_name(j + 2, declstartpos)
            if j == -1:
                return -1
            if name not in ("attlist", "element", "entity", "notation"):
                self.updatepos(declstartpos, j + 2)
                self.error(
                    "unknown declaration %r in internal subset" % name)
            # handle the individual names
            meth = getattr(self, "_parse_doctype_" + name)
            j = meth(j, declstartpos)
            if j < 0:
                return j
        elif c == "%":
            # parameter entity reference
            if (j + 1) == n:
                # end of buffer; incomplete
                return -1
            s, j = self._scan_name(j + 1, declstartpos)
            if j < 0:
                return j
            if rawdata[j] == ";":
                j = j + 1
        elif c == "]":
            j = j + 1
            while j < n and rawdata[j].isspace():
                j = j + 1
            if j < n:
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
            else:
                return -1
        elif c.isspace():
            j = j + 1
        else:
            self.updatepos(declstartpos, j)
            self.error("unexpected char %r in internal subset" % c)
    # end of buffer reached
    return -1
# Internal -- scan past <!ELEMENT declarations
def _parse_doctype_element(self, i, declstartpos):
    """Scan past an ``<!ELEMENT`` declaration in the internal subset.

    Args:
        i: Index in ``self.rawdata`` where the element name is expected.
        declstartpos: Index where the enclosing declaration starts; passed
            through to ``_scan_name``.

    Returns:
        The index just past the closing ``>``, or -1 if the name or the
        ``>`` is not yet in the buffer.
    """
    name, j = self._scan_name(i, declstartpos)
    if j == -1:
        return -1
    # style content model; just skip until '>'
    end = self.rawdata.find(">", j)
    if end < 0:
        # No '>' yet: report "incomplete" instead of letting find()'s
        # -1 turn into index 0.
        return -1
    return end + 1
# Internal -- scan past <!ATTLIST declarations
def _parse_doctype_attlist(self, i, declstartpos):
    """Scan past an ``<!ATTLIST`` declaration in the internal subset.

    Args:
        i: Index in ``self.rawdata`` where the element name is expected.
        declstartpos: Index where the enclosing declaration starts; passed
            through to ``_scan_name``.

    Returns:
        The index just past the closing ``>``, or a negative value if the
        buffer ends before the declaration is complete.
    """
    rawdata = self.rawdata
    name, j = self._scan_name(i, declstartpos)
    c = rawdata[j:j+1]
    if c == "":
        return -1
    if c == ">":
        return j + 1
    while True:
        # scan a series of attribute descriptions; simplified:
        # name type [value] [#constraint]
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == "(":
            # an enumerated type; look for ')'
            if ")" in rawdata[j:]:
                j = rawdata.find(")", j) + 1
            else:
                return -1
            while rawdata[j:j+1].isspace():
                j = j + 1
            if not rawdata[j:]:
                # end of buffer, incomplete
                return -1
        else:
            # a named type
            name, j = self._scan_name(j, declstartpos)
        c = rawdata[j:j+1]
        if not c:
            return -1
        if c in "'\"":
            # optional default value given as a string literal
            m = _declstringlit_match(rawdata, j)
            if m:
                j = m.end()
            else:
                return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
        if c == "#":
            # optional "#constraint" keyword
            if rawdata[j:] == "#":
                # the '#' is the last character in the buffer
                return -1
            name, j = self._scan_name(j + 1, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if not c:
                return -1
        if c == '>':
            return j + 1
# Internal -- scan past <!NOTATION declarations
def _parse_doctype_notation(self, i, declstartpos):
    """Scan past a ``<!NOTATION`` declaration in the internal subset.

    Args:
        i: Index in ``self.rawdata`` where the notation name is expected.
        declstartpos: Index where the enclosing declaration starts; passed
            through to ``_scan_name``.

    Returns:
        The index just past the closing ``>``, or a negative value if the
        buffer ends before the declaration is complete.
    """
    name, j = self._scan_name(i, declstartpos)
    if j < 0:
        return j
    rawdata = self.rawdata
    while True:
        c = rawdata[j:j+1]
        if not c:
            # end of buffer; incomplete
            return -1
        if c == '>':
            return j + 1
        if c in "'\"":
            m = _declstringlit_match(rawdata, j)
            if not m:
                return -1
            j = m.end()
        else:
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
# Internal -- scan past <!ENTITY declarations
def _parse_doctype_entity(self, i, declstartpos):
    """Scan past an ``<!ENTITY`` declaration in the internal subset.

    Args:
        i: Index in ``self.rawdata`` where the entity name (or the ``%``
            of a parameter entity) is expected.
        declstartpos: Index where the enclosing declaration starts; passed
            through to ``_scan_name``.

    Returns:
        The index just past the closing ``>``, or a negative value if the
        buffer ends before the declaration is complete.
    """
    rawdata = self.rawdata
    if rawdata[i:i+1] == "%":
        # Parameter entity: step over the '%' and the whitespace after it.
        j = i + 1
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c.isspace():
                j = j + 1
            else:
                break
    else:
        j = i
    name, j = self._scan_name(j, declstartpos)
    if j < 0:
        return j
    while True:
        c = rawdata[j:j+1]
        if not c:
            return -1
        if c in "'\"":
            m = _declstringlit_match(rawdata, j)
            if m:
                j = m.end()
            else:
                return -1  # incomplete string literal
        elif c == ">":
            return j + 1
        else:
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
# Internal -- scan a name token; return the token and the new position, or
# (None, -1) if we've reached the end of the buffer.
def _scan_name(self, i, declstartpos):
    """Scan one name token starting at ``self.rawdata[i]``.

    Args:
        i: Index in ``self.rawdata`` where the name is expected.
        declstartpos: Index where the enclosing declaration starts; used
            for the position update and the excerpt in the error message.

    Returns:
        ``(name, end)`` with the name lower-cased and ``end`` just past the
        name and its trailing whitespace, or ``(None, -1)`` when the name
        reaches the end of the buffer and so may be incomplete.
    """
    rawdata = self.rawdata
    n = len(rawdata)
    if i == n:
        return None, -1
    m = _declname_match(rawdata, i)
    if m:
        s = m.group()
        name = s.strip()
        if (i + len(s)) == n:
            return None, -1  # end of buffer
        return name.lower(), m.end()
    else:
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
# To be overridden -- handlers for unknown objects
def unknown_decl(self
, data
):