git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental - sam-t2/devtools/amd64/lib/python2.4/markupbase.py

... / ...

Commit	Line	Data
	1	"""Shared support for scanning document type declarations in HTML and XHTML.
	2
	3	This module is used as a foundation for the HTMLParser and sgmllib
	4	modules (indirectly, for htmllib as well). It has no documented
	5	public API and should not be used directly.
	6
	7	"""
	8
	9	import re
	10
	11	_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s').match
	12	_declstringlit_match = re.compile(r'(\'[^\']\'\|"[^"]")\s*').match
	13	_commentclose = re.compile(r'--\s*>')
	14	_markedsectionclose = re.compile(r']\s]\s>')
	15
	16	# An analysis of the MS-Word extensions is available at
	17	# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
	18
	19	_msmarkedsectionclose = re.compile(r']\s*>')
	20
	21	del re
	22
	23
	24	class ParserBase:
	25	"""Parser base class which provides some common support methods used
	26	by the SGML/HTML and XHTML parsers."""
	27
	28	def __init__(self):
	29	if self.__class__ is ParserBase:
	30	raise RuntimeError(
	31	"markupbase.ParserBase must be subclassed")
	32
	33	def error(self, message):
	34	raise NotImplementedError(
	35	"subclasses of ParserBase must override error()")
	36
	37	def reset(self):
	38	self.lineno = 1
	39	self.offset = 0
	40
	41	def getpos(self):
	42	"""Return current line number and offset."""
	43	return self.lineno, self.offset
	44
	45	# Internal -- update line number and offset. This should be
	46	# called for each piece of data exactly once, in order -- in other
	47	# words the concatenation of all the input strings to this
	48	# function should be exactly the entire input.
	49	def updatepos(self, i, j):
	50	if i >= j:
	51	return j
	52	rawdata = self.rawdata
	53	nlines = rawdata.count("\n", i, j)
	54	if nlines:
	55	self.lineno = self.lineno + nlines
	56	pos = rawdata.rindex("\n", i, j) # Should not fail
	57	self.offset = j-(pos+1)
	58	else:
	59	self.offset = self.offset + j-i
	60	return j
	61
	62	_decl_otherchars = ''
	63
	64	# Internal -- parse declaration (for use by subclasses).
	65	def parse_declaration(self, i):
	66	# This is some sort of declaration; in "HTML as
	67	# deployed," this should only be the document type
	68	# declaration ("<!DOCTYPE html...>").
	69	# ISO 8879:1986, however, has more complex
	70	# declaration syntax for elements in <!...>, including:
	71	# --comment--
	72	# [marked section]
	73	# name in the following list: ENTITY, DOCTYPE, ELEMENT,
	74	# ATTLIST, NOTATION, SHORTREF, USEMAP,
	75	# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
	76	rawdata = self.rawdata
	77	j = i + 2
	78	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
	79	if rawdata[j:j+1] in ("-", ""):
	80	# Start of comment followed by buffer boundary,
	81	# or just a buffer boundary.
	82	return -1
	83	# A simple, practical version could look like: ((name\|stringlit) S*) + '>'
	84	n = len(rawdata)
	85	if rawdata[j:j+1] == '--': #comment
	86	# Locate --.*-- as the body of the comment
	87	return self.parse_comment(i)
	88	elif rawdata[j] == '[': #marked section
	89	# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
	90	# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
	91	# Note that this is extended by Microsoft Office "Save as Web" function
	92	# to include [if...] and [endif].
	93	return self.parse_marked_section(i)
	94	else: #all other declaration elements
	95	decltype, j = self._scan_name(j, i)
	96	if j < 0:
	97	return j
	98	if decltype == "doctype":
	99	self._decl_otherchars = ''
	100	while j < n:
	101	c = rawdata[j]
	102	if c == ">":
	103	# end of declaration syntax
	104	data = rawdata[i+2:j]
	105	if decltype == "doctype":
	106	self.handle_decl(data)
	107	else:
	108	self.unknown_decl(data)
	109	return j + 1
	110	if c in "\"'":
	111	m = _declstringlit_match(rawdata, j)
	112	if not m:
	113	return -1 # incomplete
	114	j = m.end()
	115	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
	116	name, j = self._scan_name(j, i)
	117	elif c in self._decl_otherchars:
	118	j = j + 1
	119	elif c == "[":
	120	# this could be handled in a separate doctype parser
	121	if decltype == "doctype":
	122	j = self._parse_doctype_subset(j + 1, i)
	123	elif decltype in ("attlist", "linktype", "link", "element"):
	124	# must tolerate []'d groups in a content model in an element declaration
	125	# also in data attribute specifications of attlist declaration
	126	# also link type declaration subsets in linktype declarations
	127	# also link attribute specification lists in link declarations
	128	self.error("unsupported '[' char in %s declaration" % decltype)
	129	else:
	130	self.error("unexpected '[' char in declaration")
	131	else:
	132	self.error(
	133	"unexpected %r char in declaration" % rawdata[j])
	134	if j < 0:
	135	return j
	136	return -1 # incomplete
	137
	138	# Internal -- parse a marked section
	139	# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
	140	def parse_marked_section( self, i, report=1 ):
	141	rawdata= self.rawdata
	142	assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
	143	sectName, j = self._scan_name( i+3, i )
	144	if j < 0:
	145	return j
	146	if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
	147	# look for standard ]]> ending
	148	match= _markedsectionclose.search(rawdata, i+3)
	149	elif sectName in ("if", "else", "endif"):
	150	# look for MS Office ]> ending
	151	match= _msmarkedsectionclose.search(rawdata, i+3)
	152	else:
	153	self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
	154	if not match:
	155	return -1
	156	if report:
	157	j = match.start(0)
	158	self.unknown_decl(rawdata[i+3: j])
	159	return match.end(0)
	160
	161	# Internal -- parse comment, return length or -1 if not terminated
	162	def parse_comment(self, i, report=1):
	163	rawdata = self.rawdata
	164	if rawdata[i:i+4] != '<!--':
	165	self.error('unexpected call to parse_comment()')
	166	match = _commentclose.search(rawdata, i+4)
	167	if not match:
	168	return -1
	169	if report:
	170	j = match.start(0)
	171	self.handle_comment(rawdata[i+4: j])
	172	return match.end(0)
	173
	174	# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
	175	# returning the index just past any whitespace following the trailing ']'.
	176	def _parse_doctype_subset(self, i, declstartpos):
	177	rawdata = self.rawdata
	178	n = len(rawdata)
	179	j = i
	180	while j < n:
	181	c = rawdata[j]
	182	if c == "<":
	183	s = rawdata[j:j+2]
	184	if s == "<":
	185	# end of buffer; incomplete
	186	return -1
	187	if s != "<!":
	188	self.updatepos(declstartpos, j + 1)
	189	self.error("unexpected char in internal subset (in %r)" % s)
	190	if (j + 2) == n:
	191	# end of buffer; incomplete
	192	return -1
	193	if (j + 4) > n:
	194	# end of buffer; incomplete
	195	return -1
	196	if rawdata[j:j+4] == "<!--":
	197	j = self.parse_comment(j, report=0)
	198	if j < 0:
	199	return j
	200	continue
	201	name, j = self._scan_name(j + 2, declstartpos)
	202	if j == -1:
	203	return -1
	204	if name not in ("attlist", "element", "entity", "notation"):
	205	self.updatepos(declstartpos, j + 2)
	206	self.error(
	207	"unknown declaration %r in internal subset" % name)
	208	# handle the individual names
	209	meth = getattr(self, "_parse_doctype_" + name)
	210	j = meth(j, declstartpos)
	211	if j < 0:
	212	return j
	213	elif c == "%":
	214	# parameter entity reference
	215	if (j + 1) == n:
	216	# end of buffer; incomplete
	217	return -1
	218	s, j = self._scan_name(j + 1, declstartpos)
	219	if j < 0:
	220	return j
	221	if rawdata[j] == ";":
	222	j = j + 1
	223	elif c == "]":
	224	j = j + 1
	225	while j < n and rawdata[j].isspace():
	226	j = j + 1
	227	if j < n:
	228	if rawdata[j] == ">":
	229	return j
	230	self.updatepos(declstartpos, j)
	231	self.error("unexpected char after internal subset")
	232	else:
	233	return -1
	234	elif c.isspace():
	235	j = j + 1
	236	else:
	237	self.updatepos(declstartpos, j)
	238	self.error("unexpected char %r in internal subset" % c)
	239	# end of buffer reached
	240	return -1
	241
	242	# Internal -- scan past <!ELEMENT declarations
	243	def _parse_doctype_element(self, i, declstartpos):
	244	name, j = self._scan_name(i, declstartpos)
	245	if j == -1:
	246	return -1
	247	# style content model; just skip until '>'
	248	rawdata = self.rawdata
	249	if '>' in rawdata[j:]:
	250	return rawdata.find(">", j) + 1
	251	return -1
	252
	253	# Internal -- scan past <!ATTLIST declarations
	254	def _parse_doctype_attlist(self, i, declstartpos):
	255	rawdata = self.rawdata
	256	name, j = self._scan_name(i, declstartpos)
	257	c = rawdata[j:j+1]
	258	if c == "":
	259	return -1
	260	if c == ">":
	261	return j + 1
	262	while 1:
	263	# scan a series of attribute descriptions; simplified:
	264	# name type [value] [#constraint]
	265	name, j = self._scan_name(j, declstartpos)
	266	if j < 0:
	267	return j
	268	c = rawdata[j:j+1]
	269	if c == "":
	270	return -1
	271	if c == "(":
	272	# an enumerated type; look for ')'
	273	if ")" in rawdata[j:]:
	274	j = rawdata.find(")", j) + 1
	275	else:
	276	return -1
	277	while rawdata[j:j+1].isspace():
	278	j = j + 1
	279	if not rawdata[j:]:
	280	# end of buffer, incomplete
	281	return -1
	282	else:
	283	name, j = self._scan_name(j, declstartpos)
	284	c = rawdata[j:j+1]
	285	if not c:
	286	return -1
	287	if c in "'\"":
	288	m = _declstringlit_match(rawdata, j)
	289	if m:
	290	j = m.end()
	291	else:
	292	return -1
	293	c = rawdata[j:j+1]
	294	if not c:
	295	return -1
	296	if c == "#":
	297	if rawdata[j:] == "#":
	298	# end of buffer
	299	return -1
	300	name, j = self._scan_name(j + 1, declstartpos)
	301	if j < 0:
	302	return j
	303	c = rawdata[j:j+1]
	304	if not c:
	305	return -1
	306	if c == '>':
	307	# all done
	308	return j + 1
	309
	310	# Internal -- scan past <!NOTATION declarations
	311	def _parse_doctype_notation(self, i, declstartpos):
	312	name, j = self._scan_name(i, declstartpos)
	313	if j < 0:
	314	return j
	315	rawdata = self.rawdata
	316	while 1:
	317	c = rawdata[j:j+1]
	318	if not c:
	319	# end of buffer; incomplete
	320	return -1
	321	if c == '>':
	322	return j + 1
	323	if c in "'\"":
	324	m = _declstringlit_match(rawdata, j)
	325	if not m:
	326	return -1
	327	j = m.end()
	328	else:
	329	name, j = self._scan_name(j, declstartpos)
	330	if j < 0:
	331	return j
	332
	333	# Internal -- scan past <!ENTITY declarations
	334	def _parse_doctype_entity(self, i, declstartpos):
	335	rawdata = self.rawdata
	336	if rawdata[i:i+1] == "%":
	337	j = i + 1
	338	while 1:
	339	c = rawdata[j:j+1]
	340	if not c:
	341	return -1
	342	if c.isspace():
	343	j = j + 1
	344	else:
	345	break
	346	else:
	347	j = i
	348	name, j = self._scan_name(j, declstartpos)
	349	if j < 0:
	350	return j
	351	while 1:
	352	c = self.rawdata[j:j+1]
	353	if not c:
	354	return -1
	355	if c in "'\"":
	356	m = _declstringlit_match(rawdata, j)
	357	if m:
	358	j = m.end()
	359	else:
	360	return -1 # incomplete
	361	elif c == ">":
	362	return j + 1
	363	else:
	364	name, j = self._scan_name(j, declstartpos)
	365	if j < 0:
	366	return j
	367
	368	# Internal -- scan a name token and the new position and the token, or
	369	# return -1 if we've reached the end of the buffer.
	370	def _scan_name(self, i, declstartpos):
	371	rawdata = self.rawdata
	372	n = len(rawdata)
	373	if i == n:
	374	return None, -1
	375	m = _declname_match(rawdata, i)
	376	if m:
	377	s = m.group()
	378	name = s.strip()
	379	if (i + len(s)) == n:
	380	return None, -1 # end of buffer
	381	return name.lower(), m.end()
	382	else:
	383	self.updatepos(declstartpos, i)
	384	self.error("expected name token at %r"
	385	% rawdata[declstartpos:declstartpos+20])
	386
	387	# To be overridden -- handlers for unknown objects
	388	def unknown_decl(self, data):
	389	pass