# [OpenSPARC-T2-DV] / tools / src / nas,5.n2.os.2 / lib / python / lib / python2.4 / HTMLParser.py

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import markupbase
import re

# Regular expressions used for parsing

# Scanner patterns used by HTMLParser.goahead() to find the next character
# that needs attention: in normal mode '&' or '<'; in CDATA mode only a
# '<' that is followed by '/' or by the end of the buffer.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of text that could still turn into an entity or character
# reference once more input arrives.
incomplete = re.compile('&[a-zA-Z#]')

# Complete references.  Both patterns also consume the single character
# that follows the name/number (';' or whatever else terminated it), so
# callers have to decide whether that character belongs to the reference.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by a letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Terminator of a comment ('--', optional whitespace, '>').
commentclose = re.compile(r'--\s*>')
# Tag name, matched right after the '<'.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (still carrying its quotes, if any).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

# Matches a start tag from '<' up to, but not including, its closing '>'
# or '/>'; used to find out whether a whole start tag has been buffered.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# Closing '>' of an end tag.
endendtag = re.compile('>')
# A complete end tag; group 1 is the tag name.  Written as a raw string so
# that '\s' reaches the regex engine instead of being treated as an
# (invalid) string escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be non-empty)
        lineno -- first item of the position pair given to the
                  constructor, or None
        offset -- second item of that pair, or None; str() reports it
                  as a column number one greater than the stored value
    """

    def __init__(self, msg, position=(None, None)):
        """Record the message and the (lineno, offset) position pair."""
        assert msg
        lineno, offset = position[0], position[1]
        self.msg = msg
        self.lineno = lineno
        self.offset = offset

    def __str__(self):
        """Return the message followed by whichever position parts are set."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is shown one higher, as a column number.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    Markup the parser cannot accept is reported through self.error(),
    which raises HTMLParseError.
    """

    # Lower-case names of the elements whose start tag switches the
    # scanner into CDATA mode (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''                       # text fed but not yet consumed
        self.lasttag = '???'                    # name of the last start tag seen
        self.interesting = interesting_normal   # pattern locating the next markup
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newline characters).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position self.getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the start tag handled most recently; None until a
    # start tag has been parsed, or when the last attempt was incomplete.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make the scanner stop only at '</' (or a buffer-ending '<')."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make the scanner stop at every '&' and '<' again."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        # rawdata is not rebound inside the loop, so bind the method once.
        startswith = rawdata.startswith
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            # Everything before the next interesting character is plain data.
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is literal text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more input.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one trailing character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern also consumed the terminating character;
                    # give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever is left over is delivered as plain data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        # Hand over the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '= value'.
                attrvalue = None
            else:
                if attrvalue[:1] == '\'' == attrvalue[-1:] or \
                   attrvalue[:1] == '"' == attrvalue[-1:]:
                    attrvalue = attrvalue[1:-1]
                # Unescape quoted and bare values alike, so that
                # href=a&amp;b and href="a&amp;b" yield the same value.
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if j + 1 == len(rawdata):
                    # buffer boundary: the '/' is the last character fed
                    # so far, the '>' may still arrive
                    return -1
                # else bogus input: '/' followed by something other
                # than '>'
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        # Any end tag switches the scanner back to normal mode.
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Default: report the tag as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of
        (name, value) pairs; value is None for a valueless attribute."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (without '&' and ';')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text found between markup."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Not called from this class directly; presumably invoked by
        markupbase.ParserBase.parse_comment() -- confirm in markupbase."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Not called from this class directly; presumably invoked by
        markupbase.ParserBase.parse_declaration() -- confirm in markupbase."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration through self.error()."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with the five predefined entities replaced by the
        characters they stand for; any other reference is left as is."""
        if '&' not in s:
            return s
        # '&amp;' must be last, otherwise '&amp;lt;' would end up as '<'.
        for entity, char in (("&lt;", "<"),
                             ("&gt;", ">"),
                             ("&apos;", "'"),
                             ("&quot;", '"'),
                             ("&amp;", "&")):
            s = s.replace(entity, char)
        return s
Commit	Line	Data
86530b38 AT	1	"""A parser for HTML and XHTML."""
	2
	3	# This file is based on sgmllib.py, but the API is slightly different.
	4
	5	# XXX There should be a way to distinguish between PCDATA (parsed
	6	# character data -- the normal case), RCDATA (replaceable character
	7	# data -- only char and entity references and end tags are special)
	8	# and CDATA (character data -- only end tags are special).
	9
	10
	11	import markupbase
	12	import re
	13
	14	# Regular expressions used for parsing
	15
	16	interesting_normal = re.compile('[&<]')
	17	interesting_cdata = re.compile(r'<(/\|\Z)')
	18	incomplete = re.compile('&[a-zA-Z#]')
	19
	20	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
	21	charref = re.compile('&#(?:[0-9]+\|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
	22
	23	starttagopen = re.compile('<[a-zA-Z]')
	24	piclose = re.compile('>')
	25	commentclose = re.compile(r'--\s*>')
	26	tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
	27	attrfind = re.compile(
	28	r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'
	29	r'(\'[^\']\'\|"[^"]"\|[-a-zA-Z0-9./,:;+%?!&$\(\)_#=~@]))?')
	30
	31	locatestarttagend = re.compile(r"""
	32	<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
	33	(?:\s+ # whitespace before attribute name
	34	(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
	35	(?:\s=\s # value indicator
	36	(?:'[^']*' # LITA-enclosed value
	37	\|\"[^\"]*\" # LIT-enclosed value
	38	\|[^'\">\s]+ # bare value
	39	)
	40	)?
	41	)
	42	)*
	43	\s* # trailing whitespace
	44	""", re.VERBOSE)
	45	endendtag = re.compile('>')
	46	endtagfind = re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')
	47
	48
	49	class HTMLParseError(Exception):
	50	"""Exception raised for all parse errors."""
	51
	52	def __init__(self, msg, position=(None, None)):
	53	assert msg
	54	self.msg = msg
	55	self.lineno = position[0]
	56	self.offset = position[1]
	57
	58	def __str__(self):
	59	result = self.msg
	60	if self.lineno is not None:
	61	result = result + ", at line %d" % self.lineno
	62	if self.offset is not None:
	63	result = result + ", column %d" % (self.offset + 1)
	64	return result
65
66
67	class HTMLParser(markupbase.ParserBase):
68	"""Find tags and other markup and call handler functions.
69
70	Usage:
71	p = HTMLParser()
72	p.feed(data)
73	...
74	p.close()
75
76	Start tags are handled by calling self.handle_starttag() or
77	self.handle_startendtag(); end tags by self.handle_endtag(). The
78	data between tags is passed from the parser to the derived class
79	by calling self.handle_data() with the data as argument (the data
80	may be split up in arbitrary chunks). Entity references are
81	passed by calling self.handle_entityref() with the entity
82	reference as the argument. Numeric character references are
83	passed to self.handle_charref() with the string containing the
84	reference as the argument.
85	"""
86
87	CDATA_CONTENT_ELEMENTS = ("script", "style")
88
89
90	def __init__(self):
91	"""Initialize and reset this instance."""
92	self.reset()
93
94	def reset(self):
95	"""Reset this instance. Loses all unprocessed data."""
96	self.rawdata = ''
97	self.lasttag = '???'
98	self.interesting = interesting_normal
99	markupbase.ParserBase.reset(self)
100
101	def feed(self, data):
102	"""Feed data to the parser.
103
104	Call this as often as you want, with as little or as much text
105	as you want (may include '\n').
106	"""
107	self.rawdata = self.rawdata + data
108	self.goahead(0)
109
110	def close(self):
111	"""Handle any buffered data."""
112	self.goahead(1)
113
114	def error(self, message):
115	raise HTMLParseError(message, self.getpos())
116
117	__starttag_text = None
118
119	def get_starttag_text(self):
120	"""Return full source of start tag: '<...>'."""
121	return self.__starttag_text
122
123	def set_cdata_mode(self):
124	self.interesting = interesting_cdata
125
126	def clear_cdata_mode(self):
127	self.interesting = interesting_normal
128
129	# Internal -- handle data as far as reasonable. May leave state
130	# and data to be processed by a subsequent call. If 'end' is
131	# true, force handling all data as if followed by EOF marker.
132	def goahead(self, end):
133	rawdata = self.rawdata
134	i = 0
135	n = len(rawdata)
136	while i < n:
137	match = self.interesting.search(rawdata, i) # < or &
138	if match:
139	j = match.start()
140	else:
141	j = n
142	if i < j: self.handle_data(rawdata[i:j])
143	i = self.updatepos(i, j)
144	if i == n: break
145	startswith = rawdata.startswith
146	if startswith('<', i):
147	if starttagopen.match(rawdata, i): # < + letter
148	k = self.parse_starttag(i)
149	elif startswith("</", i):
150	k = self.parse_endtag(i)
151	elif startswith("<!--", i):
152	k = self.parse_comment(i)
153	elif startswith("<?", i):
154	k = self.parse_pi(i)
155	elif startswith("<!", i):
156	k = self.parse_declaration(i)
157	elif (i + 1) < n:
158	self.handle_data("<")
159	k = i + 1
160	else:
161	break
162	if k < 0:
163	if end:
164	self.error("EOF in middle of construct")
165	break
166	i = self.updatepos(i, k)
167	elif startswith("&#", i):
168	match = charref.match(rawdata, i)
169	if match:
170	name = match.group()[2:-1]
171	self.handle_charref(name)
172	k = match.end()
173	if not startswith(';', k-1):
174	k = k - 1
175	i = self.updatepos(i, k)
176	continue
177	else:
178	break
179	elif startswith('&', i):
180	match = entityref.match(rawdata, i)
181	if match:
182	name = match.group(1)
183	self.handle_entityref(name)
184	k = match.end()
185	if not startswith(';', k-1):
186	k = k - 1
187	i = self.updatepos(i, k)
188	continue
189	match = incomplete.match(rawdata, i)
190	if match:
191	# match.group() will contain at least 2 chars
192	if end and match.group() == rawdata[i:]:
193	self.error("EOF in middle of entity or char ref")
194	# incomplete
195	break
196	elif (i + 1) < n:
197	# not the end of the buffer, and can't be confused
198	# with some other construct
199	self.handle_data("&")
200	i = self.updatepos(i, i + 1)
201	else:
202	break
203	else:
204	assert 0, "interesting.search() lied"
205	# end while
206	if end and i < n:
207	self.handle_data(rawdata[i:n])
208	i = self.updatepos(i, n)
209	self.rawdata = rawdata[i:]
210
211	# Internal -- parse processing instr, return end or -1 if not terminated
212	def parse_pi(self, i):
213	rawdata = self.rawdata
214	assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
215	match = piclose.search(rawdata, i+2) # >
216	if not match:
217	return -1
218	j = match.start()
219	self.handle_pi(rawdata[i+2: j])
220	j = match.end()
221	return j
222
223	# Internal -- handle starttag, return end or -1 if not terminated
224	def parse_starttag(self, i):
225	self.__starttag_text = None
226	endpos = self.check_for_whole_start_tag(i)
227	if endpos < 0:
228	return endpos
229	rawdata = self.rawdata
230	self.__starttag_text = rawdata[i:endpos]
231
232	# Now parse the data between i+1 and j into a tag and attrs
233	attrs = []
234	match = tagfind.match(rawdata, i+1)
235	assert match, 'unexpected call to parse_starttag()'
236	k = match.end()
237	self.lasttag = tag = rawdata[i+1:k].lower()
238
239	while k < endpos:
240	m = attrfind.match(rawdata, k)
241	if not m:
242	break
243	attrname, rest, attrvalue = m.group(1, 2, 3)
244	if not rest:
245	attrvalue = None
246	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
247	attrvalue[:1] == '"' == attrvalue[-1:]:
248	attrvalue = attrvalue[1:-1]
249	attrvalue = self.unescape(attrvalue)
250	attrs.append((attrname.lower(), attrvalue))
251	k = m.end()
252
253	end = rawdata[k:endpos].strip()
254	if end not in (">", "/>"):
255	lineno, offset = self.getpos()
256	if "\n" in self.__starttag_text:
257	lineno = lineno + self.__starttag_text.count("\n")
258	offset = len(self.__starttag_text) \
259	- self.__starttag_text.rfind("\n")
260	else:
261	offset = offset + len(self.__starttag_text)
262	self.error("junk characters in start tag: %r"
263	% (rawdata[k:endpos][:20],))
264	if end.endswith('/>'):
265	# XHTML-style empty tag: <span attr="value" />
266	self.handle_startendtag(tag, attrs)
267	else:
268	self.handle_starttag(tag, attrs)
269	if tag in self.CDATA_CONTENT_ELEMENTS:
270	self.set_cdata_mode()
271	return endpos
272
273	# Internal -- check to see if we have a complete starttag; return end
274	# or -1 if incomplete.
275	def check_for_whole_start_tag(self, i):
276	rawdata = self.rawdata
277	m = locatestarttagend.match(rawdata, i)
278	if m:
279	j = m.end()
280	next = rawdata[j:j+1]
281	if next == ">":
282	return j + 1
283	if next == "/":
284	if rawdata.startswith("/>", j):
285	return j + 2
286	if rawdata.startswith("/", j):
287	# buffer boundary
288	return -1
289	# else bogus input
290	self.updatepos(i, j + 1)
291	self.error("malformed empty start tag")
292	if next == "":
293	# end of input
294	return -1
295	if next in ("abcdefghijklmnopqrstuvwxyz=/"
296	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
297	# end of input in or before attribute value, or we have the
298	# '/' from a '/>' ending
299	return -1
300	self.updatepos(i, j)
301	self.error("malformed start tag")
302	raise AssertionError("we should not get here!")
303
304	# Internal -- parse endtag, return end or -1 if incomplete
305	def parse_endtag(self, i):
306	rawdata = self.rawdata
307	assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
308	match = endendtag.search(rawdata, i+1) # >
309	if not match:
310	return -1
311	j = match.end()
312	match = endtagfind.match(rawdata, i) # </ + tag + >
313	if not match:
314	self.error("bad end tag: %r" % (rawdata[i:j],))
315	tag = match.group(1)
316	self.handle_endtag(tag.lower())
317	self.clear_cdata_mode()
318	return j
319
320	# Overridable -- finish processing of start+end tag: <tag.../>
321	def handle_startendtag(self, tag, attrs):
322	self.handle_starttag(tag, attrs)
323	self.handle_endtag(tag)
324
325	# Overridable -- handle start tag
326	def handle_starttag(self, tag, attrs):
327	pass
328
329	# Overridable -- handle end tag
330	def handle_endtag(self, tag):
331	pass
332
333	# Overridable -- handle character reference
334	def handle_charref(self, name):
335	pass
336
337	# Overridable -- handle entity reference
338	def handle_entityref(self, name):
339	pass
340
341	# Overridable -- handle data
342	def handle_data(self, data):
343	pass
344
345	# Overridable -- handle comment
346	def handle_comment(self, data):
347	pass
348
349	# Overridable -- handle declaration
350	def handle_decl(self, decl):
351	pass
352
353	# Overridable -- handle processing instruction
354	def handle_pi(self, data):
355	pass
356
357	def unknown_decl(self, data):
358	self.error("unknown declaration: %r" % (data,))
359
360	# Internal -- helper to remove special character quoting
361	def unescape(self, s):
362	if '&' not in s:
363	return s
364	s = s.replace("<", "<")
365	s = s.replace(">", ">")
366	s = s.replace("'", "'")
367	s = s.replace(""", '"')
368	s = s.replace("&", "&") # Must be last
369	return s