[OpenSPARC-T2-SAM] / sam-t2 / devtools / amd64 / lib / python2.4 / sgmllib.py

"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


import markupbase
import re

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Finds the next character that can start markup: '&' (a reference) or
# '<' (a tag, comment, declaration or processing instruction).
interesting = re.compile('[&<]')
# Matches a possibly unfinished construct: '&' optionally followed by the
# start of an entity name or of '#digits', or '<' optionally followed by
# the start of a start tag, an end tag ('/...') or a '<!...' declaration.
# goahead() uses it to decide whether the tail of the buffer could still
# be completed by more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' plus exactly one terminating non-alphanumeric character;
# group 1 is the entity name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' plus exactly one terminating non-digit character;
# group 1 is the decimal number as text.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter (ordinary start tag) or by '>' (the '<>'
# shorthand handled in parse_starttag()).
starttagopen = re.compile('<[>a-zA-Z]')
# '<name/': the opening of the SGML short-tag form '<name/data/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# A complete short tag '<name/data/'; group 1 is the name, group 2 the
# data (which cannot contain '/').
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction: the first '>'.
piclose = re.compile('>')
# End of a tag: the next '>' or the '<' that starts another tag.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value'
# part, group 3 the value itself (single-quoted, double-quoted, or an
# unquoted run of the listed characters).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Error type used to report any failure encountered while parsing."""


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(markupbase.ParserBase):
    """Base class for SGML/HTML parsers driven by handler methods.

    Input is supplied with feed() and finished with close().  For a
    start tag <foo> the parser calls start_foo(attrs) if the instance
    has such a method, otherwise do_foo(attrs), otherwise
    unknown_starttag().  An end tag that closes an open element is
    dispatched to end_foo(); other end tags go to unknown_endtag() or
    report_unbalanced().  Tag names are lower-cased before the lookup.
    Text is delivered through handle_data() and may be split into
    arbitrary chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- stored as self.verbose; when true,
        report_unbalanced() prints a diagnostic.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        # Text of the start tag most recently parsed; see
        # get_starttag_text().
        self.__starttag_text = None
        # Input received through feed() that has not been consumed yet.
        self.rawdata = ''
        # Names of the elements opened through a start_* handler and not
        # yet closed (maintained by finish_starttag()/finish_endtag()).
        self.stack = []
        # Name of the last start tag parsed; reused by the '<>' shorthand.
        self.lasttag = '???'
        # When true, goahead() passes all remaining input as data.
        self.nomoretags = 0
        # When true, start tags and '&' are passed through as data until
        # an end tag has been parsed.
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Report a parse error by raising SGMLParseError(message)."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        Scans self.rawdata from the beginning, dispatching text, tags,
        comments, declarations, processing instructions and references
        to their handlers.  Input that may be the beginning of a
        construct not yet complete is left in self.rawdata for a
        subsequent call.  If 'end' is true, whatever is still unconsumed
        after the scan is passed to handle_data(), as if the input were
        followed by an EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # After setnomoretags() everything is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Locate the next '&' or '<'; the text before it is data.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not markup in literal mode:
                        # emit the '<' and rescan from the next char.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # Negative result: the tag is not complete yet, so
                    # stop and wait for more input.
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        # The following character is already known not
                        # to start a tag or end tag, so '<' is data.
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # parse_comment() is not defined in this class;
                    # presumably it is inherited from
                    # markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # Unlike the other parse_* calls here, parse_pi()
                    # returns a length, hence the addition.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class;
                    # presumably it is inherited from
                    # markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it is the ';' delimiter.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same as above: only ';' belongs to the reference.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Not even the beginning of markup: emit one character
                # as data and go on.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The partial construct is followed by more input that did
            # not complete it, so it is passed through as data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF nothing more can arrive; flush the rest as data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep only the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    # (not read anywhere in this class; presumably consumed by the
    # declaration scanner in markupbase.ParserBase -- TODO confirm)
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction at rawdata[i].

        Calls handle_pi() with the text between '<?' and the first
        '>'.  Returns the number of characters consumed (a length, not
        an index), or -1 if the instruction is not terminated yet.
        Calls error() if rawdata[i:i+2] is not '<?'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text of the most recently parsed start tag.

        The value is None after reset() and while a start tag is being
        parsed but has not been recognised yet.
        """
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- parse the start tag beginning at rawdata[i].

        Handles the ordinary '<tag attr=value ...>' form as well as the
        SGML shorthands '<tag/data/' and '<>'.  Returns the index in
        rawdata just past the consumed text (an absolute position, which
        goahead() assigns to its scan index), or -1 if the tag is not
        terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                # Closing '/' not seen yet.
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            # Set again after the handlers ran: '<tag/' as it appeared
            # in the input.
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        # Collect (name, value) pairs until the end of the tag or until
        # something that is not an attribute is met.
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # No '=value' part: the attribute's value is its name.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the surrounding pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume the closing '>'; a '<' found by endbracket is left in
        # place because it starts the next tag.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at rawdata[i] ('</').

        Passes the stripped, lower-cased tag name (possibly empty) to
        finish_endtag().  Returns the index just past the consumed text,
        or -1 if the tag is not terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        # Consume '>' but not a '<' that starts the next tag.
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- dispatch a parsed start tag to its handler.

        Looks for a start_<tag> method first, then for do_<tag>.  Only
        tags handled by start_<tag> are pushed on self.stack.

        Returns -1 for an unknown tag (unknown_starttag() was called),
        0 for an open-only tag (do_<tag>), 1 for a balanced tag
        (start_<tag>).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- dispatch a parsed end tag and unwind the stack.

        An empty tag name closes the most recently opened element.
        A named tag closes its last occurrence on self.stack together
        with every element opened after it.  A tag that is not on the
        stack is reported through report_unbalanced() when an end_<tag>
        method exists, and through unknown_endtag() otherwise.
        """
        if not tag:
            # Empty name: target the top of the stack.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # range() is evaluated once with the full stack length, so
            # the loop visits every entry and 'found' ends up as the
            # highest index holding this tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        # Close everything from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag.

        Prints the tag and the current stack, but only if self.verbose
        is true.
        """
        if self.verbose:
            print '*** Unbalanced </' + tag + '>'
            print '*** Stack:', self.stack

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        'name' is the text following '&#'.  Values that are decimal
        integers in the range 0..255 are converted with chr() and passed
        to handle_data(); anything else goes to unknown_charref().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    # Maps an entity name to the replacement text that
    # handle_entityref() passes to handle_data().
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        Names found in self.entitydefs are replaced by their mapped text
        and passed to handle_data(); other names are passed to
        unknown_entityref().

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    # (the default implementations silently ignore them)
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):

    def __init__(self, verbose=0):
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print 'data:', repr(data)

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print 'comment:', r

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print 'start tag: <' + tag + '>'
        else:
            print 'start tag: <' + tag,
            for name, value in attrs:
                print name + '=' + '"' + value + '"',
            print '>'

    def unknown_endtag(self, tag):
        self.flush()
        print 'end tag: </' + tag + '>'

    def unknown_entityref(self, ref):
        self.flush()
        print '*** unknown entity ref: &' + ref + ';'

    def unknown_charref(self, ref):
        self.flush()
        print '*** unknown char ref: &#' + ref + ';'

    def unknown_decl(self, data):
        self.flush()
        print '*** unknown decl: [' + data + ']'

    def close(self):
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError, msg:
            print file, ":", msg
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()


# Run the command-line test driver when executed as a script.
if __name__ == '__main__':
    test()
Commit	Line	Data
920dae64 AT	1	"""A parser for SGML, using the derived class as a static DTD."""
	2
	3	# XXX This only supports those SGML features used by HTML.
	4
	5	# XXX There should be a way to distinguish between PCDATA (parsed
	6	# character data -- the normal case), RCDATA (replaceable character
	7	# data -- only char and entity references and end tags are special)
	8	# and CDATA (character data -- only end tags are special). RCDATA is
	9	# not supported at all.
	10
	11
	12	import markupbase
	13	import re
	14
	15	__all__ = ["SGMLParser", "SGMLParseError"]
	16
	17	# Regular expressions used for parsing
	18
	19	interesting = re.compile('[&<]')
	20	incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
	21	'<([a-zA-Z][^<>]*|'
	22	'/([a-zA-Z][^<>]*)?|'
	23	'![^<>]*)?')
	24
	25	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
	26	charref = re.compile('&#([0-9]+)[^0-9]')
	27
	28	starttagopen = re.compile('<[>a-zA-Z]')
	29	shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
	30	shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
	31	piclose = re.compile('>')
	32	endbracket = re.compile('[<>]')
	33	tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
	34	attrfind = re.compile(
	35	r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
	36	r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
	37
	38
	39	class SGMLParseError(RuntimeError):
	40	"""Exception raised for all parse errors."""
	41	pass
	42
	43
	44	# SGML parser base class -- find tags and call handler functions.
	45	# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
	46	# The dtd is defined by deriving a class which defines methods
	47	# with special names to handle tags: start_foo and end_foo to handle
	48	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
	49	# (Tags are converted to lower case for this purpose.) The data
	50	# between tags is passed to the parser by calling self.handle_data()
	51	# with some data as argument (the data may be split up in arbitrary
	52	# chunks). Entity references are passed by calling
	53	# self.handle_entityref() with the entity reference as argument.
	54
	55	class SGMLParser(markupbase.ParserBase):
	56
	57	def __init__(self, verbose=0):
	58	"""Initialize and reset this instance."""
	59	self.verbose = verbose
	60	self.reset()
	61
	62	def reset(self):
	63	"""Reset this instance. Loses all unprocessed data."""
	64	self.__starttag_text = None
65	self.rawdata = ''
66	self.stack = []
67	self.lasttag = '???'
68	self.nomoretags = 0
69	self.literal = 0
70	markupbase.ParserBase.reset(self)
71
72	def setnomoretags(self):
73	"""Enter literal mode (CDATA) till EOF.
74
75	Intended for derived classes only.
76	"""
77	self.nomoretags = self.literal = 1
78
79	def setliteral(self, *args):
80	"""Enter literal mode (CDATA).
81
82	Intended for derived classes only.
83	"""
84	self.literal = 1
85
86	def feed(self, data):
87	"""Feed some data to the parser.
88
89	Call this as often as you want, with as little or as much text
90	as you want (may include '\n'). (This just saves the text,
91	all the processing is done by goahead().)
92	"""
93
94	self.rawdata = self.rawdata + data
95	self.goahead(0)
96
97	def close(self):
98	"""Handle the remaining data."""
99	self.goahead(1)
100
101	def error(self, message):
102	raise SGMLParseError(message)
103
104	# Internal -- handle data as far as reasonable. May leave state
105	# and data to be processed by a subsequent call. If 'end' is
106	# true, force handling all data as if followed by EOF marker.
107	def goahead(self, end):
108	rawdata = self.rawdata
109	i = 0
110	n = len(rawdata)
111	while i < n:
112	if self.nomoretags:
113	self.handle_data(rawdata[i:n])
114	i = n
115	break
116	match = interesting.search(rawdata, i)
117	if match: j = match.start()
118	else: j = n
119	if i < j:
120	self.handle_data(rawdata[i:j])
121	i = j
122	if i == n: break
123	if rawdata[i] == '<':
124	if starttagopen.match(rawdata, i):
125	if self.literal:
126	self.handle_data(rawdata[i])
127	i = i+1
128	continue
129	k = self.parse_starttag(i)
130	if k < 0: break
131	i = k
132	continue
133	if rawdata.startswith("</", i):
134	k = self.parse_endtag(i)
135	if k < 0: break
136	i = k
137	self.literal = 0
138	continue
139	if self.literal:
140	if n > (i + 1):
141	self.handle_data("<")
142	i = i+1
143	else:
144	# incomplete
145	break
146	continue
147	if rawdata.startswith("<!--", i):
148	# Strictly speaking, a comment is --.*--
149	# within a declaration tag <!...>.
150	# This should be removed,
151	# and comments handled only in parse_declaration.
152	k = self.parse_comment(i)
153	if k < 0: break
154	i = k
155	continue
156	if rawdata.startswith("<?", i):
157	k = self.parse_pi(i)
158	if k < 0: break
159	i = i+k
160	continue
161	if rawdata.startswith("<!", i):
162	# This is some sort of declaration; in "HTML as
163	# deployed," this should only be the document type
164	# declaration ("<!DOCTYPE html...>").
165	k = self.parse_declaration(i)
166	if k < 0: break
167	i = k
168	continue
169	elif rawdata[i] == '&':
170	if self.literal:
171	self.handle_data(rawdata[i])
172	i = i+1
173	continue
174	match = charref.match(rawdata, i)
175	if match:
176	name = match.group(1)
177	self.handle_charref(name)
178	i = match.end(0)
179	if rawdata[i-1] != ';': i = i-1
180	continue
181	match = entityref.match(rawdata, i)
182	if match:
183	name = match.group(1)
184	self.handle_entityref(name)
185	i = match.end(0)
186	if rawdata[i-1] != ';': i = i-1
187	continue
188	else:
189	self.error('neither < nor & ??')
190	# We get here only if incomplete matches but
191	# nothing else
192	match = incomplete.match(rawdata, i)
193	if not match:
194	self.handle_data(rawdata[i])
195	i = i+1
196	continue
197	j = match.end(0)
198	if j == n:
199	break # Really incomplete
200	self.handle_data(rawdata[i:j])
201	i = j
202	# end while
203	if end and i < n:
204	self.handle_data(rawdata[i:n])
205	i = n
206	self.rawdata = rawdata[i:]
207	# XXX if end: check for empty stack
208
209	# Extensions for the DOCTYPE scanner:
210	_decl_otherchars = '='
211
212	# Internal -- parse processing instr, return length or -1 if not terminated
213	def parse_pi(self, i):
214	rawdata = self.rawdata
215	if rawdata[i:i+2] != '<?':
216	self.error('unexpected call to parse_pi()')
217	match = piclose.search(rawdata, i+2)
218	if not match:
219	return -1
220	j = match.start(0)
221	self.handle_pi(rawdata[i+2: j])
222	j = match.end(0)
223	return j-i
224
225	def get_starttag_text(self):
226	return self.__starttag_text
227
228	# Internal -- handle starttag, return length or -1 if not terminated
229	def parse_starttag(self, i):
230	self.__starttag_text = None
231	start_pos = i
232	rawdata = self.rawdata
233	if shorttagopen.match(rawdata, i):
234	# SGML shorthand: <tag/data/ == <tag>data</tag>
235	# XXX Can data contain &... (entity or char refs)?
236	# XXX Can data contain < or > (tag characters)?
237	# XXX Can there be whitespace before the first /?
238	match = shorttag.match(rawdata, i)
239	if not match:
240	return -1
241	tag, data = match.group(1, 2)
242	self.__starttag_text = '<%s/' % tag
243	tag = tag.lower()
244	k = match.end(0)
245	self.finish_shorttag(tag, data)
246	self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
247	return k
248	# XXX The following should skip matching quotes (' or ")
249	match = endbracket.search(rawdata, i+1)
250	if not match:
251	return -1
252	j = match.start(0)
253	# Now parse the data between i+1 and j into a tag and attrs
254	attrs = []
255	if rawdata[i:i+2] == '<>':
256	# SGML shorthand: <> == <last open tag seen>
257	k = j
258	tag = self.lasttag
259	else:
260	match = tagfind.match(rawdata, i+1)
261	if not match:
262	self.error('unexpected call to parse_starttag')
263	k = match.end(0)
264	tag = rawdata[i+1:k].lower()
265	self.lasttag = tag
266	while k < j:
267	match = attrfind.match(rawdata, k)
268	if not match: break
269	attrname, rest, attrvalue = match.group(1, 2, 3)
270	if not rest:
271	attrvalue = attrname
272	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
273	attrvalue[:1] == '"' == attrvalue[-1:]:
274	attrvalue = attrvalue[1:-1]
275	attrs.append((attrname.lower(), attrvalue))
276	k = match.end(0)
277	if rawdata[j] == '>':
278	j = j+1
279	self.__starttag_text = rawdata[start_pos:j]
280	self.finish_starttag(tag, attrs)
281	return j
282
283	# Internal -- parse endtag
284	def parse_endtag(self, i):
285	rawdata = self.rawdata
286	match = endbracket.search(rawdata, i+1)
287	if not match:
288	return -1
289	j = match.start(0)
290	tag = rawdata[i+2:j].strip().lower()
291	if rawdata[j] == '>':
292	j = j+1
293	self.finish_endtag(tag)
294	return j
295
296	# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
297	def finish_shorttag(self, tag, data):
298	self.finish_starttag(tag, [])
299	self.handle_data(data)
300	self.finish_endtag(tag)
301
302	# Internal -- finish processing of start tag
303	# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
304	def finish_starttag(self, tag, attrs):
305	try:
306	method = getattr(self, 'start_' + tag)
307	except AttributeError:
308	try:
309	method = getattr(self, 'do_' + tag)
310	except AttributeError:
311	self.unknown_starttag(tag, attrs)
312	return -1
313	else:
314	self.handle_starttag(tag, method, attrs)
315	return 0
316	else:
317	self.stack.append(tag)
318	self.handle_starttag(tag, method, attrs)
319	return 1
320
321	# Internal -- finish processing of end tag
322	def finish_endtag(self, tag):
323	if not tag:
324	found = len(self.stack) - 1
325	if found < 0:
326	self.unknown_endtag(tag)
327	return
328	else:
329	if tag not in self.stack:
330	try:
331	method = getattr(self, 'end_' + tag)
332	except AttributeError:
333	self.unknown_endtag(tag)
334	else:
335	self.report_unbalanced(tag)
336	return
337	found = len(self.stack)
338	for i in range(found):
339	if self.stack[i] == tag: found = i
340	while len(self.stack) > found:
341	tag = self.stack[-1]
342	try:
343	method = getattr(self, 'end_' + tag)
344	except AttributeError:
345	method = None
346	if method:
347	self.handle_endtag(tag, method)
348	else:
349	self.unknown_endtag(tag)
350	del self.stack[-1]
351
352	# Overridable -- handle start tag
353	def handle_starttag(self, tag, method, attrs):
354	method(attrs)
355
356	# Overridable -- handle end tag
357	def handle_endtag(self, tag, method):
358	method()
359
360	# Example -- report an unbalanced </...> tag.
361	def report_unbalanced(self, tag):
362	if self.verbose:
363	print '*** Unbalanced </' + tag + '>'
364	print '*** Stack:', self.stack
365
366	def handle_charref(self, name):
367	"""Handle character reference, no need to override."""
368	try:
369	n = int(name)
370	except ValueError:
371	self.unknown_charref(name)
372	return
373	if not 0 <= n <= 255:
374	self.unknown_charref(name)
375	return
376	self.handle_data(chr(n))
377
378	# Definition of entities -- derived classes may override
379	entitydefs = \
380	{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
381
382	def handle_entityref(self, name):
383	"""Handle entity references.
384
385	There should be no need to override this method; it can be
386	tailored by setting up the self.entitydefs mapping appropriately.
387	"""
388	table = self.entitydefs
389	if name in table:
390	self.handle_data(table[name])
391	else:
392	self.unknown_entityref(name)
393	return
394
395	# Example -- handle data, should be overridden
396	def handle_data(self, data):
397	pass
398
399	# Example -- handle comment, could be overridden
400	def handle_comment(self, data):
401	pass
402
403	# Example -- handle declaration, could be overridden
404	def handle_decl(self, decl):
405	pass
406
407	# Example -- handle processing instruction, could be overridden
408	def handle_pi(self, data):
409	pass
410
411	# To be overridden -- handlers for unknown objects
412	def unknown_starttag(self, tag, attrs): pass
413	def unknown_endtag(self, tag): pass
414	def unknown_charref(self, ref): pass
415	def unknown_entityref(self, ref): pass
416
417
418	class TestSGMLParser(SGMLParser):
419
420	def __init__(self, verbose=0):
421	self.testdata = ""
422	SGMLParser.__init__(self, verbose)
423
424	def handle_data(self, data):
425	self.testdata = self.testdata + data
426	if len(repr(self.testdata)) >= 70:
427	self.flush()
428
429	def flush(self):
430	data = self.testdata
431	if data:
432	self.testdata = ""
433	print 'data:', repr(data)
434
435	def handle_comment(self, data):
436	self.flush()
437	r = repr(data)
438	if len(r) > 68:
439	r = r[:32] + '...' + r[-32:]
440	print 'comment:', r
441
442	def unknown_starttag(self, tag, attrs):
443	self.flush()
444	if not attrs:
445	print 'start tag: <' + tag + '>'
446	else:
447	print 'start tag: <' + tag,
448	for name, value in attrs:
449	print name + '=' + '"' + value + '"',
450	print '>'
451
452	def unknown_endtag(self, tag):
453	self.flush()
454	print 'end tag: </' + tag + '>'
455
456	def unknown_entityref(self, ref):
457	self.flush()
458	print '*** unknown entity ref: &' + ref + ';'
459
460	def unknown_charref(self, ref):
461	self.flush()
462	print '*** unknown char ref: &#' + ref + ';'
463
464	def unknown_decl(self, data):
465	self.flush()
466	print '*** unknown decl: [' + data + ']'
467
468	def close(self):
469	SGMLParser.close(self)
470	self.flush()
471
472
473	def test(args = None):
474	import sys
475
476	if args is None:
477	args = sys.argv[1:]
478
479	if args and args[0] == '-s':
480	args = args[1:]
481	klass = SGMLParser
482	else:
483	klass = TestSGMLParser
484
485	if args:
486	file = args[0]
487	else:
488	file = 'test.html'
489
490	if file == '-':
491	f = sys.stdin
492	else:
493	try:
494	f = open(file, 'r')
495	except IOError, msg:
496	print file, ":", msg
497	sys.exit(1)
498
499	data = f.read()
500	if f is not sys.stdin:
501	f.close()
502
503	x = klass()
504	for c in data:
505	x.feed(c)
506	x.close()
507
508
509	if __name__ == '__main__':
510	test()