Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | import xml.sax |
2 | import xml.sax.handler | |
3 | import types | |
4 | ||
5 | try: | |
6 | _StringTypes = [types.StringType, types.UnicodeType] | |
7 | except AttributeError: | |
8 | _StringTypes = [types.StringType] | |
9 | ||
# Event type tags.  Each event placed on the queue is a ``(TAG, node)``
# tuple whose first item is one of these strings.

# Element boundaries.
START_ELEMENT = 'START_ELEMENT'
END_ELEMENT = 'END_ELEMENT'

# Non-element content.
COMMENT = 'COMMENT'

# Document boundaries.
START_DOCUMENT = 'START_DOCUMENT'
END_DOCUMENT = 'END_DOCUMENT'

# Remaining node kinds.
PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
IGNORABLE_WHITESPACE = 'IGNORABLE_WHITESPACE'
CHARACTERS = 'CHARACTERS'
18 | ||
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records parser callbacks as DOM events.

    Events are kept in a singly linked list of two-item lists
    ``[event, next_cell]`` where ``event`` is a ``(TYPE, node)`` tuple.
    ``firstEvent`` is a dummy head cell; ``lastEvent`` is the tail cell onto
    which new events are linked.  ``elementStack`` holds the document and
    the currently open elements.
    """
    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create an empty event queue, element stack and namespace context.

        documentFactory -- object whose ``createDocument(uri, tagname, None)``
        is used by buildDocument(); when None, startDocument() installs
        xml.dom.minidom's implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # Comments / processing instructions seen before the document exists.
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack.

        Only used when the list type offers no ``pop`` of its own; otherwise
        __init__ shadows this method with ``elementStack.pop``.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _append_event(self, event):
        """Link a new cell holding *event* onto the tail of the event list."""
        cell = [event, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Build ``prefix:localname`` (or just localname) from the current context.

        Raises KeyError when *uri* has no entry in the current context.
        """
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    def setDocumentLocator(self, locator):
        """Remember the locator handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the element about to start."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # A default namespace (empty prefix) is stored under the name 'xmlns'.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot of the context as it was *before* this mapping;
        # endPrefixMapping() restores it.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create a namespaced element node and queue a START_ELEMENT event.

        name    -- ``(uri, localname)`` pair
        tagName -- qualified name, or None if the reader did not supply one
        attrs   -- attribute mapping keyed by ``(uri, localname)`` pairs
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Create a non-namespaced element node and queue a START_ELEMENT event."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self._append_event((COMMENT, node))
        else:
            # No document to create the node from; buildDocument() turns
            # the raw text into a node later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._append_event((PROCESSING_INSTRUCTION, node))
        else:
            # Same deferral as comment(): keep the raw (target, data) pair.
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((IGNORABLE_WHITESPACE, node))

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((CHARACTERS, node))

    def startDocument(self):
        """Install the minidom implementation if no factory was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT, and flush pending events.

        Returns the first child of the new document (the node the caller
        treats as the root element).
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._append_event((START_DOCUMENT, node))
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            # The pending cell itself becomes the new tail of the list.
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue an END_DOCUMENT event and pop the document off the stack."""
        # Advance the tail like every other event producer does, so that
        # lastEvent really is the last queued cell afterwards.
        self._append_event((END_DOCUMENT, self.document))
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
201 | ||
class ErrorHandler:
    """Minimal SAX error handler: print warnings, raise everything else."""

    def warning(self, exception):
        """Print the warning and continue."""
        # Parenthesised single-argument form: same output as the print
        # statement, and also valid where print is a function.
        print(exception)

    def error(self, exception):
        """Re-raise a recoverable parse error."""
        raise exception

    def fatalError(self, exception):
        """Re-raise a non-recoverable parse error."""
        raise exception
209 | ||
class DOMEventStream:
    """Iterator over the ``(TYPE, node)`` events produced while parsing a stream.

    The stream is read ``bufsize`` units at a time and fed to the parser;
    events are taken from the PullDOM handler's queue.  Parsers without a
    ``feed`` method are driven through ``parse()`` instead (see _slurp).
    """

    def __init__(self, stream, parser, bufsize):
        """Store the collaborators and install a fresh PullDOM handler."""
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: read everything on first request.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Attach a new PullDOM handler to the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event regardless of *pos*; IndexError when exhausted."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event; raise StopIteration when exhausted."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    # Same method under the name the newer iterator protocol looks up.
    __next__ = next

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events, attaching each node to its parent below *node*.

        Stops at the first event whose node is *node* itself, or when the
        event source is exhausted.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next queued event, feeding the parser as needed.

        Returns None once the stream yields no more data.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: reset the tail to the dummy head cell.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once the queue is empty, so that next() and
            __getitem__() can signal exhaustion.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
294 | ||
class SAX2DOM(PullDOM):
    """PullDOM variant that also links each created node into the tree."""

    def buildDocument(self, uri, tagname):
        """Build the document, then attach processing instructions queued before it.

        The base class turns every pending event into a ``(TYPE, node)``
        pair but leaves the nodes unattached; insert the processing
        instructions ahead of the root element so they are part of the tree.
        """
        pending = self.pending_events or []
        root = PullDOM.buildDocument(self, uri, tagname)
        for event in pending:
            kind, node = event[0]
            if kind == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName, attrs):
        """Create the element as PullDOM does and append it to its parent."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Create the element as PullDOM does and append it to its parent."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Create the processing instruction and append it to the open node."""
        have_document = self.document
        PullDOM.processingInstruction(self, target, data)
        if not have_document:
            # The base class only queued the instruction; there is neither
            # a node nor a parent yet.  buildDocument() attaches it later.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Create the whitespace text node and append it to the open node."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Create the text node and append it to the open node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
326 | ||
327 | ||
# Read size used by parse() when the caller does not pass ``bufsize``:
# twenty below 2**14 (16384).
default_bufsize = 2 ** 14 - 20
329 | ||
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an already open stream.

    stream_or_string -- a string is treated as a file name and opened;
                        anything else is used as the stream directly
    parser           -- SAX parser to use; a default one is created if omitted
    bufsize          -- read size per chunk; defaults to ``default_bufsize``
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: the XML parser does its own decoding, so the bytes
        # must reach it without newline translation.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
340 | ||
def parseString(string, parser=None):
    """Return a DOMEventStream over the XML text held in *string*.

    The whole string is handed to the parser in a single read, since the
    buffer size is set to ``len(string)``.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module exists; use the io implementation.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)