Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """ |
2 | SAX driver for the pyexpat C module. This driver works with | |
3 | pyexpat.__version__ == '2.22'. | |
4 | """ | |
5 | ||
6 | version = "0.20" | |
7 | ||
8 | from xml.sax._exceptions import * | |
9 | from xml.sax.handler import feature_validation, feature_namespaces | |
10 | from xml.sax.handler import feature_namespace_prefixes | |
11 | from xml.sax.handler import feature_external_ges, feature_external_pes | |
12 | from xml.sax.handler import feature_string_interning | |
13 | from xml.sax.handler import property_xml_string, property_interning_dict | |
14 | ||
# xml.parsers.expat does not raise ImportError in Jython, so that
# platform is rejected explicitly before the import is attempted.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# A missing module and a module without ParserCreate are both reported
# as "expat not supported".
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
if not hasattr(expat, "ParserCreate"):
    raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler
29 | ||
# Module-level shorthands for the xmlreader attribute containers that
# the element handlers wrap expat's attributes in.
AttributesNSImpl = xmlreader.AttributesNSImpl
AttributesImpl = xmlreader.AttributesImpl
32 | ||
# With a sufficiently recent Python, weak references avoid a cycle
# between the parser and the content handler.  Without them, _mkproxy
# degrades to the identity function and the cycle is simply tolerated.
try:
    # Probe only: the C module tells us whether weakrefs exist at all.
    import _weakref
    del _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    # Binds weakref.proxy directly, leaving no module name behind.
    from weakref import proxy as _mkproxy
45 | ||
46 | # --- ExpatLocator | |
47 | ||
class ExpatLocator(xmlreader.Locator):
    """Locator that reports positions on behalf of an ExpatParser.

    The parser is held through _mkproxy() rather than directly, so that
    (when weak references are available) the locator does not create a
    circular reference between the parser and the content handler.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without a parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without a parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public id of the owner's current input source."""
        owner = self._ref
        if owner is not None:
            return owner._source.getPublicId()
        return None

    def getSystemId(self):
        """Return the system id of the owner's current input source."""
        owner = self._ref
        if owner is not None:
            return owner._source.getSystemId()
        return None
80 | ||
81 | ||
82 | # --- ExpatParser | |
83 | ||
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    Owns an expat parser object (created by reset()) and forwards its
    callbacks to the content, DTD, entity-resolver, error and optional
    lexical handlers registered on this reader.  The reader is also a
    Locator: line and column come from expat's ErrorLineNumber and
    ErrorColumnNumber, ids from the current input source.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a reader.

        namespaceHandling -- initial state of the namespaces feature;
            when true, elements are reported via startElementNS and
            endElementNS instead of startElement and endElement.
        bufsize -- handed unchanged to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        # The expat parser; None until reset() runs and again after close().
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs saved while an external entity is parsed.
        self._entity_stack = []
        self._external_ges = 1
        # Interning dictionary passed to expat, or None when disabled.
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Pass the source's system id, when it has one, to expat's SetBase."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Register the content handler, rebinding expat mid-parse if needed."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature `name`.

        Validation, external parameter entities and namespace prefixes
        always report 0.  Raises SAXNotRecognizedException for any other
        unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to switch on validation, external parameter entities
        or namespace prefixes.  Raises SAXNotRecognizedException for an
        unknown feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing dictionary so interned strings survive.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property `name`.

        The XML-string property is only available while an expat parser
        exists and only if that parser offers GetInputContext; otherwise
        SAXNotSupportedException or SAXNotRecognizedException is raised.
        Unknown property names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        The lexical handler takes effect immediately when set during a
        parse.  The XML-string property is read-only and raises
        SAXNotSupportedException; unknown names raise
        SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Feed a chunk of document text to expat.

        The first chunk of a document resets the reader and reports
        startDocument.  An expat error is wrapped in a SAXParseException
        (with this reader as locator) and passed to the error handler's
        fatalError.
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            # "as" form: valid on Python 2.6+ and on Python 3.
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document and release the expat parser.

        Does nothing while an external entity is being parsed.
        """
        if self._entity_stack:
            # If we are completing an external entity, do nothing here
            return
        self.feed("", isFinal=1)
        self._cont_handler.endDocument()
        self._parsing = 0
        # break cycle created by expat handlers pointing to our methods
        self._parser = None

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks at the content handler."""
        self._parser.ProcessingInstructionHandler = \
            self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind or clear expat's comment, CDATA and doctype callbacks."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # Routed through this reader to reorder the id arguments.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and attach every callback to it."""
        if self._namespaces:
            # " " is the separator expat puts between the parts of a
            # namespace-qualified name; the *_ns handlers split on it.
            self._parser = expat.ParserCreate(None, " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a namespace-aware start tag.

        Names arrive as one, two or three space-separated parts: a bare
        local name, "uri local", or "uri local prefix".  Elements are
        reported as (uri, local) pairs with a qname of None; attribute
        qnames are rebuilt as "prefix:local" when a prefix is present.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report a namespace-aware end tag as a (uri, local) pair."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # expat supplies sysid before pubid; startDTD is called with
        # pubid first.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Resolve and parse an external general entity.

        Returns 1 when the entity was skipped (feature switched off) or
        parsed successfully, and 0 when parsing it raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Swap in a sub-parser for the entity; the outer state is saved
        # so it can be restored once the entity has been read.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE: every exception is swallowed here and the saved
            # (parser, source) pair stays on the entity stack.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
400 | ||
401 | # --- | |
402 | ||
def create_parser(*args, **kwargs):
    """Build and return an ExpatParser, forwarding all arguments to it."""
    reader = ExpatParser(*args, **kwargs)
    return reader
405 | ||
406 | # --- | |
407 | ||
if __name__ == "__main__":
    # Manual smoke test: parse a sample document with an XMLGenerator as
    # the content handler.  XMLGenerator is defined in xml.sax.saxutils;
    # the xml.sax package does not export it, so it must be reached
    # through the submodule.
    import xml.sax
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")