Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """ codecs -- Python Codec Registry, API and helpers. |
2 | ||
3 | ||
4 | Written by Marc-Andre Lemburg (mal@lemburg.com). | |
5 | ||
6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | |
7 | ||
8 | """#" | |
9 | ||
10 | import __builtin__, sys | |
11 | ||
12 | ### Registry and builtin stateless codec functions | |
13 | ||
14 | try: | |
15 | from _codecs import * | |
16 | except ImportError, why: | |
17 | raise SystemError,\ | |
18 | 'Failed to load the builtin codecs: %s' % why | |
19 | ||
# Public API of this module: registry helpers, the file wrappers and the
# BOM constants (several of these names are re-exported from _codecs).
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
27 | ||
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Despite the "32"/"64" in the
# names these alias the UTF-16 resp. UTF-32 BOMs; kept for backward
# compatibility only.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
72 | ||
73 | ||
74 | ### Codec base classes (defining the API) | |
75 | ||
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The predefined string
        values are:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        Additional values may be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode input and return a tuple (output object, length
            consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must be stateless; codecs that need to
            carry state between calls should build on StreamCodec
            instead.

            Zero length input must be supported and yield an empty
            object of the output type.

        """
        # Abstract: concrete codecs supply the implementation.
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode input and return a tuple (output object, length
            consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, e.g. a Python string, a buffer object or a
            memory mapped file.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must be stateless; codecs that need to
            carry state between calls should build on StreamCodec
            instead.

            Zero length input must be supported and yield an empty
            object of the output type.

        """
        # Abstract: concrete codecs supply the implementation.
        raise NotImplementedError
139 | ||
140 | # | |
141 | # The StreamWriter and StreamReader class provide generic working | |
142 | # interfaces which can be used to implement new encoding submodules | |
143 | # very easily. See encodings/utf_8.py for an example on how this is | |
144 | # done. | |
145 | # | |
146 | ||
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The errors keyword argument selects the error handling
            scheme applied while encoding.  The predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            Additional values may be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object and write the result to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Write a sequence of strings to the stream, encoding their
            concatenation in a single .write() call.
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Stateless writers have nothing to do here.  Stateful
            subclasses should override this so that fresh data can be
            appended afterwards without rescanning the whole output
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods and attributes from the
            underlying stream.
        """
        return getattr(self.stream, name)
206 | ||
207 | ### | |
208 | ||
209 | class StreamReader(Codec): | |
210 | ||
211 | def __init__(self, stream, errors='strict'): | |
212 | ||
213 | """ Creates a StreamReader instance. | |
214 | ||
215 | stream must be a file-like object open for reading | |
216 | (binary) data. | |
217 | ||
218 | The StreamReader may use different error handling | |
219 | schemes by providing the errors keyword argument. These | |
220 | parameters are predefined: | |
221 | ||
222 | 'strict' - raise a ValueError (or a subclass) | |
223 | 'ignore' - ignore the character and continue with the next | |
224 | 'replace'- replace with a suitable replacement character; | |
225 | ||
226 | The set of allowed parameter values can be extended via | |
227 | register_error. | |
228 | """ | |
229 | self.stream = stream | |
230 | self.errors = errors | |
231 | self.bytebuffer = "" | |
232 | # For str->str decoding this will stay a str | |
233 | # For str->unicode decoding the first read will promote it to unicode | |
234 | self.charbuffer = "" | |
235 | self.linebuffer = None | |
236 | ||
237 | def decode(self, input, errors='strict'): | |
238 | raise NotImplementedError | |
239 | ||
240 | def read(self, size=-1, chars=-1, firstline=False): | |
241 | ||
242 | """ Decodes data from the stream self.stream and returns the | |
243 | resulting object. | |
244 | ||
245 | chars indicates the number of characters to read from the | |
246 | stream. read() will never return more than chars | |
247 | characters, but it might return less, if there are not enough | |
248 | characters available. | |
249 | ||
250 | size indicates the approximate maximum number of bytes to | |
251 | read from the stream for decoding purposes. The decoder | |
252 | can modify this setting as appropriate. The default value | |
253 | -1 indicates to read and decode as much as possible. size | |
254 | is intended to prevent having to decode huge files in one | |
255 | step. | |
256 | ||
257 | If firstline is true, and a UnicodeDecodeError happens | |
258 | after the first line terminator in the input only the first line | |
259 | will be returned, the rest of the input will be kept until the | |
260 | next call to read(). | |
261 | ||
262 | The method should use a greedy read strategy meaning that | |
263 | it should read as much data as is allowed within the | |
264 | definition of the encoding and the given size, e.g. if | |
265 | optional encoding endings or state markers are available | |
266 | on the stream, these should be read too. | |
267 | """ | |
268 | # If we have lines cached, first merge them back into characters | |
269 | if self.linebuffer: | |
270 | self.charbuffer = "".join(self.linebuffer) | |
271 | self.linebuffer = None | |
272 | ||
273 | # read until we get the required number of characters (if available) | |
274 | while True: | |
275 | # can the request can be satisfied from the character buffer? | |
276 | if chars < 0: | |
277 | if self.charbuffer: | |
278 | break | |
279 | else: | |
280 | if len(self.charbuffer) >= chars: | |
281 | break | |
282 | # we need more data | |
283 | if size < 0: | |
284 | newdata = self.stream.read() | |
285 | else: | |
286 | newdata = self.stream.read(size) | |
287 | # decode bytes (those remaining from the last call included) | |
288 | data = self.bytebuffer + newdata | |
289 | try: | |
290 | newchars, decodedbytes = self.decode(data, self.errors) | |
291 | except UnicodeDecodeError, exc: | |
292 | if firstline: | |
293 | newchars, decodedbytes = self.decode(data[:exc.start], self.errors) | |
294 | lines = newchars.splitlines(True) | |
295 | if len(lines)<=1: | |
296 | raise | |
297 | else: | |
298 | raise | |
299 | # keep undecoded bytes until the next call | |
300 | self.bytebuffer = data[decodedbytes:] | |
301 | # put new characters in the character buffer | |
302 | self.charbuffer += newchars | |
303 | # there was no data available | |
304 | if not newdata: | |
305 | break | |
306 | if chars < 0: | |
307 | # Return everything we've got | |
308 | result = self.charbuffer | |
309 | self.charbuffer = "" | |
310 | else: | |
311 | # Return the first chars characters | |
312 | result = self.charbuffer[:chars] | |
313 | self.charbuffer = self.charbuffer[chars:] | |
314 | return result | |
315 | ||
316 | def readline(self, size=None, keepends=True): | |
317 | ||
318 | """ Read one line from the input stream and return the | |
319 | decoded data. | |
320 | ||
321 | size, if given, is passed as size argument to the | |
322 | read() method. | |
323 | ||
324 | """ | |
325 | # If we have lines cached from an earlier read, return | |
326 | # them unconditionally | |
327 | if self.linebuffer: | |
328 | line = self.linebuffer[0] | |
329 | del self.linebuffer[0] | |
330 | if len(self.linebuffer) == 1: | |
331 | # revert to charbuffer mode; we might need more data | |
332 | # next time | |
333 | self.charbuffer = self.linebuffer[0] | |
334 | self.linebuffer = None | |
335 | if not keepends: | |
336 | line = line.splitlines(False)[0] | |
337 | return line | |
338 | ||
339 | readsize = size or 72 | |
340 | line = "" | |
341 | # If size is given, we call read() only once | |
342 | while True: | |
343 | data = self.read(readsize, firstline=True) | |
344 | if data: | |
345 | # If we're at a "\r" read one extra character (which might | |
346 | # be a "\n") to get a proper line ending. If the stream is | |
347 | # temporarily exhausted we return the wrong line ending. | |
348 | if data.endswith("\r"): | |
349 | data += self.read(size=1, chars=1) | |
350 | ||
351 | line += data | |
352 | lines = line.splitlines(True) | |
353 | if lines: | |
354 | if len(lines) > 1: | |
355 | # More than one line result; the first line is a full line | |
356 | # to return | |
357 | line = lines[0] | |
358 | del lines[0] | |
359 | if len(lines) > 1: | |
360 | # cache the remaining lines | |
361 | lines[-1] += self.charbuffer | |
362 | self.linebuffer = lines | |
363 | self.charbuffer = None | |
364 | else: | |
365 | # only one remaining line, put it back into charbuffer | |
366 | self.charbuffer = lines[0] + self.charbuffer | |
367 | if not keepends: | |
368 | line = line.splitlines(False)[0] | |
369 | break | |
370 | line0withend = lines[0] | |
371 | line0withoutend = lines[0].splitlines(False)[0] | |
372 | if line0withend != line0withoutend: # We really have a line end | |
373 | # Put the rest back together and keep it until the next call | |
374 | self.charbuffer = "".join(lines[1:]) + self.charbuffer | |
375 | if keepends: | |
376 | line = line0withend | |
377 | else: | |
378 | line = line0withoutend | |
379 | break | |
380 | # we didn't get anything or this was our only try | |
381 | if not data or size is not None: | |
382 | if line and not keepends: | |
383 | line = line.splitlines(False)[0] | |
384 | break | |
385 | if readsize<8000: | |
386 | readsize *= 2 | |
387 | return line | |
388 | ||
389 | def readlines(self, sizehint=None, keepends=True): | |
390 | ||
391 | """ Read all lines available on the input stream | |
392 | and return them as list of lines. | |
393 | ||
394 | Line breaks are implemented using the codec's decoder | |
395 | method and are included in the list entries. | |
396 | ||
397 | sizehint, if given, is ignored since there is no efficient | |
398 | way to finding the true end-of-line. | |
399 | ||
400 | """ | |
401 | data = self.read() | |
402 | return data.splitlines(keepends) | |
403 | ||
404 | def reset(self): | |
405 | ||
406 | """ Resets the codec buffers used for keeping state. | |
407 | ||
408 | Note that no stream repositioning should take place. | |
409 | This method is primarily intended to be able to recover | |
410 | from decoding errors. | |
411 | ||
412 | """ | |
413 | self.bytebuffer = "" | |
414 | self.charbuffer = u"" | |
415 | self.linebuffer = None | |
416 | ||
417 | def seek(self, offset, whence=0): | |
418 | """ Set the input stream's current position. | |
419 | ||
420 | Resets the codec buffers used for keeping state. | |
421 | """ | |
422 | self.reset() | |
423 | self.stream.seek(offset, whence) | |
424 | ||
425 | def next(self): | |
426 | ||
427 | """ Return the next decoded line from the input stream.""" | |
428 | line = self.readline() | |
429 | if line: | |
430 | return line | |
431 | raise StopIteration | |
432 | ||
433 | def __iter__(self): | |
434 | return self | |
435 | ||
436 | def __getattr__(self, name, | |
437 | getattr=getattr): | |
438 | ||
439 | """ Inherit all other methods from the underlying stream. | |
440 | """ | |
441 | return getattr(self.stream, name) | |
442 | ||
443 | ### | |
444 | ||
class StreamReaderWriter:

    """ A combined reader/writer wrapper around a single stream that
        is open in both read and write mode.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegate reading (and decoding) to the reader. """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegate line reading to the reader. """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegate bulk line reading to the reader. """
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegate writing (and encoding) to the writer. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegate bulk writing to the writer. """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the codec state kept on both sides. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods and attributes from the
            underlying stream.
        """
        return getattr(self.stream, name)
515 | ||
516 | ### | |
517 | ||
class StreamRecoder:

    """ StreamRecoder instances translate data between a frontend
        encoding (the format seen by callers of .read()/.write())
        and a backend encoding (the format stored on the wrapped
        stream).

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller encoded with
        the frontend codec.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Unicode is used as the intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read from the backend stream and return the data
            re-encoded with the frontend codec.
        """
        intermediate = self.reader.read(size)
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line from the backend stream, re-encoded with the
            frontend codec.
        """
        if size is not None:
            intermediate = self.reader.readline(size)
        else:
            intermediate = self.reader.readline()
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read all remaining data and return it re-encoded, split
            into lines.
        """
        intermediate = self.reader.read()
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded.splitlines(True)

    def next(self):

        """ Return the next decoded line from the input stream."""
        intermediate = self.reader.next()
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data from the frontend encoding and hand the result
            to the backend writer.
        """
        intermediate, _consumed = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    def writelines(self, list):

        """ Decode the concatenated list of strings and hand the result
            to the backend writer.
        """
        joined = ''.join(list)
        intermediate, _consumed = self.decode(joined, self.errors)
        return self.writer.write(intermediate)

    def reset(self):

        """ Reset the codec state kept on both sides. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods and attributes from the
            underlying stream.
        """
        return getattr(self.stream, name)
625 | ||
626 | ### Shortcuts | |
627 | ||
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually by
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    # Encoded data must be read/written as raw bytes, so force binary
    # mode whenever a codec was requested.
    if encoding is not None and 'b' not in mode:
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        # No codec requested: hand back the plain file object.
        return stream
    encoder, decoder, Reader, Writer = lookup(encoding)
    srw = StreamReaderWriter(stream, Reader, Writer, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
671 | ||
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec supplies the stateless encode/decode pair, the
    # backend codec supplies the stream Reader/Writer factories.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file,
                       data_info[0], data_info[1],
                       file_info[2], file_info[3],
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
708 | ||
709 | ### Helpers for codec lookup | |
710 | ||
def getencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]
720 | ||
def getdecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]
730 | ||
def getreader(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]
740 | ||
def getwriter(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]
750 | ||
751 | ### Helpers for charmap-based codecs | |
752 | ||
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # Identity mapping: each element is both key and value.
    return dict((element, element) for element in rng)
765 | ||
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value in encoding_map:
            # duplicate decode target: poison the reverse mapping
            encoding_map[value] = None
        else:
            encoding_map[value] = key
    return encoding_map
786 | ||
787 | ### error handlers | |
788 | ||
try:
    # Bind the standard error handlers registered by _codecs so that
    # they can be imported directly from this module.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
802 | ||
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed at runtime; the import is only here so that
    # modulefinder/freeze tools pick up the encodings package.
    import encodings
808 | ||
809 | ### Tests | |
810 | ||
if __name__ == '__main__':

    # Smoke test / demo: recode between Latin-1 and UTF-8 on the
    # standard streams.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')