Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / src / nas,5.n2.os.2 / lib / python / lib / python2.4 / codecs.py
CommitLineData
86530b38
AT
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
# The codec registry primitives (register, lookup, register_error,
# lookup_error, ...) are implemented in C; pull them all into this module.
try:
    from _codecs import *
except ImportError, why:
    # Without the C accelerator module the codec machinery cannot work at
    # all, so escalate the ImportError to a hard SystemError.
    raise SystemError,\
          'Failed to load the builtin codecs: %s' % why

# Public API of this module: registry access, file helpers, BOM constants
# and the predefined error-handler callables.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
27
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Pick the native-endianness aliases once, at import time.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Note that despite the "32"
# and "64" in the names these alias the UTF-16 resp. UTF-32 marks above;
# they are kept only for backwards compatibility.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
72
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Abstract interface for stateless encoders/decoders.

        Subclasses implement .encode() and .decode().  Both take an
        errors argument naming the error-handling scheme to apply;
        the predefined schemes are:

         'strict'            - raise a ValueError (or a subclass)
         'ignore'            - drop the offending character and go on
         'replace'           - substitute a replacement character (the
                               builtin Unicode codecs use the official
                               U+FFFD REPLACEMENT CHARACTER when
                               decoding and '?' when encoding)
         'xmlcharrefreplace' - substitute the appropriate XML character
                               reference (encoding only)
         'backslashreplace'  - substitute backslashed escape sequences
                               (encoding only)

        Further schemes can be added via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            use StreamCodec for codecs that need state for efficient
            encoding/decoding.  Zero length input must be accepted and
            must yield an empty object of the output type.
        """
        # Concrete codecs provide the actual transformation.
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must provide the bf_getreadbuf buffer slot; Python
            strings, buffer objects and memory mapped files all do.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            use StreamCodec for codecs that need state for efficient
            encoding/decoding.  Zero length input must be accepted and
            must yield an empty object of the output type.
        """
        # Concrete codecs provide the actual transformation.
        raise NotImplementedError
139
140#
141# The StreamWriter and StreamReader class provide generic working
142# interfaces which can be used to implement new encoding submodules
143# very easily. See encodings/utf_8.py for an example on how this is
144# done.
145#
146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors names the error-handling scheme applied while
            encoding; the predefined values are:

             'strict'            - raise a ValueError (or a subclass)
             'ignore'            - drop the character and continue
             'replace'           - substitute a suitable replacement
                                   character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference
             'backslashreplace'  - substitute backslashed escape
                                   sequences (only for encoding)

            Additional values can be registered via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.errors and write the result to
            the underlying stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Write a list of strings to the stream by concatenating
            them and delegating to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should leave the output in a clean
            state, so that fresh data can be appended without having
            to rescan the whole stream to recover state.
        """
        # Stateless base implementation: nothing to flush.
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Bytes read from the stream but not yet decoded (e.g. an
        # incomplete multi-byte sequence at the end of a chunk).
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # Decoded lines cached by readline(); when set, it takes
        # precedence over charbuffer (read() merges it back first).
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden by the concrete codec's reader class.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                # no char limit: any buffered data satisfies the request
                if self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry on just the cleanly decodable prefix; if that
                    # prefix still holds no complete first line, re-raise.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available (EOF): stop trying
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

                line += data
                lines = line.splitlines(True)
                if lines:
                    if len(lines) > 1:
                        # More than one line result; the first line is a full line
                        # to return
                        line = lines[0]
                        del lines[0]
                        if len(lines) > 1:
                            # cache the remaining lines
                            lines[-1] += self.charbuffer
                            self.linebuffer = lines
                            self.charbuffer = None
                        else:
                            # only one remaining line, put it back into charbuffer
                            self.charbuffer = lines[0] + self.charbuffer
                        if not keepends:
                            line = line.splitlines(False)[0]
                        break
                    line0withend = lines[0]
                    line0withoutend = lines[0].splitlines(False)[0]
                    if line0withend != line0withoutend: # We really have a line end
                        # Put the rest back together and keep it until the next call
                        self.charbuffer = "".join(lines[1:]) + self.charbuffer
                        if keepends:
                            line = line0withend
                        else:
                            line = line0withoutend
                        break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the probe size geometrically, capped at 8000 bytes
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # NOTE(review): __init__ starts charbuffer as a plain str, but
        # reset() forces it to unicode -- presumably fine for
        # str->unicode codecs; confirm for str->str codecs.
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
442
443###
444
445class StreamReaderWriter:
446
447 """ StreamReaderWriter instances allow wrapping streams which
448 work in both read and write modes.
449
450 The design is such that one can use the factory functions
451 returned by the codec.lookup() function to construct the
452 instance.
453
454 """
455 # Optional attributes set by the file wrappers below
456 encoding = 'unknown'
457
458 def __init__(self, stream, Reader, Writer, errors='strict'):
459
460 """ Creates a StreamReaderWriter instance.
461
462 stream must be a Stream-like object.
463
464 Reader, Writer must be factory functions or classes
465 providing the StreamReader, StreamWriter interface resp.
466
467 Error handling is done in the same way as defined for the
468 StreamWriter/Readers.
469
470 """
471 self.stream = stream
472 self.reader = Reader(stream, errors)
473 self.writer = Writer(stream, errors)
474 self.errors = errors
475
476 def read(self, size=-1):
477
478 return self.reader.read(size)
479
480 def readline(self, size=None):
481
482 return self.reader.readline(size)
483
484 def readlines(self, sizehint=None):
485
486 return self.reader.readlines(sizehint)
487
488 def next(self):
489
490 """ Return the next decoded line from the input stream."""
491 return self.reader.next()
492
493 def __iter__(self):
494 return self
495
496 def write(self, data):
497
498 return self.writer.write(data)
499
500 def writelines(self, list):
501
502 return self.writer.writelines(list)
503
504 def reset(self):
505
506 self.reader.reset()
507 self.writer.reset()
508
509 def __getattr__(self, name,
510 getattr=getattr):
511
512 """ Inherit all other methods from the underlying stream.
513 """
514 return getattr(self.stream, name)
515
516###
517
518class StreamRecoder:
519
520 """ StreamRecoder instances provide a frontend - backend
521 view of encoding data.
522
523 They use the complete set of APIs returned by the
524 codecs.lookup() function to implement their task.
525
526 Data written to the stream is first decoded into an
527 intermediate format (which is dependent on the given codec
528 combination) and then written to the stream using an instance
529 of the provided Writer class.
530
531 In the other direction, data is read from the stream using a
532 Reader instance and then return encoded data to the caller.
533
534 """
535 # Optional attributes set by the file wrappers below
536 data_encoding = 'unknown'
537 file_encoding = 'unknown'
538
539 def __init__(self, stream, encode, decode, Reader, Writer,
540 errors='strict'):
541
542 """ Creates a StreamRecoder instance which implements a two-way
543 conversion: encode and decode work on the frontend (the
544 input to .read() and output of .write()) while
545 Reader and Writer work on the backend (reading and
546 writing to the stream).
547
548 You can use these objects to do transparent direct
549 recodings from e.g. latin-1 to utf-8 and back.
550
551 stream must be a file-like object.
552
553 encode, decode must adhere to the Codec interface, Reader,
554 Writer must be factory functions or classes providing the
555 StreamReader, StreamWriter interface resp.
556
557 encode and decode are needed for the frontend translation,
558 Reader and Writer for the backend translation. Unicode is
559 used as intermediate encoding.
560
561 Error handling is done in the same way as defined for the
562 StreamWriter/Readers.
563
564 """
565 self.stream = stream
566 self.encode = encode
567 self.decode = decode
568 self.reader = Reader(stream, errors)
569 self.writer = Writer(stream, errors)
570 self.errors = errors
571
572 def read(self, size=-1):
573
574 data = self.reader.read(size)
575 data, bytesencoded = self.encode(data, self.errors)
576 return data
577
578 def readline(self, size=None):
579
580 if size is None:
581 data = self.reader.readline()
582 else:
583 data = self.reader.readline(size)
584 data, bytesencoded = self.encode(data, self.errors)
585 return data
586
587 def readlines(self, sizehint=None):
588
589 data = self.reader.read()
590 data, bytesencoded = self.encode(data, self.errors)
591 return data.splitlines(1)
592
593 def next(self):
594
595 """ Return the next decoded line from the input stream."""
596 data = self.reader.next()
597 data, bytesencoded = self.encode(data, self.errors)
598 return data
599
600 def __iter__(self):
601 return self
602
603 def write(self, data):
604
605 data, bytesdecoded = self.decode(data, self.errors)
606 return self.writer.write(data)
607
608 def writelines(self, list):
609
610 data = ''.join(list)
611 data, bytesdecoded = self.decode(data, self.errors)
612 return self.writer.write(data)
613
614 def reset(self):
615
616 self.reader.reset()
617 self.writer.reset()
618
619 def __getattr__(self, name,
620 getattr=getattr):
621
622 """ Inherit all other methods from the underlying stream.
623 """
624 return getattr(self.stream, name)
625
626### Shortcuts
627
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: the wrapped version only accepts the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs; its output is codec dependent and usually Unicode as
        well.

        Files are always opened in binary mode (a 'b' is appended to
        mode when missing) to avoid data loss with encodings that use
        8-bit values.  The default mode 'rb' opens the file for binary
        reading.

        encoding specifies the encoding to be used for the file; when
        it is None the plain builtin file object is returned unchanged.

        errors defines the error handling; it defaults to 'strict',
        which causes ValueErrors to be raised on encoding errors.

        buffering has the same meaning as for the builtin open() API
        and defaults to line buffered.

        The returned wrapped file object provides an extra .encoding
        attribute for querying the used encoding; the attribute is only
        present when an encoding was given.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    (_encoder, _decoder, Reader, Writer) = lookup(encoding)
    wrapped = StreamReaderWriter(stream, Reader, Writer, errors)
    # Add attributes to simplify introspection
    wrapped.encoding = encoding
    return wrapped
671
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding.  The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings read from the file are decoded using file_encoding and
        passed back to the caller encoded with data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors defines the error handling; it defaults to 'strict',
        which causes ValueErrors to be raised on encoding errors.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name, for use in introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec translates between caller data and the intermediate
    # form; backend Reader/Writer talk to the actual file.
    data_encode, data_decode, _reader, _writer = lookup(data_encoding)
    _encode, _decode, Reader, Writer = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            data_encode, data_decode, Reader, Writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
708
709### Helpers for codec lookup
710
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    encoder, _decoder, _reader, _writer = lookup(encoding)
    return encoder
720
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    _encoder, decoder, _reader, _writer = lookup(encoding)
    return decoder
730
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    _encoder, _decoder, reader, _writer = lookup(encoding)
    return reader
740
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    _encoder, _decoder, _reader, writer = lookup(encoding)
    return writer
750
751### Helpers for charmap-based codecs
752
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary mapping each element of the rng sequence
        to itself.

    """
    # Generator expression keeps this working for any iterable, not
    # just indexable sequences.
    return dict((elem, elem) for elem in rng)
765
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target value occurs more than once in the decoding map,
        that target is mapped to None (undefined mapping) in the
        result, causing an exception when it is encountered by the
        charmap codec during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value in encoding_map:
            # Ambiguous target: poison the entry instead of guessing.
            encoding_map[value] = None
        else:
            encoding_map[value] = key
    return encoding_map
786
787### error handlers
788
# Convenience aliases for the standard error-handler callables, resolved
# once at import time via lookup_error().
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
802
# Tell modulefinder that using codecs probably needs the encodings
# package.  The guard is always false, so the import never runs at
# runtime but stays visible to static analysis.
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')