| 1 | """ Standard "encodings" Package |
| 2 | |
| 3 | Standard Python encoding modules are stored in this package |
| 4 | directory. |
| 5 | |
| 6 | Codec modules must have names corresponding to normalized encoding |
| 7 | names as defined in the normalize_encoding() function below, e.g. |
| 8 | 'utf-8' must be implemented by the module 'utf_8.py'. |
| 9 | |
| 10 | Each codec module must export the following interface: |
| 11 | |
| 12 | * getregentry() -> (encoder, decoder, stream_reader, stream_writer) |
| 13 | The getregentry() API must return callable objects which adhere to |
| 14 | the Python Codec Interface Standard. |
| 15 | |
| 16 | In addition, a module may optionally also define the following |
| 17 | APIs which are then used by the package's codec search function: |
| 18 | |
| 19 | * getaliases() -> sequence of encoding name strings to use as aliases |
| 20 | |
| 21 | Alias names returned by getaliases() must be normalized encoding |
| 22 | names as defined by normalize_encoding(). |
| 23 | |
| 24 | Written by Marc-Andre Lemburg (mal@lemburg.com). |
| 25 | |
| 26 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
| 27 | |
| 28 | """#" |
| 29 | |
| 30 | import codecs, exceptions, types, aliases |
| 31 | |
# Codec registry entries keyed by the encoding name exactly as it was passed
# to search_function(); a value of None records a failed lookup.
_cache = {}
# Sentinel default for _cache.get().  It is compared by identity so that a
# cached None (a remembered miss) can be told apart from "never looked up".
_unknown = '--unknown--'
# Non-empty fromlist handed to __import__() so that the named module itself
# is returned rather than its top-level package.
_import_tail = ['*']
# 256-entry table for str.translate(): the dot, the ASCII digits and the
# ASCII letters map to themselves, every other character maps to a space.
# The table is built programmatically so that its length is 256 by
# construction and does not depend on hand-counted padding in a literal.
_norm_table = [' '] * 256
for _norm_char in ('.'
                   '0123456789'
                   'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                   'abcdefghijklmnopqrstuvwxyz'):
    _norm_table[ord(_norm_char)] = _norm_char
_norm_encoding_map = ''.join(_norm_table)
# Drop the construction temporaries from the module namespace.
del _norm_table, _norm_char
# Alias table taken from the imported aliases module.  search_function()
# looks up normalized encoding names here to obtain the name of the codec
# module to import, and adds entries for aliases that a codec module
# reports through getaliases().
_aliases = aliases.aliases
| 42 | |
class CodecRegistryError(LookupError, SystemError):

    """ Error raised by search_function() when the registry entry
        returned by a codec module's getregentry() is not made up of
        exactly four callable objects.

    """
| 46 | |
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        encoding -- the encoding name, as an 8-bit string or a Unicode
        string (Unicode input is first encoded to Latin-1).

        Returns the normalized name as an 8-bit string.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.  isinstance() is used so that
    # subclasses of the Unicode type are converted as well.
    if isinstance(encoding, types.UnicodeType):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')
    # Every character to be dropped becomes a space; split() then
    # collapses runs of them and discards leading/trailing ones.
    return '_'.join(encoding.translate(_norm_encoding_map).split())
| 68 | |
def search_function(encoding):

    """ Look up the codec registry entry for an encoding name.

        The name is normalized with normalize_encoding() and resolved
        through the alias table; the resulting module names are then
        imported in turn until one import succeeds.

        encoding -- the encoding name to look up.

        Returns the tuple built from the codec module's getregentry()
        result, or None if no module could be imported or the imported
        module does not define getregentry().  Both outcomes are cached
        under the encoding name exactly as it was passed in.

        Raises CodecRegistryError if the registry entry does not have
        exactly four items or if any of the items is not callable.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Skip empty names (e.g. an encoding that normalizes to '')
        if not modname:
            continue
        try:
            mod = __import__(modname,
                             globals(), locals(), _import_tail)
        except ImportError:
            pass
        else:
            break
    else:
        # None of the candidate names could be imported
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (this also covers mod being None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = tuple(getregentry())
    if len(entry) != 4:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register' %
            (mod.__name__, mod.__file__))
    for obj in entry:
        if not callable(obj):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        # getaliases() is optional
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                # modname is the name under which the import succeeded
                _aliases[alias] = modname

    # Return the registry entry
    return entry
| 143 | |
# Register search_function in the Python codec registry.  This runs as a
# module-level statement, i.e. as a side effect of importing this package.
codecs.register(search_function)