Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """ Standard "encodings" Package |
2 | ||
3 | Standard Python encoding modules are stored in this package | |
4 | directory. | |
5 | ||
6 | Codec modules must have names corresponding to normalized encoding | |
7 | names as defined in the normalize_encoding() function below, e.g. | |
8 | 'utf-8' must be implemented by the module 'utf_8.py'. | |
9 | ||
10 | Each codec module must export the following interface: | |
11 | ||
12 | * getregentry() -> (encoder, decoder, stream_reader, stream_writer) | |
13 | The getregentry() API must return callable objects which adhere to | |
14 | the Python Codec Interface Standard. | |
15 | ||
16 | In addition, a module may optionally also define the following | |
17 | APIs which are then used by the package's codec search function: | |
18 | ||
19 | * getaliases() -> sequence of encoding name strings to use as aliases | |
20 | ||
21 | Alias names returned by getaliases() must be normalized encoding | |
22 | names as defined by normalize_encoding(). | |
23 | ||
24 | Written by Marc-Andre Lemburg (mal@lemburg.com). | |
25 | ||
26 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | |
27 | ||
28 | """#" | |
29 | ||
30 | import codecs, exceptions, types, aliases | |
31 | ||
# Cache of codec registry entries, keyed by the encoding name exactly
# as it was passed to search_function(). Failed lookups are cached
# as None.
_cache = {}

# Sentinel used as the default for _cache.get(), so that a cached
# None (a known miss) can be told apart from a name that has never
# been looked up.
_unknown = '--unknown--'

# Passed as the "fromlist" argument of __import__() by
# search_function(). A non-empty fromlist makes __import__() return
# the module that was named rather than its top-level package.
_import_tail = ['*']

def _build_norm_encoding_map():

    """ Build the translation table used by normalize_encoding().

        The table has exactly 256 entries, as required by the
        .translate() method of 8-bit strings. ASCII letters, digits
        and the dot map to themselves; every other character maps to
        a space.

    """
    table = [' '] * 256
    for char in ('.0123456789'
                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'):
        table[ord(char)] = char
    return ''.join(table)

_norm_encoding_map = _build_norm_encoding_map()

# Alias table mapping normalized encoding names to codec module names.
# This is the dictionary owned by the aliases module itself, so
# aliases registered by search_function() are stored there as well.
_aliases = aliases.aliases
42 | ||
class CodecRegistryError(LookupError, SystemError):

    """ Raised by search_function() when a codec module was found but
        the value returned by its getregentry() is not a sequence of
        exactly four callable objects.

        Derives from both LookupError and SystemError, so handlers
        for either of these also catch it.

    """
46 | ||
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        Returns the normalized name as an 8-bit string.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings. isinstance() is used rather
    # than an exact type comparison so that instances of Unicode
    # subclasses are converted as well.
    if isinstance(encoding, types.UnicodeType):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')

    # The table turns every separator character into a space;
    # .split() then drops leading/trailing runs and collapses inner
    # runs, so joining the words yields single underscores.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)
68 | ||
def search_function(encoding):

    """ Codec search function registered with the codec registry.

        Locates the codec module for the given encoding name and
        returns its registry entry: the tuple of four callables
        obtained from the module's getregentry(). Returns None if no
        codec module could be found.

        Both hits and misses are cached under the encoding name
        exactly as passed in. Aliases announced by the module's
        optional getaliases() are added to the alias table without
        overwriting existing entries.

        Raises CodecRegistryError if getregentry() does not return
        exactly four callable objects.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding, norm_encoding]
    else:
        modnames = [norm_encoding]

    mod = None
    for modname in modnames:
        if not modname:
            continue
        try:
            mod = __import__(modname,
                             globals(), locals(), _import_tail)
        except ImportError:
            # Deliberately ignored: fall through to the next
            # candidate name, if any.
            continue
        # modname now names the module that was imported; it is used
        # below when registering the module's aliases.
        break

    if mod is not None:
        try:
            getregentry = mod.getregentry
        except AttributeError:
            # Not a codec module
            mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = tuple(getregentry())

    # Not every module has a __file__ attribute; don't let the error
    # report itself fail with an AttributeError.
    origin = (mod.__name__, getattr(mod, '__file__', None))
    if len(entry) != 4:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register' % origin)
    for obj in entry:
        if not callable(obj):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' % origin)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        # getaliases() is optional
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
143 | ||
# Register the search_function in the Python codec registry. This runs
# at import time, so importing this package is what installs
# search_function() as a codec search function.
codecs.register(search_function)