Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | #ifndef Py_UNICODEOBJECT_H |
2 | #define Py_UNICODEOBJECT_H | |
3 | ||
4 | /* | |
5 | ||
6 | Unicode implementation based on original code by Fredrik Lundh, | |
7 | modified by Marc-Andre Lemburg (mal@lemburg.com) according to the | |
8 | Unicode Integration Proposal (see file Misc/unicode.txt). | |
9 | ||
10 | Copyright (c) Corporation for National Research Initiatives. | |
11 | ||
12 | ||
13 | Original header: | |
14 | -------------------------------------------------------------------- | |
15 | ||
16 | * Yet another Unicode string type for Python. This type supports the | |
17 | * 16-bit Basic Multilingual Plane (BMP) only. | |
18 | * | |
19 | * Written by Fredrik Lundh, January 1999. | |
20 | * | |
21 | * Copyright (c) 1999 by Secret Labs AB. | |
22 | * Copyright (c) 1999 by Fredrik Lundh. | |
23 | * | |
24 | * fredrik@pythonware.com | |
25 | * http://www.pythonware.com | |
26 | * | |
27 | * -------------------------------------------------------------------- | |
28 | * This Unicode String Type is | |
29 | * | |
30 | * Copyright (c) 1999 by Secret Labs AB | |
31 | * Copyright (c) 1999 by Fredrik Lundh | |
32 | * | |
33 | * By obtaining, using, and/or copying this software and/or its | |
34 | * associated documentation, you agree that you have read, understood, | |
35 | * and will comply with the following terms and conditions: | |
36 | * | |
37 | * Permission to use, copy, modify, and distribute this software and its | |
38 | * associated documentation for any purpose and without fee is hereby | |
39 | * granted, provided that the above copyright notice appears in all | |
40 | * copies, and that both that copyright notice and this permission notice | |
41 | * appear in supporting documentation, and that the name of Secret Labs | |
42 | * AB or the author not be used in advertising or publicity pertaining to | |
43 | * distribution of the software without specific, written prior | |
44 | * permission. | |
45 | * | |
46 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO | |
47 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
48 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR | |
49 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
50 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
51 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT | |
52 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
53 | * -------------------------------------------------------------------- */ | |
54 | ||
55 | #include <ctype.h> | |
56 | ||
57 | /* === Internal API ======================================================= */ | |
58 | ||
59 | /* --- Internal Unicode Format -------------------------------------------- */ | |
60 | ||
61 | #ifndef Py_USING_UNICODE | |
62 | ||
63 | #define PyUnicode_Check(op) 0 | |
64 | #define PyUnicode_CheckExact(op) 0 | |
65 | ||
66 | #else | |
67 | ||
68 | /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is | |
69 | properly set, but the default rules below don't set it. I'll | |
70 | sort this out some other day -- fredrik@pythonware.com */ | |
71 | ||
72 | #ifndef Py_UNICODE_SIZE | |
73 | #error Must define Py_UNICODE_SIZE | |
74 | #endif | |
75 | ||
76 | /* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode | |
77 | strings are stored as UCS-2 (with limited support for UTF-16) */ | |
78 | ||
79 | #if Py_UNICODE_SIZE >= 4 | |
80 | #define Py_UNICODE_WIDE | |
81 | #endif | |
82 | ||
83 | /* Set these flags if the platform has "wchar.h", "wctype.h" and the | |
84 | wchar_t type is a 16-bit unsigned type */ | |
85 | /* #define HAVE_WCHAR_H */ | |
86 | /* #define HAVE_USABLE_WCHAR_T */ | |
87 | ||
88 | /* Defaults for various platforms */ | |
89 | #ifndef PY_UNICODE_TYPE | |
90 | ||
91 | /* Windows has a usable wchar_t type (unless we're using UCS-4) */ | |
92 | # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 | |
93 | # define HAVE_USABLE_WCHAR_T | |
94 | # define PY_UNICODE_TYPE wchar_t | |
95 | # endif | |
96 | ||
97 | # if defined(Py_UNICODE_WIDE) | |
98 | # define PY_UNICODE_TYPE Py_UCS4 | |
99 | # endif | |
100 | ||
101 | #endif | |
102 | ||
103 | /* If the compiler provides a wchar_t type we try to support it | |
104 | through the interface functions PyUnicode_FromWideChar() and | |
105 | PyUnicode_AsWideChar(). */ | |
106 | ||
107 | #ifdef HAVE_USABLE_WCHAR_T | |
108 | # ifndef HAVE_WCHAR_H | |
109 | # define HAVE_WCHAR_H | |
110 | # endif | |
111 | #endif | |
112 | ||
113 | #ifdef HAVE_WCHAR_H | |
114 | /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ | |
115 | # ifdef _HAVE_BSDI | |
116 | # include <time.h> | |
117 | # endif | |
118 | # include <wchar.h> | |
119 | #endif | |
120 | ||
121 | /* | |
122 | * Use this typedef when you need to represent a UTF-16 surrogate pair | |
123 | * as a single unsigned integer. | |
124 | */ | |
125 | #if SIZEOF_INT >= 4 | |
126 | typedef unsigned int Py_UCS4; | |
127 | #elif SIZEOF_LONG >= 4 | |
128 | typedef unsigned long Py_UCS4; | |
129 | #endif | |
130 | ||
131 | typedef PY_UNICODE_TYPE Py_UNICODE; | |
132 | ||
133 | /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ | |
134 | ||
135 | /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds | |
136 | produce different external names and thus cause import errors when | |
137 | a Python interpreter and an extension compiled with different | |
138 | Unicode width assumptions are combined. */ | |
139 | ||
140 | #ifndef Py_UNICODE_WIDE | |
141 | ||
142 | # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString | |
143 | # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString | |
144 | # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject | |
145 | # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString | |
146 | # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String | |
147 | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString | |
148 | # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String | |
149 | # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String | |
150 | # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode | |
151 | # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString | |
152 | # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar | |
153 | # define PyUnicode_Compare PyUnicodeUCS2_Compare | |
154 | # define PyUnicode_Concat PyUnicodeUCS2_Concat | |
155 | # define PyUnicode_Contains PyUnicodeUCS2_Contains | |
156 | # define PyUnicode_Count PyUnicodeUCS2_Count | |
157 | # define PyUnicode_Decode PyUnicodeUCS2_Decode | |
158 | # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII | |
159 | # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap | |
160 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 | |
161 | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape | |
162 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 | |
163 | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful | |
164 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 | |
165 | # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful | |
166 | # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape | |
167 | # define PyUnicode_Encode PyUnicodeUCS2_Encode | |
168 | # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII | |
169 | # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap | |
170 | # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal | |
171 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 | |
172 | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape | |
173 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 | |
174 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 | |
175 | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape | |
176 | # define PyUnicode_Find PyUnicodeUCS2_Find | |
177 | # define PyUnicode_Format PyUnicodeUCS2_Format | |
178 | # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject | |
179 | # define PyUnicode_FromObject PyUnicodeUCS2_FromObject | |
180 | # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal | |
181 | # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode | |
182 | # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar | |
183 | # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding | |
184 | # define PyUnicode_GetMax PyUnicodeUCS2_GetMax | |
185 | # define PyUnicode_GetSize PyUnicodeUCS2_GetSize | |
186 | # define PyUnicode_Join PyUnicodeUCS2_Join | |
187 | # define PyUnicode_Replace PyUnicodeUCS2_Replace | |
188 | # define PyUnicode_Resize PyUnicodeUCS2_Resize | |
189 | # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding | |
190 | # define PyUnicode_Split PyUnicodeUCS2_Split | |
191 | # define PyUnicode_RSplit PyUnicodeUCS2_RSplit | |
192 | # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines | |
193 | # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch | |
194 | # define PyUnicode_Translate PyUnicodeUCS2_Translate | |
195 | # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap | |
196 | # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString | |
197 | # define _PyUnicode_Fini _PyUnicodeUCS2_Fini | |
198 | # define _PyUnicode_Init _PyUnicodeUCS2_Init | |
199 | # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha | |
200 | # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit | |
201 | # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit | |
202 | # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak | |
203 | # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase | |
204 | # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric | |
205 | # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase | |
206 | # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase | |
207 | # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace | |
208 | # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit | |
209 | # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit | |
210 | # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase | |
211 | # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric | |
212 | # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase | |
213 | # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase | |
214 | ||
215 | #else | |
216 | ||
217 | # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString | |
218 | # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString | |
219 | # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject | |
220 | # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString | |
221 | # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String | |
222 | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString | |
223 | # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String | |
224 | # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String | |
225 | # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode | |
226 | # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString | |
227 | # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar | |
228 | # define PyUnicode_Compare PyUnicodeUCS4_Compare | |
229 | # define PyUnicode_Concat PyUnicodeUCS4_Concat | |
230 | # define PyUnicode_Contains PyUnicodeUCS4_Contains | |
231 | # define PyUnicode_Count PyUnicodeUCS4_Count | |
232 | # define PyUnicode_Decode PyUnicodeUCS4_Decode | |
233 | # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII | |
234 | # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap | |
235 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 | |
236 | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape | |
237 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 | |
238 | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful | |
239 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 | |
240 | # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful | |
241 | # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape | |
242 | # define PyUnicode_Encode PyUnicodeUCS4_Encode | |
243 | # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII | |
244 | # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap | |
245 | # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal | |
246 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 | |
247 | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape | |
248 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 | |
249 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 | |
250 | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape | |
251 | # define PyUnicode_Find PyUnicodeUCS4_Find | |
252 | # define PyUnicode_Format PyUnicodeUCS4_Format | |
253 | # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject | |
254 | # define PyUnicode_FromObject PyUnicodeUCS4_FromObject | |
255 | # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal | |
256 | # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode | |
257 | # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar | |
258 | # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding | |
259 | # define PyUnicode_GetMax PyUnicodeUCS4_GetMax | |
260 | # define PyUnicode_GetSize PyUnicodeUCS4_GetSize | |
261 | # define PyUnicode_Join PyUnicodeUCS4_Join | |
262 | # define PyUnicode_Replace PyUnicodeUCS4_Replace | |
263 | # define PyUnicode_Resize PyUnicodeUCS4_Resize | |
264 | # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding | |
265 | # define PyUnicode_Split PyUnicodeUCS4_Split | |
266 | # define PyUnicode_RSplit PyUnicodeUCS4_RSplit | |
267 | # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines | |
268 | # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch | |
269 | # define PyUnicode_Translate PyUnicodeUCS4_Translate | |
270 | # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap | |
271 | # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString | |
272 | # define _PyUnicode_Fini _PyUnicodeUCS4_Fini | |
273 | # define _PyUnicode_Init _PyUnicodeUCS4_Init | |
274 | # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha | |
275 | # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit | |
276 | # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit | |
277 | # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak | |
278 | # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase | |
279 | # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric | |
280 | # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase | |
281 | # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase | |
282 | # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace | |
283 | # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit | |
284 | # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit | |
285 | # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase | |
286 | # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric | |
287 | # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase | |
288 | # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase | |
289 | ||
290 | #endif | |
291 | ||
292 | /* --- Internal Unicode Operations ---------------------------------------- */ | |
293 | ||
294 | /* If you want Python to use the compiler's wctype.h functions instead | |
295 | of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or | |
296 | configure Python using --with-wctype-functions. This reduces the | |
297 | interpreter's code size. */ | |
298 | ||
299 | #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) | |
300 | ||
301 | #include <wctype.h> | |
302 | ||
303 | #define Py_UNICODE_ISSPACE(ch) iswspace(ch) | |
304 | ||
305 | #define Py_UNICODE_ISLOWER(ch) iswlower(ch) | |
306 | #define Py_UNICODE_ISUPPER(ch) iswupper(ch) | |
307 | #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) | |
308 | #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) | |
309 | ||
310 | #define Py_UNICODE_TOLOWER(ch) towlower(ch) | |
311 | #define Py_UNICODE_TOUPPER(ch) towupper(ch) | |
312 | #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) | |
313 | ||
314 | #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) | |
315 | #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) | |
316 | #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) | |
317 | ||
318 | #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) | |
319 | #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) | |
320 | #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) | |
321 | ||
322 | #define Py_UNICODE_ISALPHA(ch) iswalpha(ch) | |
323 | ||
324 | #else | |
325 | ||
326 | #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) | |
327 | ||
328 | #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) | |
329 | #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) | |
330 | #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) | |
331 | #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) | |
332 | ||
333 | #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) | |
334 | #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) | |
335 | #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) | |
336 | ||
337 | #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) | |
338 | #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) | |
339 | #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) | |
340 | ||
341 | #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) | |
342 | #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) | |
343 | #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) | |
344 | ||
345 | #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) | |
346 | ||
347 | #endif | |
348 | ||
349 | #define Py_UNICODE_ISALNUM(ch) \ | |
350 | (Py_UNICODE_ISALPHA(ch) || \ | |
351 | Py_UNICODE_ISDECIMAL(ch) || \ | |
352 | Py_UNICODE_ISDIGIT(ch) || \ | |
353 | Py_UNICODE_ISNUMERIC(ch)) | |
354 | ||
355 | #define Py_UNICODE_COPY(target, source, length)\ | |
356 | (memcpy((target), (source), (length)*sizeof(Py_UNICODE))) | |
357 | ||
358 | #define Py_UNICODE_FILL(target, value, length) do\ | |
359 | {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value); for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;}\ | |
360 | while (0) /* Stores value into the first length slots of target; target and value are each evaluated once, and the locals end in _ so they cannot capture caller names */ | |
361 | ||
362 | #define Py_UNICODE_MATCH(string, offset, substring)\ | |
363 | ((*((string)->str + (offset)) == *((substring)->str)) &&\ | |
364 | !memcmp((string)->str + (offset), (substring)->str,\ | |
365 | (substring)->length*sizeof(Py_UNICODE))) | |
366 | ||
367 | #ifdef __cplusplus | |
368 | extern "C" { | |
369 | #endif | |
370 | ||
371 | /* --- Unicode Type ------------------------------------------------------- */ | |
372 | ||
373 | typedef struct { /* Instance struct read directly by the PyUnicode_GET_SIZE / PyUnicode_AS_UNICODE family of macros */ | |
374 | PyObject_HEAD | |
375 | int length; /* Length of raw Unicode data in buffer, counted in Py_UNICODE units (not bytes) */ | |
376 | Py_UNICODE *str; /* Raw Unicode buffer */ | |
377 | long hash; /* Hash value; -1 if not set */ | |
378 | PyObject *defenc; /* (Default) Encoded version as Python | |
379 | string, or NULL; this is used for | |
380 | implementing the buffer protocol */ | |
381 | } PyUnicodeObject; | |
382 | ||
383 | PyAPI_DATA(PyTypeObject) PyUnicode_Type; | |
384 | ||
385 | #define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type) | |
386 | #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) | |
387 | ||
388 | /* Fast access macros: op is cast to PyUnicodeObject * without any type check. GET_SIZE yields the length in Py_UNICODE units and GET_DATA_SIZE the same length in bytes; AS_UNICODE and AS_DATA yield the str buffer as Py_UNICODE * and const char * respectively. */ | |
389 | #define PyUnicode_GET_SIZE(op) \ | |
390 | (((PyUnicodeObject *)(op))->length) | |
391 | #define PyUnicode_GET_DATA_SIZE(op) \ | |
392 | (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) | |
393 | #define PyUnicode_AS_UNICODE(op) \ | |
394 | (((PyUnicodeObject *)(op))->str) | |
395 | #define PyUnicode_AS_DATA(op) \ | |
396 | ((const char *)((PyUnicodeObject *)(op))->str) | |
397 | ||
398 | /* --- Constants ---------------------------------------------------------- */ | |
399 | ||
400 | /* This Unicode character will be used as replacement character during | |
401 | decoding if the errors argument is set to "replace". Note: the | |
402 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in | |
403 | Unicode 3.0. */ | |
404 | ||
405 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) | |
406 | ||
407 | /* === Public API ========================================================= */ | |
408 | ||
409 | /* --- Plain Py_UNICODE --------------------------------------------------- */ | |
410 | ||
411 | /* Create a Unicode Object from the Py_UNICODE buffer u of the given | |
412 | size. | |
413 | ||
414 | u may be NULL which causes the contents to be undefined. It is the | |
415 | user's responsibility to fill in the needed data afterwards. Note | |
416 | that modifying the Unicode object contents after construction is | |
417 | only allowed if u was set to NULL. | |
418 | ||
419 | The buffer is copied into the new object. */ | |
420 | ||
421 | PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( | |
422 | const Py_UNICODE *u, /* Unicode buffer */ | |
423 | int size /* size of buffer */ | |
424 | ); | |
425 | ||
426 | /* Return a read-only pointer to the Unicode object's internal | |
427 | Py_UNICODE buffer. */ | |
428 | ||
429 | PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( | |
430 | PyObject *unicode /* Unicode object */ | |
431 | ); | |
432 | ||
433 | /* Get the length of the Unicode object. */ | |
434 | ||
435 | PyAPI_FUNC(int) PyUnicode_GetSize( | |
436 | PyObject *unicode /* Unicode object */ | |
437 | ); | |
438 | ||
439 | /* Get the maximum ordinal for a Unicode character. */ | |
440 | PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); | |
441 | ||
442 | /* Resize an already allocated Unicode object to the new size length. | |
443 | ||
444 | *unicode is modified to point to the new (resized) object and 0 | |
445 | returned on success. | |
446 | ||
447 | This API may only be called by the function which also called the | |
448 | Unicode constructor. The refcount on the object must be 1. Otherwise, | |
449 | an error is returned. | |
450 | ||
451 | Error handling is implemented as follows: an exception is set, -1 | |
452 | is returned and *unicode left untouched. | |
453 | ||
454 | */ | |
455 | ||
456 | PyAPI_FUNC(int) PyUnicode_Resize( | |
457 | PyObject **unicode, /* Pointer to the Unicode object */ | |
458 | int length /* New length */ | |
459 | ); | |
460 | ||
461 | /* Coerce obj to a Unicode object and return a reference with | |
462 | *incremented* refcount. | |
463 | ||
464 | Coercion is done in the following way: | |
465 | ||
466 | 1. String and other char buffer compatible objects are decoded | |
467 | under the assumptions that they contain data using the current | |
468 | default encoding. Decoding is done in "strict" mode. | |
469 | ||
470 | 2. All other objects (including Unicode objects) raise an | |
471 | exception. | |
472 | ||
473 | The API returns NULL in case of an error. The caller is responsible | |
474 | for decref'ing the returned objects. | |
475 | ||
476 | */ | |
477 | ||
478 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( | |
479 | register PyObject *obj, /* Object */ | |
480 | const char *encoding, /* encoding */ | |
481 | const char *errors /* error handling */ | |
482 | ); | |
483 | ||
484 | /* Coerce obj to a Unicode object and return a reference with | |
485 | *incremented* refcount. | |
486 | ||
487 | Unicode objects are passed back as-is (subclasses are converted to | |
488 | true Unicode objects), all other objects are delegated to | |
489 | PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in | |
490 | using the default encoding as basis for decoding the object. | |
491 | ||
492 | The API returns NULL in case of an error. The caller is responsible | |
493 | for decref'ing the returned objects. | |
494 | ||
495 | */ | |
496 | ||
497 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject( | |
498 | register PyObject *obj /* Object */ | |
499 | ); | |
500 | ||
501 | /* --- wchar_t support for platforms which support it --------------------- */ | |
502 | ||
503 | #ifdef HAVE_WCHAR_H | |
504 | ||
505 | /* Create a Unicode Object from the wchar_t buffer w of the given | |
506 | size. | |
507 | ||
508 | The buffer is copied into the new object. */ | |
509 | ||
510 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( | |
511 | register const wchar_t *w, /* wchar_t buffer */ | |
512 | int size /* size of buffer */ | |
513 | ); | |
514 | ||
515 | /* Copies the Unicode Object contents into the wchar_t buffer w. At | |
516 | most size wchar_t characters are copied. | |
517 | ||
518 | Note that the resulting wchar_t string may or may not be | |
519 | 0-terminated. It is the responsibility of the caller to make sure | |
520 | that the wchar_t string is 0-terminated in case this is required by | |
521 | the application. | |
522 | ||
523 | Returns the number of wchar_t characters copied (excluding a | |
524 | possibly trailing 0-termination character) or -1 in case of an | |
525 | error. */ | |
526 | ||
527 | PyAPI_FUNC(int) PyUnicode_AsWideChar( | |
528 | PyUnicodeObject *unicode, /* Unicode object */ | |
529 | register wchar_t *w, /* wchar_t buffer */ | |
530 | int size /* size of buffer */ | |
531 | ); | |
532 | ||
533 | #endif | |
534 | ||
535 | /* --- Unicode ordinals --------------------------------------------------- */ | |
536 | ||
537 | /* Create a Unicode Object from the given Unicode code point ordinal. | |
538 | ||
539 | The ordinal must be in range(0x10000) on narrow Python builds | |
540 | (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is | |
541 | raised in case it is not. | |
542 | ||
543 | */ | |
544 | ||
545 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); | |
546 | ||
547 | /* === Builtin Codecs ===================================================== | |
548 | ||
549 | Many of these APIs take two arguments encoding and errors. These | |
550 | parameters encoding and errors have the same semantics as the ones | |
551 | of the builtin unicode() API. | |
552 | ||
553 | Setting encoding to NULL causes the default encoding to be used. | |
554 | ||
555 | Error handling is set by errors which may also be set to NULL | |
556 | meaning to use the default handling defined for the codec. Default | |
557 | error handling for all builtin codecs is "strict" (ValueErrors are | |
558 | raised). | |
559 | ||
560 | The codecs all use a similar interface. Only deviations from the | |
561 | generic interface are documented. | |
562 | ||
563 | */ | |
564 | ||
565 | /* --- Manage the default encoding ---------------------------------------- */ | |
566 | ||
567 | /* Return a Python string holding the default encoded value of the | |
568 | Unicode object. | |
569 | ||
570 | The resulting string is cached in the Unicode object for subsequent | |
571 | usage by this function. The cached version is needed to implement | |
572 | the character buffer interface and will live (at least) as long as | |
573 | the Unicode object itself. | |
574 | ||
575 | The refcount of the string is *not* incremented. | |
576 | ||
577 | *** Exported for internal use by the interpreter only !!! *** | |
578 | ||
579 | */ | |
580 | ||
581 | PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( | |
582 | PyObject *, const char *); | |
583 | ||
584 | /* Returns the currently active default encoding. | |
585 | ||
586 | The default encoding is currently implemented as a run-time settable | |
587 | process global. This may change in future versions of the | |
588 | interpreter to become a parameter which is managed on a per-thread | |
589 | basis. | |
590 | ||
591 | */ | |
592 | ||
593 | PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); | |
594 | ||
595 | /* Sets the currently active default encoding. | |
596 | ||
597 | Returns 0 on success, -1 in case of an error. | |
598 | ||
599 | */ | |
600 | ||
601 | PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( | |
602 | const char *encoding /* Encoding name in standard form */ | |
603 | ); | |
604 | ||
605 | /* --- Generic Codecs ----------------------------------------------------- */ | |
606 | ||
607 | /* Create a Unicode object by decoding the encoded string s of the | |
608 | given size. */ | |
609 | ||
610 | PyAPI_FUNC(PyObject*) PyUnicode_Decode( | |
611 | const char *s, /* encoded string */ | |
612 | int size, /* size of buffer */ | |
613 | const char *encoding, /* encoding */ | |
614 | const char *errors /* error handling */ | |
615 | ); | |
616 | ||
617 | /* Encodes a Py_UNICODE buffer of the given size and returns a | |
618 | Python string object. */ | |
619 | ||
620 | PyAPI_FUNC(PyObject*) PyUnicode_Encode( | |
621 | const Py_UNICODE *s, /* Unicode char buffer */ | |
622 | int size, /* number of Py_UNICODE chars to encode */ | |
623 | const char *encoding, /* encoding */ | |
624 | const char *errors /* error handling */ | |
625 | ); | |
626 | ||
627 | /* Encodes a Unicode object and returns the result as Python | |
628 | object. */ | |
629 | ||
630 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( | |
631 | PyObject *unicode, /* Unicode object */ | |
632 | const char *encoding, /* encoding */ | |
633 | const char *errors /* error handling */ | |
634 | ); | |
635 | ||
636 | /* Encodes a Unicode object and returns the result as Python string | |
637 | object. */ | |
638 | ||
639 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( | |
640 | PyObject *unicode, /* Unicode object */ | |
641 | const char *encoding, /* encoding */ | |
642 | const char *errors /* error handling */ | |
643 | ); | |
644 | ||
645 | /* --- UTF-7 Codecs ------------------------------------------------------- */ | |
646 | ||
647 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( | |
648 | const char *string, /* UTF-7 encoded string */ | |
649 | int length, /* size of string */ | |
650 | const char *errors /* error handling */ | |
651 | ); | |
652 | ||
653 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( | |
654 | const Py_UNICODE *data, /* Unicode char buffer */ | |
655 | int length, /* number of Py_UNICODE chars to encode */ | |
656 | int encodeSetO, /* force the encoder to encode characters in | |
657 | Set O, as described in RFC2152 */ | |
658 | int encodeWhiteSpace, /* force the encoder to encode space, tab, | |
659 | carriage return and linefeed characters */ | |
660 | const char *errors /* error handling */ | |
661 | ); | |
662 | ||
663 | /* --- UTF-8 Codecs ------------------------------------------------------- */ | |
664 | ||
665 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( | |
666 | const char *string, /* UTF-8 encoded string */ | |
667 | int length, /* size of string */ | |
668 | const char *errors /* error handling */ | |
669 | ); | |
670 | ||
671 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( | |
672 | const char *string, /* UTF-8 encoded string */ | |
673 | int length, /* size of string */ | |
674 | const char *errors, /* error handling */ | |
675 | int *consumed /* bytes consumed */ | |
676 | ); | |
677 | ||
678 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( | |
679 | PyObject *unicode /* Unicode object */ | |
680 | ); | |
681 | ||
682 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( | |
683 | const Py_UNICODE *data, /* Unicode char buffer */ | |
684 | int length, /* number of Py_UNICODE chars to encode */ | |
685 | const char *errors /* error handling */ | |
686 | ); | |
687 | ||
688 | /* --- UTF-16 Codecs ------------------------------------------------------ */ | |
689 | ||
690 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns | |
691 | the corresponding Unicode object. | |
692 | ||
693 | errors (if non-NULL) defines the error handling. It defaults | |
694 | to "strict". | |
695 | ||
696 | If byteorder is non-NULL, the decoder starts decoding using the | |
697 | given byte order: | |
698 | ||
699 | *byteorder == -1: little endian | |
700 | *byteorder == 0: native order | |
701 | *byteorder == 1: big endian | |
702 | ||
703 | In native mode, the first two bytes of the stream are checked for a | |
704 | BOM mark. If found, the BOM mark is analysed, the byte order | |
705 | adjusted and the BOM skipped. In the other modes, no BOM mark | |
706 | interpretation is done. After completion, *byteorder is set to the | |
707 | current byte order at the end of input data. | |
708 | ||
709 | If byteorder is NULL, the codec starts in native order mode. | |
710 | ||
711 | */ | |
712 | ||
713 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( | |
714 | const char *string, /* UTF-16 encoded string */ | |
715 | int length, /* size of string */ | |
716 | const char *errors, /* error handling */ | |
717 | int *byteorder /* pointer to byteorder to use | |
718 | 0=native;-1=LE,1=BE; updated on | |
719 | exit */ | |
720 | ); | |
721 | ||
722 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( | |
723 | const char *string, /* UTF-16 encoded string */ | |
724 | int length, /* size of string */ | |
725 | const char *errors, /* error handling */ | |
726 | int *byteorder, /* pointer to byteorder to use | |
727 | 0=native;-1=LE,1=BE; updated on | |
728 | exit */ | |
729 | int *consumed /* bytes consumed */ | |
730 | ); | |
731 | ||
732 | /* Returns a Python string using the UTF-16 encoding in native byte | |
733 | order. The string always starts with a BOM mark. */ | |
734 | ||
735 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( | |
736 | PyObject *unicode /* Unicode object */ | |
737 | ); | |
738 | ||
739 | /* Returns a Python string object holding the UTF-16 encoded value of | |
740 | the Unicode data. | |
741 | ||
742 | The byteorder argument selects the byte order of the output as | |
743 | follows: | |
744 | ||
745 | byteorder == -1: little endian | |
746 | byteorder == 0: native byte order (writes a BOM mark) | |
747 | byteorder == 1: big endian | |
748 | ||
749 | If byteorder is 0, the output string will always start with the | |
750 | Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is | |
751 | prepended. | |
752 | ||
753 | Note that Py_UNICODE data is being interpreted as UTF-16 reduced to | |
754 | UCS-2. This trick makes it possible to add full UTF-16 capabilities | |
755 | at a later point without compromising the APIs. | |
756 | ||
757 | */ | |
758 | ||
759 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( | |
760 | const Py_UNICODE *data, /* Unicode char buffer */ | |
761 | int length, /* number of Py_UNICODE chars to encode */ | |
762 | const char *errors, /* error handling */ | |
763 | int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ | |
764 | ); | |
765 | ||
766 | /* --- Unicode-Escape Codecs ---------------------------------------------- */ | |
767 | ||
768 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( | |
769 | const char *string, /* Unicode-Escape encoded string */ | |
770 | int length, /* size of string */ | |
771 | const char *errors /* error handling */ | |
772 | ); | |
773 | ||
774 | PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( | |
775 | PyObject *unicode /* Unicode object */ | |
776 | ); | |
777 | ||
778 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( | |
779 | const Py_UNICODE *data, /* Unicode char buffer */ | |
780 | int length /* Number of Py_UNICODE chars to encode */ | |
781 | ); | |
782 | ||
783 | /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ | |
784 | ||
785 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( | |
786 | const char *string, /* Raw-Unicode-Escape encoded string */ | |
787 | int length, /* size of string */ | |
788 | const char *errors /* error handling */ | |
789 | ); | |
790 | ||
791 | PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( | |
792 | PyObject *unicode /* Unicode object */ | |
793 | ); | |
794 | ||
795 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( | |
796 | const Py_UNICODE *data, /* Unicode char buffer */ | |
797 | int length /* Number of Py_UNICODE chars to encode */ | |
798 | ); | |
799 | ||
800 | /* --- Unicode Internal Codec --------------------------------------------- | |
801 | ||
802 | Only for internal use in _codecsmodule.c */ | |
803 | ||
804 | PyObject *_PyUnicode_DecodeUnicodeInternal( | |
805 | const char *string, /* Encoded string */ | |
806 | int length, /* size of string */ | |
807 | const char *errors /* error handling */ | |
808 | ); | |
809 | ||
810 | /* --- Latin-1 Codecs ----------------------------------------------------- | |
811 | ||
812 | Note: Latin-1 corresponds to the first 256 Unicode ordinals. | |
813 | ||
814 | */ | |
815 | ||
816 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( | |
817 | const char *string, /* Latin-1 encoded string */ | |
818 | int length, /* size of string */ | |
819 | const char *errors /* error handling */ | |
820 | ); | |
821 | ||
822 | PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( | |
823 | PyObject *unicode /* Unicode object */ | |
824 | ); | |
825 | ||
826 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( | |
827 | const Py_UNICODE *data, /* Unicode char buffer */ | |
828 | int length, /* Number of Py_UNICODE chars to encode */ | |
829 | const char *errors /* error handling */ | |
830 | ); | |
831 | ||
832 | /* --- ASCII Codecs ------------------------------------------------------- | |
833 | ||
834 | Only 7-bit ASCII data is accepted. All other codes generate errors. | |
835 | ||
836 | */ | |
837 | ||
838 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( | |
839 | const char *string, /* ASCII encoded string */ | |
840 | int length, /* size of string */ | |
841 | const char *errors /* error handling */ | |
842 | ); | |
843 | ||
844 | PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( | |
845 | PyObject *unicode /* Unicode object */ | |
846 | ); | |
847 | ||
848 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( | |
849 | const Py_UNICODE *data, /* Unicode char buffer */ | |
850 | int length, /* Number of Py_UNICODE chars to encode */ | |
851 | const char *errors /* error handling */ | |
852 | ); | |
853 | ||
854 | /* --- Character Map Codecs ----------------------------------------------- | |
855 | ||
856 | This codec uses mappings to encode and decode characters. | |
857 | ||
858 | Decoding mappings must map single string characters to single | |
859 | Unicode characters, integers (which are then interpreted as Unicode | |
860 | ordinals) or None (meaning "undefined mapping" and causing an | |
861 | error). | |
862 | ||
863 | Encoding mappings must map single Unicode characters to single | |
864 | string characters, integers (which are then interpreted as Latin-1 | |
865 | ordinals) or None (meaning "undefined mapping" and causing an | |
866 | error). | |
867 | ||
868 | If a character lookup fails with a LookupError, the character is | |
869 | copied as-is meaning that its ordinal value will be interpreted as | |
870 | Unicode or Latin-1 ordinal resp. Because of this mappings only need | |
871 | to contain those mappings which map characters to different code | |
872 | points. | |
873 | ||
874 | */ | |
875 | ||
876 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( | |
877 | const char *string, /* Encoded string */ | |
878 | int length, /* size of string */ | |
879 | PyObject *mapping, /* character mapping | |
880 | (char ordinal -> unicode ordinal) */ | |
881 | const char *errors /* error handling */ | |
882 | ); | |
883 | ||
884 | PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( | |
885 | PyObject *unicode, /* Unicode object */ | |
886 | PyObject *mapping /* character mapping | |
887 | (unicode ordinal -> char ordinal) */ | |
888 | ); | |
889 | ||
890 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( | |
891 | const Py_UNICODE *data, /* Unicode char buffer */ | |
892 | int length, /* Number of Py_UNICODE chars to encode */ | |
893 | PyObject *mapping, /* character mapping | |
894 | (unicode ordinal -> char ordinal) */ | |
895 | const char *errors /* error handling */ | |
896 | ); | |
897 | ||
898 | /* Translate a Py_UNICODE buffer of the given length by applying a | |
899 | character mapping table to it and return the resulting Unicode | |
900 | object. | |
901 | ||
902 | The mapping table must map Unicode ordinal integers to Unicode | |
903 | ordinal integers or None (causing deletion of the character). | |
904 | ||
905 | Mapping tables may be dictionaries or sequences. Unmapped character | |
906 | ordinals (ones which cause a LookupError) are left untouched and | |
907 | are copied as-is. | |
908 | ||
909 | */ | |
910 | ||
911 | PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( | |
912 | const Py_UNICODE *data, /* Unicode char buffer */ | |
913 | int length, /* Number of Py_UNICODE chars to translate */ | |
914 | PyObject *table, /* Translate table */ | |
915 | const char *errors /* error handling */ | |
916 | ); | |
917 | ||
918 | #ifdef MS_WIN32 | |
919 | ||
920 | /* --- MBCS codecs for Windows -------------------------------------------- */ | |
921 | ||
922 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( | |
923 | const char *string, /* MBCS encoded string */ | |
924 | int length, /* size of string */ | |
925 | const char *errors /* error handling */ | |
926 | ); | |
927 | ||
928 | PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( | |
929 | PyObject *unicode /* Unicode object */ | |
930 | ); | |
931 | ||
932 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( | |
933 | const Py_UNICODE *data, /* Unicode char buffer */ | |
934 | int length, /* Number of Py_UNICODE chars to encode */ | |
935 | const char *errors /* error handling */ | |
936 | ); | |
937 | ||
938 | #endif /* MS_WIN32 */ | |
939 | ||
940 | /* --- Decimal Encoder ---------------------------------------------------- */ | |
941 | ||
942 | /* Takes a Unicode string holding a decimal value and writes it into | |
943 | an output buffer using standard ASCII digit codes. | |
944 | ||
945 | The output buffer has to provide at least length+1 bytes of storage | |
946 | area. The output string is 0-terminated. | |
947 | ||
948 | The encoder converts whitespace to ' ', decimal characters to their | |
949 | corresponding ASCII digit and all other Latin-1 characters except | |
950 | \0 as-is. Characters outside this range (Unicode ordinals 1-255) | |
951 | are treated as errors. This includes embedded NUL characters. | |
952 | ||
953 | Error handling is defined by the errors argument: | |
954 | ||
955 | NULL or "strict": raise a ValueError | |
956 | "ignore": ignore the wrong characters (these are not copied to the | |
957 | output buffer) | |
958 | "replace": replaces illegal characters with '?' | |
959 | ||
960 | Returns 0 on success, -1 on failure. | |
961 | ||
962 | */ | |
963 | ||
964 | PyAPI_FUNC(int) PyUnicode_EncodeDecimal( | |
965 | Py_UNICODE *s, /* Unicode buffer */ | |
966 | int length, /* Number of Py_UNICODE chars to encode */ | |
967 | char *output, /* Output buffer; must have size >= length+1 (0-terminated) */ | |
968 | const char *errors /* error handling */ | |
969 | ); | |
970 | ||
971 | /* --- Methods & Slots ---------------------------------------------------- | |
972 | ||
973 | These are capable of handling Unicode objects and strings on input | |
974 | (we refer to them as strings in the descriptions) and return | |
975 | Unicode objects or integers as appropriate. */ | |
976 | ||
977 | /* Concat two strings giving a new Unicode string. */ | |
978 | ||
979 | PyAPI_FUNC(PyObject*) PyUnicode_Concat( | |
980 | PyObject *left, /* Left string */ | |
981 | PyObject *right /* Right string */ | |
982 | ); | |
983 | ||
984 | /* Split a string giving a list of Unicode strings. | |
985 | ||
986 | If sep is NULL, splitting will be done at all whitespace | |
987 | substrings. Otherwise, splits occur at the given separator. | |
988 | ||
989 | At most maxsplit splits will be done. If negative, no limit is set. | |
990 | ||
991 | Separators are not included in the resulting list. | |
992 | ||
993 | */ | |
994 | ||
995 | PyAPI_FUNC(PyObject*) PyUnicode_Split( | |
996 | PyObject *s, /* String to split */ | |
997 | PyObject *sep, /* String separator */ | |
998 | int maxsplit /* Maxsplit count */ | |
999 | ); | |
1000 | ||
1001 | /* Ditto, but split at line breaks. | |
1002 | ||
1003 | CRLF is considered to be one line break. Line breaks are only | |
1004 | included in the resulting list if keepends is true. */ | |
1005 | ||
1006 | PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( | |
1007 | PyObject *s, /* String to split */ | |
1008 | int keepends /* If true, line end markers are included */ | |
1009 | ); | |
1010 | ||
1011 | /* Split a string giving a list of Unicode strings. | |
1012 | ||
1013 | If sep is NULL, splitting will be done at all whitespace | |
1014 | substrings. Otherwise, splits occur at the given separator. | |
1015 | ||
1016 | At most maxsplit splits will be done; if maxsplit is negative, no | |
1017 | limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits | |
1018 | starting from the end of the string. | |
1019 | ||
1020 | Separators are not included in the resulting list. | |
1021 | ||
1022 | */ | |
1023 | ||
1024 | PyAPI_FUNC(PyObject*) PyUnicode_RSplit( | |
1025 | PyObject *s, /* String to split */ | |
1026 | PyObject *sep, /* String separator */ | |
1027 | int maxsplit /* Maxsplit count */ | |
1028 | ); | |
1029 | ||
1030 | /* Translate a string by applying a character mapping table to it and | |
1031 | return the resulting Unicode object. | |
1032 | ||
1033 | The mapping table must map Unicode ordinal integers to Unicode | |
1034 | ordinal integers or None (causing deletion of the character). | |
1035 | ||
1036 | Mapping tables may be dictionaries or sequences. Unmapped character | |
1037 | ordinals (ones which cause a LookupError) are left untouched and | |
1038 | are copied as-is. | |
1039 | ||
1040 | */ | |
1041 | ||
1042 | PyAPI_FUNC(PyObject *) PyUnicode_Translate( | |
1043 | PyObject *str, /* String */ | |
1044 | PyObject *table, /* Translate table */ | |
1045 | const char *errors /* error handling */ | |
1046 | ); | |
1047 | ||
1048 | /* Join a sequence of strings using the given separator and return | |
1049 | the resulting Unicode string. */ | |
1050 | ||
1051 | PyAPI_FUNC(PyObject*) PyUnicode_Join( | |
1052 | PyObject *separator, /* Separator string */ | |
1053 | PyObject *seq /* Sequence object */ | |
1054 | ); | |
1055 | ||
1056 | /* Return 1 if substr matches str[start:end] at the given tail end, 0 | |
1057 | otherwise. */ | |
1058 | ||
1059 | PyAPI_FUNC(int) PyUnicode_Tailmatch( | |
1060 | PyObject *str, /* String */ | |
1061 | PyObject *substr, /* Prefix or Suffix string */ | |
1062 | int start, /* Start index */ | |
1063 | int end, /* Stop index */ | |
1064 | int direction /* Tail end: -1 prefix, +1 suffix */ | |
1065 | ); | |
1066 | ||
1067 | /* Return the first position of substr in str[start:end] using the | |
1068 | given search direction or -1 if not found. -2 is returned in case | |
1069 | an error occurred and an exception is set. */ | |
1070 | ||
1071 | PyAPI_FUNC(int) PyUnicode_Find( | |
1072 | PyObject *str, /* String */ | |
1073 | PyObject *substr, /* Substring to find */ | |
1074 | int start, /* Start index */ | |
1075 | int end, /* Stop index */ | |
1076 | int direction /* Find direction: +1 forward, -1 backward */ | |
1077 | ); | |
1078 | ||
1079 | /* Count the number of occurrences of substr in str[start:end]. */ | |
1080 | ||
1081 | PyAPI_FUNC(int) PyUnicode_Count( | |
1082 | PyObject *str, /* String */ | |
1083 | PyObject *substr, /* Substring to count */ | |
1084 | int start, /* Start index */ | |
1085 | int end /* Stop index */ | |
1086 | ); | |
1087 | ||
1088 | /* Replace at most maxcount occurrences of substr in str with replstr | |
1089 | and return the resulting Unicode object. */ | |
1090 | ||
1091 | PyAPI_FUNC(PyObject *) PyUnicode_Replace( | |
1092 | PyObject *str, /* String */ | |
1093 | PyObject *substr, /* Substring to find */ | |
1094 | PyObject *replstr, /* Substring to replace */ | |
1095 | int maxcount /* Max. number of replacements to apply; | |
1096 | -1 = all */ | |
1097 | ); | |
1098 | ||
1099 | /* Compare two strings and return -1, 0, 1 for less than, equal, | |
1100 | greater than resp. */ | |
1101 | ||
1102 | PyAPI_FUNC(int) PyUnicode_Compare( | |
1103 | PyObject *left, /* Left string */ | |
1104 | PyObject *right /* Right string */ | |
1105 | ); | |
1106 | ||
1107 | /* Apply an argument tuple or dictionary to a format string and return | |
1108 | the resulting Unicode string. */ | |
1109 | ||
1110 | PyAPI_FUNC(PyObject *) PyUnicode_Format( | |
1111 | PyObject *format, /* Format string */ | |
1112 | PyObject *args /* Argument tuple or dictionary */ | |
1113 | ); | |
1114 | ||
1115 | /* Checks whether element is contained in container and return 1/0 | |
1116 | accordingly. | |
1117 | ||
1118 | element has to coerce to a one-element Unicode string. -1 is | |
1119 | returned in case of an error. */ | |
1120 | ||
1121 | PyAPI_FUNC(int) PyUnicode_Contains( | |
1122 | PyObject *container, /* Container string */ | |
1123 | PyObject *element /* Element string */ | |
1124 | ); | |
1125 | ||
1126 | /* Externally visible for str.strip(unicode) */ | |
1127 | PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( | |
1128 | PyUnicodeObject *self, | |
1129 | int striptype, | |
1130 | PyObject *sepobj | |
1131 | ); | |
1132 | ||
1133 | /* === Characters Type APIs =============================================== */ | |
1134 | ||
1135 | /* These should not be used directly. Use the Py_UNICODE_IS* and | |
1136 | Py_UNICODE_TO* macros instead. | |
1137 | ||
1138 | These APIs are implemented in Objects/unicodectype.c. | |
1139 | ||
1140 | */ | |
1141 | ||
1142 | PyAPI_FUNC(int) _PyUnicode_IsLowercase( | |
1143 | Py_UNICODE ch /* Unicode character */ | |
1144 | ); | |
1145 | ||
1146 | PyAPI_FUNC(int) _PyUnicode_IsUppercase( | |
1147 | Py_UNICODE ch /* Unicode character */ | |
1148 | ); | |
1149 | ||
1150 | PyAPI_FUNC(int) _PyUnicode_IsTitlecase( | |
1151 | Py_UNICODE ch /* Unicode character */ | |
1152 | ); | |
1153 | ||
1154 | PyAPI_FUNC(int) _PyUnicode_IsWhitespace( | |
1155 | Py_UNICODE ch /* Unicode character */ | |
1156 | ); | |
1157 | ||
1158 | PyAPI_FUNC(int) _PyUnicode_IsLinebreak( | |
1159 | Py_UNICODE ch /* Unicode character */ | |
1160 | ); | |
1161 | ||
1162 | PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( | |
1163 | Py_UNICODE ch /* Unicode character */ | |
1164 | ); | |
1165 | ||
1166 | PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( | |
1167 | Py_UNICODE ch /* Unicode character */ | |
1168 | ); | |
1169 | ||
1170 | PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( | |
1171 | Py_UNICODE ch /* Unicode character */ | |
1172 | ); | |
1173 | ||
1174 | PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( | |
1175 | Py_UNICODE ch /* Unicode character */ | |
1176 | ); | |
1177 | ||
1178 | PyAPI_FUNC(int) _PyUnicode_ToDigit( | |
1179 | Py_UNICODE ch /* Unicode character */ | |
1180 | ); | |
1181 | ||
1182 | PyAPI_FUNC(double) _PyUnicode_ToNumeric( | |
1183 | Py_UNICODE ch /* Unicode character */ | |
1184 | ); | |
1185 | ||
1186 | PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( | |
1187 | Py_UNICODE ch /* Unicode character */ | |
1188 | ); | |
1189 | ||
1190 | PyAPI_FUNC(int) _PyUnicode_IsDigit( | |
1191 | Py_UNICODE ch /* Unicode character */ | |
1192 | ); | |
1193 | ||
1194 | PyAPI_FUNC(int) _PyUnicode_IsNumeric( | |
1195 | Py_UNICODE ch /* Unicode character */ | |
1196 | ); | |
1197 | ||
1198 | PyAPI_FUNC(int) _PyUnicode_IsAlpha( | |
1199 | Py_UNICODE ch /* Unicode character */ | |
1200 | ); | |
1201 | ||
1202 | #ifdef __cplusplus | |
1203 | } | |
1204 | #endif | |
1205 | #endif /* Py_USING_UNICODE */ | |
1206 | #endif /* !Py_UNICODEOBJECT_H */ |