|
1 #ifndef Py_UNICODEOBJECT_H |
|
2 #define Py_UNICODEOBJECT_H |
|
3 |
|
4 /* |
|
5 |
|
6 Unicode implementation based on original code by Fredrik Lundh, |
|
7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the |
|
8 Unicode Integration Proposal (see file Misc/unicode.txt). |
|
9 |
|
10 Copyright (c) Corporation for National Research Initiatives. |
|
11 |
|
12 |
|
13 Original header: |
|
14 -------------------------------------------------------------------- |
|
15 |
|
16 * Yet another Unicode string type for Python. This type supports the |
|
17 * 16-bit Basic Multilingual Plane (BMP) only. |
|
18 * |
|
19 * Written by Fredrik Lundh, January 1999. |
|
20 * |
|
21 * Copyright (c) 1999 by Secret Labs AB. |
|
22 * Copyright (c) 1999 by Fredrik Lundh. |
|
23 * |
|
24 * fredrik@pythonware.com |
|
25 * http://www.pythonware.com |
|
26 * |
|
27 * -------------------------------------------------------------------- |
|
28 * This Unicode String Type is |
|
29 * |
|
30 * Copyright (c) 1999 by Secret Labs AB |
|
31 * Copyright (c) 1999 by Fredrik Lundh |
|
32 * |
|
33 * By obtaining, using, and/or copying this software and/or its |
|
34 * associated documentation, you agree that you have read, understood, |
|
35 * and will comply with the following terms and conditions: |
|
36 * |
|
37 * Permission to use, copy, modify, and distribute this software and its |
|
38 * associated documentation for any purpose and without fee is hereby |
|
39 * granted, provided that the above copyright notice appears in all |
|
40 * copies, and that both that copyright notice and this permission notice |
|
41 * appear in supporting documentation, and that the name of Secret Labs |
|
42 * AB or the author not be used in advertising or publicity pertaining to |
|
43 * distribution of the software without specific, written prior |
|
44 * permission. |
|
45 * |
|
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
|
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
|
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
|
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
|
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
53 * -------------------------------------------------------------------- */ |
|
54 |
|
55 #include <ctype.h> |
|
56 |
|
57 /* === Internal API ======================================================= */ |
|
58 |
|
59 /* --- Internal Unicode Format -------------------------------------------- */ |
|
60 |
|
61 #ifndef Py_USING_UNICODE |
|
62 |
|
63 #define PyUnicode_Check(op) 0 |
|
64 #define PyUnicode_CheckExact(op) 0 |
|
65 |
|
66 #else |
|
67 |
|
68 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is |
|
69 properly set, but the default rules below don't set it. I'll |
|
70 sort this out some other day -- fredrik@pythonware.com */ |
|
71 |
|
72 #ifndef Py_UNICODE_SIZE |
|
73 #error Must define Py_UNICODE_SIZE |
|
74 #endif |
|
75 |
|
76 /* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode |
|
77 strings are stored as UCS-2 (with limited support for UTF-16) */ |
|
78 |
|
79 #if Py_UNICODE_SIZE >= 4 |
|
80 #define Py_UNICODE_WIDE |
|
81 #endif |
|
82 |
|
83 /* Set these flags if the platform has "wchar.h", "wctype.h" and the |
|
84 wchar_t type is a 16-bit unsigned type */ |
|
85 /* #define HAVE_WCHAR_H */ |
|
86 /* #define HAVE_USABLE_WCHAR_T */ |
|
87 |
|
88 /* Defaults for various platforms */ |
|
89 #ifndef PY_UNICODE_TYPE |
|
90 |
|
91 /* Windows has a usable wchar_t type (unless we're using UCS-4) */ |
|
92 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 |
|
93 # define HAVE_USABLE_WCHAR_T |
|
94 # define PY_UNICODE_TYPE wchar_t |
|
95 # endif |
|
96 |
|
97 # if defined(Py_UNICODE_WIDE) |
|
98 # define PY_UNICODE_TYPE Py_UCS4 |
|
99 # endif |
|
100 |
|
101 #endif |
|
102 |
|
103 /* If the compiler provides a wchar_t type we try to support it |
|
104 through the interface functions PyUnicode_FromWideChar() and |
|
105 PyUnicode_AsWideChar(). */ |
|
106 |
|
107 #ifdef HAVE_USABLE_WCHAR_T |
|
108 # ifndef HAVE_WCHAR_H |
|
109 # define HAVE_WCHAR_H |
|
110 # endif |
|
111 #endif |
|
112 |
|
113 #ifdef HAVE_WCHAR_H |
|
114 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ |
|
115 # ifdef _HAVE_BSDI |
|
116 # include <time.h> |
|
117 # endif |
|
118 # include <wchar.h> |
|
119 #endif |
|
120 |
|
121 /* |
|
122 * Use this typedef when you need to represent a UTF-16 surrogate pair |
|
123 * as single unsigned integer. |
|
124 */ |
|
125 #if SIZEOF_INT >= 4 |
|
126 typedef unsigned int Py_UCS4; |
|
127 #elif SIZEOF_LONG >= 4 |
|
128 typedef unsigned long Py_UCS4; |
|
129 #endif |
|
130 |
|
131 typedef PY_UNICODE_TYPE Py_UNICODE; |
|
132 |
|
133 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ |
|
134 |
|
135 /* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds |
|
136 produce different external names and thus cause import errors in |
|
137 case a Python interpreter and an extension that were compiled with |
|
138 different Unicode width assumptions are combined. */ |
|
139 |
|
140 #ifndef Py_UNICODE_WIDE |
|
141 |
|
142 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString |
|
143 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString |
|
144 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject |
|
145 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString |
|
146 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String |
|
147 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString |
|
148 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String |
|
149 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String |
|
150 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode |
|
151 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString |
|
152 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar |
|
153 # define PyUnicode_Compare PyUnicodeUCS2_Compare |
|
154 # define PyUnicode_Concat PyUnicodeUCS2_Concat |
|
155 # define PyUnicode_Contains PyUnicodeUCS2_Contains |
|
156 # define PyUnicode_Count PyUnicodeUCS2_Count |
|
157 # define PyUnicode_Decode PyUnicodeUCS2_Decode |
|
158 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII |
|
159 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap |
|
160 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 |
|
161 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape |
|
162 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 |
|
163 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful |
|
164 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 |
|
165 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful |
|
166 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape |
|
167 # define PyUnicode_Encode PyUnicodeUCS2_Encode |
|
168 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII |
|
169 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap |
|
170 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal |
|
171 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 |
|
172 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape |
|
173 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 |
|
174 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 |
|
175 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape |
|
176 # define PyUnicode_Find PyUnicodeUCS2_Find |
|
177 # define PyUnicode_Format PyUnicodeUCS2_Format |
|
178 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject |
|
179 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject |
|
180 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal |
|
181 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode |
|
182 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar |
|
183 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding |
|
184 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax |
|
185 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize |
|
186 # define PyUnicode_Join PyUnicodeUCS2_Join |
|
187 # define PyUnicode_Partition PyUnicodeUCS2_Partition |
|
188 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition |
|
189 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit |
|
190 # define PyUnicode_Replace PyUnicodeUCS2_Replace |
|
191 # define PyUnicode_Resize PyUnicodeUCS2_Resize |
|
192 # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare |
|
193 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding |
|
194 # define PyUnicode_Split PyUnicodeUCS2_Split |
|
195 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines |
|
196 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch |
|
197 # define PyUnicode_Translate PyUnicodeUCS2_Translate |
|
198 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap |
|
199 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString |
|
200 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini |
|
201 # define _PyUnicode_Init _PyUnicodeUCS2_Init |
|
202 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha |
|
203 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit |
|
204 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit |
|
205 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak |
|
206 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase |
|
207 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric |
|
208 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase |
|
209 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase |
|
210 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace |
|
211 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit |
|
212 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit |
|
213 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase |
|
214 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric |
|
215 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase |
|
216 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase |
|
217 |
|
218 #else |
|
219 |
|
220 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString |
|
221 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString |
|
222 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject |
|
223 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString |
|
224 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String |
|
225 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString |
|
226 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String |
|
227 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String |
|
228 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode |
|
229 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString |
|
230 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar |
|
231 # define PyUnicode_Compare PyUnicodeUCS4_Compare |
|
232 # define PyUnicode_Concat PyUnicodeUCS4_Concat |
|
233 # define PyUnicode_Contains PyUnicodeUCS4_Contains |
|
234 # define PyUnicode_Count PyUnicodeUCS4_Count |
|
235 # define PyUnicode_Decode PyUnicodeUCS4_Decode |
|
236 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII |
|
237 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap |
|
238 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 |
|
239 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape |
|
240 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 |
|
241 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful |
|
242 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 |
|
243 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful |
|
244 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape |
|
245 # define PyUnicode_Encode PyUnicodeUCS4_Encode |
|
246 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII |
|
247 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap |
|
248 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal |
|
249 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 |
|
250 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape |
|
251 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 |
|
252 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 |
|
253 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape |
|
254 # define PyUnicode_Find PyUnicodeUCS4_Find |
|
255 # define PyUnicode_Format PyUnicodeUCS4_Format |
|
256 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject |
|
257 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject |
|
258 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal |
|
259 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode |
|
260 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar |
|
261 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding |
|
262 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax |
|
263 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize |
|
264 # define PyUnicode_Join PyUnicodeUCS4_Join |
|
265 # define PyUnicode_Partition PyUnicodeUCS4_Partition |
|
266 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition |
|
267 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit |
|
268 # define PyUnicode_Replace PyUnicodeUCS4_Replace |
|
269 # define PyUnicode_Resize PyUnicodeUCS4_Resize |
|
270 # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare |
|
271 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding |
|
272 # define PyUnicode_Split PyUnicodeUCS4_Split |
|
273 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines |
|
274 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch |
|
275 # define PyUnicode_Translate PyUnicodeUCS4_Translate |
|
276 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap |
|
277 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString |
|
278 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini |
|
279 # define _PyUnicode_Init _PyUnicodeUCS4_Init |
|
280 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha |
|
281 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit |
|
282 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit |
|
283 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak |
|
284 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase |
|
285 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric |
|
286 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase |
|
287 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase |
|
288 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace |
|
289 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit |
|
290 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit |
|
291 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase |
|
292 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric |
|
293 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase |
|
294 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase |
|
295 |
|
296 |
|
297 #endif |
|
298 |
|
299 /* --- Internal Unicode Operations ---------------------------------------- */ |
|
300 |
|
301 /* If you want Python to use the compiler's wctype.h functions instead |
|
302 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or |
|
303 configure Python using --with-wctype-functions. This reduces the |
|
304 interpreter's code size. */ |
|
305 |
|
306 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) |
|
307 |
|
308 #include <wctype.h> |
|
309 |
|
310 #define Py_UNICODE_ISSPACE(ch) iswspace(ch) |
|
311 |
|
312 #define Py_UNICODE_ISLOWER(ch) iswlower(ch) |
|
313 #define Py_UNICODE_ISUPPER(ch) iswupper(ch) |
|
314 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) |
|
315 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) |
|
316 |
|
317 #define Py_UNICODE_TOLOWER(ch) towlower(ch) |
|
318 #define Py_UNICODE_TOUPPER(ch) towupper(ch) |
|
319 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) |
|
320 |
|
321 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) |
|
322 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) |
|
323 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) |
|
324 |
|
325 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) |
|
326 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) |
|
327 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) |
|
328 |
|
329 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch) |
|
330 |
|
331 #else |
|
332 |
|
333 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) |
|
334 |
|
335 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) |
|
336 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) |
|
337 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) |
|
338 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) |
|
339 |
|
340 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) |
|
341 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) |
|
342 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) |
|
343 |
|
344 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) |
|
345 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) |
|
346 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) |
|
347 |
|
348 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) |
|
349 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) |
|
350 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) |
|
351 |
|
352 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) |
|
353 |
|
354 #endif |
|
355 /* Nonzero if ch satisfies any of ISALPHA, ISDECIMAL, ISDIGIT or ISNUMERIC; ch appears in each predicate, so avoid arguments with side effects. */ |
|
356 #define Py_UNICODE_ISALNUM(ch) \ |
|
357 (Py_UNICODE_ISALPHA(ch) || \ |
|
358 Py_UNICODE_ISDECIMAL(ch) || \ |
|
359 Py_UNICODE_ISDIGIT(ch) || \ |
|
360 Py_UNICODE_ISNUMERIC(ch)) |
|
361 /* Copy length Py_UNICODE elements from source to target via Py_MEMCPY; the byte count is length*sizeof(Py_UNICODE). */ |
|
362 #define Py_UNICODE_COPY(target, source, length) \ |
|
363 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) |
|
364 |
|
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   target: pointer to the first Py_UNICODE element to overwrite
   value:  the Py_UNICODE value written to every slot
   length: number of slots to write; zero or negative writes nothing

   Every argument is evaluated exactly once, in the order target, value,
   length, so arguments with side effects or expensive calls are safe.
   The do { } while (0) wrapper makes the expansion a single statement. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     Py_ssize_t n_ = (length); /* captured once, not per iteration */\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
|
369 |
|
370 /* check if substring matches at given offset. the offset must be |
|
371 valid, and the substring must not be empty */ |
|
372 #define Py_UNICODE_MATCH(string, offset, substring) \ |
|
373 ((*((string)->str + (offset)) == *((substring)->str)) && \ |
|
374 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ |
|
375 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) |
|
376 |
|
377 #ifdef __cplusplus |
|
378 extern "C" { |
|
379 #endif |
|
380 |
|
381 /* --- Unicode Type ------------------------------------------------------- */ |
|
382 |
|
383 typedef struct { |
|
384 PyObject_HEAD |
|
385 Py_ssize_t length; /* Length of raw Unicode data in buffer */ |
|
386 Py_UNICODE *str; /* Raw Unicode buffer */ |
|
387 long hash; /* Hash value; -1 if not set */ |
|
388 PyObject *defenc; /* (Default) Encoded version as Python |
|
389 string, or NULL; this is used for |
|
390 implementing the buffer protocol */ |
|
391 } PyUnicodeObject; |
|
392 |
|
393 PyAPI_DATA(PyTypeObject) PyUnicode_Type; |
|
394 /* PyUnicode_Check defers to PyObject_TypeCheck against PyUnicode_Type; PyUnicode_CheckExact compares op's ob_type pointer with &PyUnicode_Type directly. */ |
|
395 #define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type) |
|
396 #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) |
|
397 |
|
398 /* Fast access macros: each casts op to PyUnicodeObject * and reads a field directly, with no type check. GET_SIZE yields length; GET_DATA_SIZE yields length * sizeof(Py_UNICODE); AS_UNICODE yields str; AS_DATA yields str cast to const char *. */ |
|
399 #define PyUnicode_GET_SIZE(op) \ |
|
400 (((PyUnicodeObject *)(op))->length) |
|
401 #define PyUnicode_GET_DATA_SIZE(op) \ |
|
402 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) |
|
403 #define PyUnicode_AS_UNICODE(op) \ |
|
404 (((PyUnicodeObject *)(op))->str) |
|
405 #define PyUnicode_AS_DATA(op) \ |
|
406 ((const char *)((PyUnicodeObject *)(op))->str) |
|
407 |
|
408 /* --- Constants ---------------------------------------------------------- */ |
|
409 |
|
410 /* This Unicode character will be used as replacement character during |
|
411 decoding if the errors argument is set to "replace". Note: the |
|
412 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in |
|
413 Unicode 3.0. */ |
|
414 |
|
415 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) |
|
416 |
|
417 /* === Public API ========================================================= */ |
|
418 |
|
419 /* --- Plain Py_UNICODE --------------------------------------------------- */ |
|
420 |
|
421 /* Create a Unicode Object from the Py_UNICODE buffer u of the given |
|
422 size. |
|
423 |
|
424 u may be NULL which causes the contents to be undefined. It is the |
|
425 user's responsibility to fill in the needed data afterwards. Note |
|
426 that modifying the Unicode object contents after construction is |
|
427 only allowed if u was set to NULL. |
|
428 |
|
429 The buffer is copied into the new object. */ |
|
430 |
|
431 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( |
|
432 const Py_UNICODE *u, /* Unicode buffer */ |
|
433 Py_ssize_t size /* size of buffer */ |
|
434 ); |
|
435 |
|
436 /* Return a read-only pointer to the Unicode object's internal |
|
437 Py_UNICODE buffer. */ |
|
438 |
|
439 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( |
|
440 PyObject *unicode /* Unicode object */ |
|
441 ); |
|
442 |
|
443 /* Get the length of the Unicode object. */ |
|
444 |
|
445 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( |
|
446 PyObject *unicode /* Unicode object */ |
|
447 ); |
|
448 |
|
449 /* Get the maximum ordinal for a Unicode character. */ |
|
450 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); |
|
451 |
|
452 /* Resize an already allocated Unicode object to the new size length. |
|
453 |
|
454 *unicode is modified to point to the new (resized) object and 0 |
|
455 returned on success. |
|
456 |
|
457 This API may only be called by the function which also called the |
|
458 Unicode constructor. The refcount on the object must be 1. Otherwise, |
|
459 an error is returned. |
|
460 |
|
461 Error handling is implemented as follows: an exception is set, -1 |
|
462 is returned and *unicode left untouched. |
|
463 |
|
464 */ |
|
465 |
|
466 PyAPI_FUNC(int) PyUnicode_Resize( |
|
467 PyObject **unicode, /* Pointer to the Unicode object */ |
|
468 Py_ssize_t length /* New length */ |
|
469 ); |
|
470 |
|
471 /* Coerce obj to a Unicode object and return a reference with |
|
472 *incremented* refcount. |
|
473 |
|
474 Coercion is done in the following way: |
|
475 |
|
476 1. String and other char buffer compatible objects are decoded |
|
477 according to the given encoding, using the error handling named |
|
478 by errors. A NULL encoding selects the default encoding. |
|
479 |
|
480 2. All other objects (including Unicode objects) raise an |
|
481 exception. |
|
482 |
|
483 The API returns NULL in case of an error. The caller is responsible |
|
484 for decref'ing the returned objects. |
|
485 |
|
486 */ |
|
487 |
|
488 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( |
|
489 register PyObject *obj, /* Object */ |
|
490 const char *encoding, /* encoding */ |
|
491 const char *errors /* error handling */ |
|
492 ); |
|
493 |
|
494 /* Coerce obj to a Unicode object and return a reference with |
|
495 *incremented* refcount. |
|
496 |
|
497 Unicode objects are passed back as-is (subclasses are converted to |
|
498 true Unicode objects), all other objects are delegated to |
|
499 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in |
|
500 using the default encoding as basis for decoding the object. |
|
501 |
|
502 The API returns NULL in case of an error. The caller is responsible |
|
503 for decref'ing the returned objects. |
|
504 |
|
505 */ |
|
506 |
|
507 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( |
|
508 register PyObject *obj /* Object */ |
|
509 ); |
|
510 |
|
511 /* --- wchar_t support for platforms which support it --------------------- */ |
|
512 |
|
513 #ifdef HAVE_WCHAR_H |
|
514 |
|
515 /* Create a Unicode Object from the wchar_t buffer w of the given |
|
516 size. |
|
517 |
|
518 The buffer is copied into the new object. */ |
|
519 |
|
520 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( |
|
521 register const wchar_t *w, /* wchar_t buffer */ |
|
522 Py_ssize_t size /* size of buffer */ |
|
523 ); |
|
524 |
|
525 /* Copies the Unicode Object contents into the wchar_t buffer w. At |
|
526 most size wchar_t characters are copied. |
|
527 |
|
528 Note that the resulting wchar_t string may or may not be |
|
529 0-terminated. It is the responsibility of the caller to make sure |
|
530 that the wchar_t string is 0-terminated in case this is required by |
|
531 the application. |
|
532 |
|
533 Returns the number of wchar_t characters copied (excluding a |
|
534 possibly trailing 0-termination character) or -1 in case of an |
|
535 error. */ |
|
536 |
|
537 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( |
|
538 PyUnicodeObject *unicode, /* Unicode object */ |
|
539 register wchar_t *w, /* wchar_t buffer */ |
|
540 Py_ssize_t size /* size of buffer */ |
|
541 ); |
|
542 |
|
543 #endif |
|
544 |
|
545 /* --- Unicode ordinals --------------------------------------------------- */ |
|
546 |
|
547 /* Create a Unicode Object from the given Unicode code point ordinal. |
|
548 |
|
549 The ordinal must be in range(0x10000) on narrow Python builds |
|
550 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is |
|
551 raised in case it is not. |
|
552 |
|
553 */ |
|
554 |
|
555 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); |
|
556 |
|
557 /* === Builtin Codecs ===================================================== |
|
558 |
|
559 Many of these APIs take two arguments encoding and errors. These |
|
560 parameters encoding and errors have the same semantics as the ones |
|
561 of the builtin unicode() API. |
|
562 |
|
563 Setting encoding to NULL causes the default encoding to be used. |
|
564 |
|
565 Error handling is set by errors which may also be set to NULL |
|
566 meaning to use the default handling defined for the codec. Default |
|
567 error handling for all builtin codecs is "strict" (ValueErrors are |
|
568 raised). |
|
569 |
|
570 The codecs all use a similar interface. Only deviations from the |
|
571 generic ones are documented. |
|
572 |
|
573 */ |
|
574 |
|
575 /* --- Manage the default encoding ---------------------------------------- */ |
|
576 |
|
577 /* Return a Python string holding the default encoded value of the |
|
578 Unicode object. |
|
579 |
|
580 The resulting string is cached in the Unicode object for subsequent |
|
581 usage by this function. The cached version is needed to implement |
|
582 the character buffer interface and will live (at least) as long as |
|
583 the Unicode object itself. |
|
584 |
|
585 The refcount of the string is *not* incremented. |
|
586 |
|
587 *** Exported for internal use by the interpreter only !!! *** |
|
588 |
|
589 */ |
|
590 |
|
591 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( |
|
592 PyObject *, const char *); |
|
593 |
|
594 /* Returns the currently active default encoding. |
|
595 |
|
596 The default encoding is currently implemented as run-time settable |
|
597 process global. This may change in future versions of the |
|
598 interpreter to become a parameter which is managed on a per-thread |
|
599 basis. |
|
600 |
|
601 */ |
|
602 |
|
603 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); |
|
604 |
|
605 /* Sets the currently active default encoding. |
|
606 |
|
607 Returns 0 on success, -1 in case of an error. |
|
608 |
|
609 */ |
|
610 |
|
611 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( |
|
612 const char *encoding /* Encoding name in standard form */ |
|
613 ); |
|
614 |
|
615 /* --- Generic Codecs ----------------------------------------------------- */ |
|
616 |
|
617 /* Create a Unicode object by decoding the encoded string s of the |
|
618 given size. */ |
|
619 |
|
620 PyAPI_FUNC(PyObject*) PyUnicode_Decode( |
|
621 const char *s, /* encoded string */ |
|
622 Py_ssize_t size, /* size of buffer */ |
|
623 const char *encoding, /* encoding */ |
|
624 const char *errors /* error handling */ |
|
625 ); |
|
626 |
|
627 /* Encodes a Py_UNICODE buffer of the given size and returns a |
|
628 Python string object. */ |
|
629 |
|
630 PyAPI_FUNC(PyObject*) PyUnicode_Encode( |
|
631 const Py_UNICODE *s, /* Unicode char buffer */ |
|
632 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ |
|
633 const char *encoding, /* encoding */ |
|
634 const char *errors /* error handling */ |
|
635 ); |
|
636 |
|
637 /* Encodes a Unicode object and returns the result as Python |
|
638 object. */ |
|
639 |
|
640 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( |
|
641 PyObject *unicode, /* Unicode object */ |
|
642 const char *encoding, /* encoding */ |
|
643 const char *errors /* error handling */ |
|
644 ); |
|
645 |
|
646 /* Encodes a Unicode object and returns the result as Python string |
|
647 object. */ |
|
648 |
|
649 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( |
|
650 PyObject *unicode, /* Unicode object */ |
|
651 const char *encoding, /* encoding */ |
|
652 const char *errors /* error handling */ |
|
653 ); |
|
654 |
|
655 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( |
|
656 PyObject* string /* 256 character map */ |
|
657 ); |
|
658 |
|
659 |
|
660 /* --- UTF-7 Codecs ------------------------------------------------------- */ |
|
661 |
|
662 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( |
|
663 const char *string, /* UTF-7 encoded string */ |
|
664 Py_ssize_t length, /* size of string */ |
|
665 const char *errors /* error handling */ |
|
666 ); |
|
667 |
|
668 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( |
|
669 const Py_UNICODE *data, /* Unicode char buffer */ |
|
670 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ |
|
671 int encodeSetO, /* force the encoder to encode characters in |
|
672 Set O, as described in RFC2152 */ |
|
673 int encodeWhiteSpace, /* force the encoder to encode space, tab, |
|
674 carriage return and linefeed characters */ |
|
675 const char *errors /* error handling */ |
|
676 ); |
|
677 |
|
678 /* --- UTF-8 Codecs ------------------------------------------------------- */ |
|
679 |
|
680 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( |
|
681 const char *string, /* UTF-8 encoded string */ |
|
682 Py_ssize_t length, /* size of string */ |
|
683 const char *errors /* error handling */ |
|
684 ); |
|
685 |
|
686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( |
|
687 const char *string, /* UTF-8 encoded string */ |
|
688 Py_ssize_t length, /* size of string */ |
|
689 const char *errors, /* error handling */ |
|
690 Py_ssize_t *consumed /* bytes consumed */ |
|
691 ); |
|
692 |
|
693 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( |
|
694 PyObject *unicode /* Unicode object */ |
|
695 ); |
|
696 |
|
697 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( |
|
698 const Py_UNICODE *data, /* Unicode char buffer */ |
|
699 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ |
|
700 const char *errors /* error handling */ |
|
701 ); |
|
702 |
|
703 /* --- UTF-16 Codecs ------------------------------------------------------ */ |
|
704 |
|
705 /* Decodes length bytes from a UTF-16 encoded buffer string and returns |
|
706 the corresponding Unicode object. |
|
707 |
|
708 errors (if non-NULL) defines the error handling. It defaults |
|
709 to "strict". |
|
710 |
|
711 If byteorder is non-NULL, the decoder starts decoding using the |
|
712 given byte order: |
|
713 |
|
714 *byteorder == -1: little endian |
|
715 *byteorder == 0: native order |
|
716 *byteorder == 1: big endian |
|
717 |
|
718 In native mode, the first two bytes of the stream are checked for a |
|
719 BOM mark. If found, the BOM mark is analysed, the byte order |
|
720 adjusted and the BOM skipped. In the other modes, no BOM mark |
|
721 interpretation is done. After completion, *byteorder is set to the |
|
722 current byte order at the end of input data. |
|
723 |
|
724 If byteorder is NULL, the codec starts in native order mode. |
|
725 |
|
726 */ |
|
727 |
|
728 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( |
|
729 const char *string, /* UTF-16 encoded string */ |
|
730 Py_ssize_t length, /* size of string */ |
|
731 const char *errors, /* error handling */ |
|
732 int *byteorder /* pointer to byteorder to use |
|
733 0=native;-1=LE,1=BE; updated on |
|
734 exit */ |
|
735 ); |
|
736 |
|
737 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( |
|
738 const char *string, /* UTF-16 encoded string */ |
|
739 Py_ssize_t length, /* size of string */ |
|
740 const char *errors, /* error handling */ |
|
741 int *byteorder, /* pointer to byteorder to use |
|
742 0=native;-1=LE,1=BE; updated on |
|
743 exit */ |
|
744 Py_ssize_t *consumed /* bytes consumed */ |
|
745 ); |
|
746 |
|
747 /* Returns a Python string using the UTF-16 encoding in native byte |
|
748 order. The string always starts with a BOM mark. */ |
|
749 |
|
750 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( |
|
751 PyObject *unicode /* Unicode object */ |
|
752 ); |
|
753 |
|
754 /* Returns a Python string object holding the UTF-16 encoded value of |
|
755 the Unicode data. |
|
756 |
|
757 The output is written according to the value of byteorder, which |

758 selects the byte order: |
|
759 |
|
760 byteorder == -1: little endian |
|
761 byteorder == 0: native byte order (writes a BOM mark) |
|
762 byteorder == 1: big endian |
|
763 |
|
764 If byteorder is 0, the output string will always start with the |
|
765 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is |
|
766 prepended. |
|
767 |
|
768 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to |
|
769 UCS-2. This trick makes it possible to add full UTF-16 capabilities |
|
770 at a later point without compromising the APIs. |
|
771 |
|
772 */ |
|
773 |
|
774 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( |
|
775 const Py_UNICODE *data, /* Unicode char buffer */ |
|
776 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ |
|
777 const char *errors, /* error handling */ |
|
778 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ |
|
779 ); |
|
780 |
|
781 /* --- Unicode-Escape Codecs ---------------------------------------------- */ |
|
782 |
|
783 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( |
|
784 const char *string, /* Unicode-Escape encoded string */ |
|
785 Py_ssize_t length, /* size of string */ |
|
786 const char *errors /* error handling */ |
|
787 ); |
|
788 |
|
789 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( |
|
790 PyObject *unicode /* Unicode object */ |
|
791 ); |
|
792 |
|
793 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( |
|
794 const Py_UNICODE *data, /* Unicode char buffer */ |
|
795 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ |
|
796 ); |
|
797 |
|
798 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ |
|
799 |
|
800 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( |
|
801 const char *string, /* Raw-Unicode-Escape encoded string */ |
|
802 Py_ssize_t length, /* size of string */ |
|
803 const char *errors /* error handling */ |
|
804 ); |
|
805 |
|
806 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( |
|
807 PyObject *unicode /* Unicode object */ |
|
808 ); |
|
809 |
|
810 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( |
|
811 const Py_UNICODE *data, /* Unicode char buffer */ |
|
812 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ |
|
813 ); |
|
814 |
|
815 /* --- Unicode Internal Codec --------------------------------------------- |
|
816 |
|
817 Only for internal use in _codecsmodule.c */ |
|
818 |
|
819 PyObject *_PyUnicode_DecodeUnicodeInternal( |

820 const char *string, /* encoded input buffer (presumably unicode-internal data; confirm) */ |

821 Py_ssize_t length, /* size of string (by analogy with the other decoders; confirm) */ |

822 const char *errors /* error handling */ |

823 ); /* NOTE(review): declared without PyAPI_FUNC, unlike the other codec APIs in this header; presumably deliberate for an internal helper -- confirm */ |
|
824 |
|
825 /* --- Latin-1 Codecs ----------------------------------------------------- |
|
826 |
|
827 Note: Latin-1 corresponds to the first 256 Unicode ordinals. |
|
828 |
|
829 */ |
|
830 |
|
831 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( |
|
832 const char *string, /* Latin-1 encoded string */ |
|
833 Py_ssize_t length, /* size of string */ |
|
834 const char *errors /* error handling */ |
|
835 ); |
|
836 |
|
837 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( |
|
838 PyObject *unicode /* Unicode object */ |
|
839 ); |
|
840 |
|
841 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( |
|
842 const Py_UNICODE *data, /* Unicode char buffer */ |
|
843 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ |
|
844 const char *errors /* error handling */ |
|
845 ); |
|
846 |
|
847 /* --- ASCII Codecs ------------------------------------------------------- |
|
848 |
|
849 Only 7-bit ASCII data is accepted. All other codes generate errors. |
|
850 |
|
851 */ |
|
852 |
|
853 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( |
|
854 const char *string, /* ASCII encoded string */ |
|
855 Py_ssize_t length, /* size of string */ |
|
856 const char *errors /* error handling */ |
|
857 ); |
|
858 |
|
859 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( |
|
860 PyObject *unicode /* Unicode object */ |
|
861 ); |
|
862 |
|
863 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( |
|
864 const Py_UNICODE *data, /* Unicode char buffer */ |
|
865 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ |
|
866 const char *errors /* error handling */ |
|
867 ); |
|
868 |
|
869 /* --- Character Map Codecs ----------------------------------------------- |
|
870 |
|
871 This codec uses mappings to encode and decode characters. |
|
872 |
|
873 Decoding mappings must map single string characters to single |
|
874 Unicode characters, integers (which are then interpreted as Unicode |
|
875 ordinals) or None (meaning "undefined mapping" and causing an |
|
876 error). |
|
877 |
|
878 Encoding mappings must map single Unicode characters to single |
|
879 string characters, integers (which are then interpreted as Latin-1 |
|
880 ordinals) or None (meaning "undefined mapping" and causing an |
|
881 error). |
|
882 |
|
883 If a character lookup fails with a LookupError, the character is |
|
884 copied as-is meaning that its ordinal value will be interpreted as |
|
885 Unicode or Latin-1 ordinal resp. Because of this, mappings only need |
|
886 to contain those mappings which map characters to different code |
|
887 points. |
|
888 |
|
889 */ |
|
890 |
|
891 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( |
|
892 const char *string, /* Encoded string */ |
|
893 Py_ssize_t length, /* size of string */ |
|
894 PyObject *mapping, /* character mapping |
|
895 (char ordinal -> unicode ordinal) */ |
|
896 const char *errors /* error handling */ |
|
897 ); |
|
898 |
|
899 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( |
|
900 PyObject *unicode, /* Unicode object */ |
|
901 PyObject *mapping /* character mapping |
|
902 (unicode ordinal -> char ordinal) */ |
|
903 ); |
|
904 |
|
905 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( |
|
906 const Py_UNICODE *data, /* Unicode char buffer */ |
|
907 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ |
|
908 PyObject *mapping, /* character mapping |
|
909 (unicode ordinal -> char ordinal) */ |
|
910 const char *errors /* error handling */ |
|
911 ); |
|
912 |
|
913 /* Translate a Py_UNICODE buffer of the given length by applying a |
|
914 character mapping table to it and return the resulting Unicode |
|
915 object. |
|
916 |
|
917 The mapping table must map Unicode ordinal integers to Unicode |
|
918 ordinal integers or None (causing deletion of the character). |
|
919 |
|
920 Mapping tables may be dictionaries or sequences. Unmapped character |
|
921 ordinals (ones which cause a LookupError) are left untouched and |
|
922 are copied as-is. |
|
923 |
|
924 */ |
|
925 |
|
926 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( |

927 const Py_UNICODE *data, /* Unicode char buffer */ |

928 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ |

929 PyObject *table, /* Translate table */ |

930 const char *errors /* error handling */ |

931 ); |
|
932 |
|
933 #ifdef MS_WIN32 |
|
934 |
|
935 /* --- MBCS codecs for Windows -------------------------------------------- */ |
|
936 |
|
937 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( |
|
938 const char *string, /* MBCS encoded string */ |
|
939 Py_ssize_t length, /* size of string */ |
|
940 const char *errors /* error handling */ |
|
941 ); |
|
942 |
|
943 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( |
|
944 const char *string, /* MBCS encoded string */ |
|
945 Py_ssize_t length, /* size of string */ |
|
946 const char *errors, /* error handling */ |
|
947 Py_ssize_t *consumed /* bytes consumed */ |
|
948 ); |
|
949 |
|
950 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( |
|
951 PyObject *unicode /* Unicode object */ |
|
952 ); |
|
953 |
|
954 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( |
|
955 const Py_UNICODE *data, /* Unicode char buffer */ |
|
956 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ |
|
957 const char *errors /* error handling */ |
|
958 ); |
|
959 |
|
960 #endif /* MS_WIN32 */ |
|
961 |
|
962 /* --- Decimal Encoder ---------------------------------------------------- */ |
|
963 |
|
964 /* Takes a Unicode string holding a decimal value and writes it into |
|
965 an output buffer using standard ASCII digit codes. |
|
966 |
|
967 The output buffer has to provide at least length+1 bytes of storage |
|
968 area. The output string is 0-terminated. |
|
969 |
|
970 The encoder converts whitespace to ' ', decimal characters to their |
|
971 corresponding ASCII digit and all other Latin-1 characters except |
|
972 \0 as-is. Characters outside this range (Unicode ordinals 1-255) |

973 are treated as errors. This includes embedded NUL characters. |
|
974 |
|
975 Error handling is defined by the errors argument: |
|
976 |
|
977 NULL or "strict": raise a ValueError |
|
978 "ignore": ignore the wrong characters (these are not copied to the |
|
979 output buffer) |
|
980 "replace": replaces illegal characters with '?' |
|
981 |
|
982 Returns 0 on success, -1 on failure. |
|
983 |
|
984 */ |
|
985 |
|
986 PyAPI_FUNC(int) PyUnicode_EncodeDecimal( |

987 Py_UNICODE *s, /* Unicode buffer */ |

988 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ |

989 char *output, /* Output buffer; must have size >= length + 1 (result is 0-terminated) */ |

990 const char *errors /* error handling */ |

991 ); |
|
992 |
|
993 /* --- Methods & Slots ---------------------------------------------------- |
|
994 |
|
995 These are capable of handling Unicode objects and strings on input |
|
996 (we refer to them as strings in the descriptions) and return |
|
997 Unicode objects or integers as appropriate. */ |
|
998 |
|
999 /* Concat two strings giving a new Unicode string. */ |
|
1000 |
|
1001 PyAPI_FUNC(PyObject*) PyUnicode_Concat( |
|
1002 PyObject *left, /* Left string */ |
|
1003 PyObject *right /* Right string */ |
|
1004 ); |
|
1005 |
|
1006 /* Split a string giving a list of Unicode strings. |
|
1007 |
|
1008 If sep is NULL, splitting will be done at all whitespace |
|
1009 substrings. Otherwise, splits occur at the given separator. |
|
1010 |
|
1011 At most maxsplit splits will be done. If negative, no limit is set. |
|
1012 |
|
1013 Separators are not included in the resulting list. |
|
1014 |
|
1015 */ |
|
1016 |
|
1017 PyAPI_FUNC(PyObject*) PyUnicode_Split( |
|
1018 PyObject *s, /* String to split */ |
|
1019 PyObject *sep, /* String separator */ |
|
1020 Py_ssize_t maxsplit /* Maxsplit count */ |
|
1021 ); |
|
1022 |
|
1023 /* Ditto, but split at line breaks. |
|
1024 |
|
1025 CRLF is considered to be one line break. Line breaks are not |
|
1026 included in the resulting list unless keepends is true. */ |
|
1027 |
|
1028 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( |
|
1029 PyObject *s, /* String to split */ |
|
1030 int keepends /* If true, line end markers are included */ |
|
1031 ); |
|
1032 |
|
1033 /* Partition a string using a given separator. */ |
|
1034 |
|
1035 PyAPI_FUNC(PyObject*) PyUnicode_Partition( |
|
1036 PyObject *s, /* String to partition */ |
|
1037 PyObject *sep /* String separator */ |
|
1038 ); |
|
1039 |
|
1040 /* Partition a string using a given separator, searching from the end of the |
|
1041 string. */ |
|
1042 |
|
1043 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( |
|
1044 PyObject *s, /* String to partition */ |
|
1045 PyObject *sep /* String separator */ |
|
1046 ); |
|
1047 |
|
1048 /* Split a string giving a list of Unicode strings. |
|
1049 |
|
1050 If sep is NULL, splitting will be done at all whitespace |
|
1051 substrings. Otherwise, splits occur at the given separator. |
|
1052 |
|
1053 At most maxsplit splits will be done; if maxsplit is negative, no |

1054 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from |

1055 the end of the string. |
|
1056 |
|
1057 Separators are not included in the resulting list. |
|
1058 |
|
1059 */ |
|
1060 |
|
1061 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( |
|
1062 PyObject *s, /* String to split */ |
|
1063 PyObject *sep, /* String separator */ |
|
1064 Py_ssize_t maxsplit /* Maxsplit count */ |
|
1065 ); |
|
1066 |
|
1067 /* Translate a string by applying a character mapping table to it and |
|
1068 return the resulting Unicode object. |
|
1069 |
|
1070 The mapping table must map Unicode ordinal integers to Unicode |
|
1071 ordinal integers or None (causing deletion of the character). |
|
1072 |
|
1073 Mapping tables may be dictionaries or sequences. Unmapped character |
|
1074 ordinals (ones which cause a LookupError) are left untouched and |
|
1075 are copied as-is. |
|
1076 |
|
1077 */ |
|
1078 |
|
1079 PyAPI_FUNC(PyObject *) PyUnicode_Translate( |
|
1080 PyObject *str, /* String */ |
|
1081 PyObject *table, /* Translate table */ |
|
1082 const char *errors /* error handling */ |
|
1083 ); |
|
1084 |
|
1085 /* Join a sequence of strings using the given separator and return |
|
1086 the resulting Unicode string. */ |
|
1087 |
|
1088 PyAPI_FUNC(PyObject*) PyUnicode_Join( |
|
1089 PyObject *separator, /* Separator string */ |
|
1090 PyObject *seq /* Sequence object */ |
|
1091 ); |
|
1092 |
|
1093 /* Return 1 if substr matches str[start:end] at the given tail end, 0 |

1094 otherwise. Return -1 if an error occurred. */ |
|
1095 |
|
1096 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( |
|
1097 PyObject *str, /* String */ |
|
1098 PyObject *substr, /* Prefix or Suffix string */ |
|
1099 Py_ssize_t start, /* Start index */ |
|
1100 Py_ssize_t end, /* Stop index */ |
|
1101 int direction /* Tail end: -1 prefix, +1 suffix */ |
|
1102 ); |
|
1103 |
|
1104 /* Return the first position of substr in str[start:end] using the |
|
1105 given search direction or -1 if not found. -2 is returned in case |
|
1106 an error occurred and an exception is set. */ |
|
1107 |
|
1108 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( |
|
1109 PyObject *str, /* String */ |
|
1110 PyObject *substr, /* Substring to find */ |
|
1111 Py_ssize_t start, /* Start index */ |
|
1112 Py_ssize_t end, /* Stop index */ |
|
1113 int direction /* Find direction: +1 forward, -1 backward */ |
|
1114 ); |
|
1115 |
|
1116 /* Count the number of occurrences of substr in str[start:end]. */ |
|
1117 |
|
1118 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( |
|
1119 PyObject *str, /* String */ |
|
1120 PyObject *substr, /* Substring to count */ |
|
1121 Py_ssize_t start, /* Start index */ |
|
1122 Py_ssize_t end /* Stop index */ |
|
1123 ); |
|
1124 |
|
1125 /* Replace at most maxcount occurrences of substr in str with replstr |
|
1126 and return the resulting Unicode object. */ |
|
1127 |
|
1128 PyAPI_FUNC(PyObject *) PyUnicode_Replace( |
|
1129 PyObject *str, /* String */ |
|
1130 PyObject *substr, /* Substring to find */ |
|
1131 PyObject *replstr, /* Substring to replace */ |
|
1132 Py_ssize_t maxcount /* Max. number of replacements to apply; |
|
1133 -1 = all */ |
|
1134 ); |
|
1135 |
|
1136 /* Compare two strings and return -1, 0, 1 for less than, equal, |
|
1137 greater than resp. */ |
|
1138 |
|
1139 PyAPI_FUNC(int) PyUnicode_Compare( |
|
1140 PyObject *left, /* Left string */ |
|
1141 PyObject *right /* Right string */ |
|
1142 ); |
|
1143 |
|
1144 /* Rich compare two strings and return one of the following: |
|
1145 |
|
1146 - NULL in case an exception was raised |
|
1147 - Py_True or Py_False for successful comparisons |
|
1148 - Py_NotImplemented in case the type combination is unknown |
|
1149 |
|
1150 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in |
|
1151 case the conversion of the arguments to Unicode fails with a |
|
1152 UnicodeDecodeError. |
|
1153 |
|
1154 Possible values for op: |
|
1155 |
|
1156 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE |
|
1157 |
|
1158 */ |
|
1159 |
|
1160 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( |
|
1161 PyObject *left, /* Left string */ |
|
1162 PyObject *right, /* Right string */ |
|
1163 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ |
|
1164 ); |
|
1165 |
|
1166 /* Apply an argument tuple or dictionary to a format string and return |
|
1167 the resulting Unicode string. */ |
|
1168 |
|
1169 PyAPI_FUNC(PyObject *) PyUnicode_Format( |
|
1170 PyObject *format, /* Format string */ |
|
1171 PyObject *args /* Argument tuple or dictionary */ |
|
1172 ); |
|
1173 |
|
1174 /* Checks whether element is contained in container and return 1/0 |
|
1175 accordingly. |
|
1176 |
|
1177 element has to coerce to a one-element Unicode string. -1 is |
|
1178 returned in case of an error. */ |
|
1179 |
|
1180 PyAPI_FUNC(int) PyUnicode_Contains( |
|
1181 PyObject *container, /* Container string */ |
|
1182 PyObject *element /* Element string */ |
|
1183 ); |
|
1184 |
|
1185 /* Externally visible for str.strip(unicode) */ |
|
1186 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( |
|
1187 PyUnicodeObject *self, |
|
1188 int striptype, |
|
1189 PyObject *sepobj |
|
1190 ); |
|
1191 |
|
1192 /* === Characters Type APIs =============================================== */ |
|
1193 |
|
1194 /* These should not be used directly. Use the Py_UNICODE_IS* and |
|
1195 Py_UNICODE_TO* macros instead. |
|
1196 |
|
1197 These APIs are implemented in Objects/unicodectype.c. |
|
1198 |
|
1199 */ |
|
1200 |
|
1201 PyAPI_FUNC(int) _PyUnicode_IsLowercase( |
|
1202 Py_UNICODE ch /* Unicode character */ |
|
1203 ); |
|
1204 |
|
1205 PyAPI_FUNC(int) _PyUnicode_IsUppercase( |
|
1206 Py_UNICODE ch /* Unicode character */ |
|
1207 ); |
|
1208 |
|
1209 PyAPI_FUNC(int) _PyUnicode_IsTitlecase( |
|
1210 Py_UNICODE ch /* Unicode character */ |
|
1211 ); |
|
1212 |
|
1213 PyAPI_FUNC(int) _PyUnicode_IsWhitespace( |
|
1214 const Py_UNICODE ch /* Unicode character */ |
|
1215 ); |
|
1216 |
|
1217 PyAPI_FUNC(int) _PyUnicode_IsLinebreak( |
|
1218 const Py_UNICODE ch /* Unicode character */ |
|
1219 ); |
|
1220 |
|
1221 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( |
|
1222 Py_UNICODE ch /* Unicode character */ |
|
1223 ); |
|
1224 |
|
1225 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( |
|
1226 Py_UNICODE ch /* Unicode character */ |
|
1227 ); |
|
1228 |
|
1229 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( |
|
1230 Py_UNICODE ch /* Unicode character */ |
|
1231 ); |
|
1232 |
|
1233 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( |
|
1234 Py_UNICODE ch /* Unicode character */ |
|
1235 ); |
|
1236 |
|
1237 PyAPI_FUNC(int) _PyUnicode_ToDigit( |
|
1238 Py_UNICODE ch /* Unicode character */ |
|
1239 ); |
|
1240 |
|
1241 PyAPI_FUNC(double) _PyUnicode_ToNumeric( |
|
1242 Py_UNICODE ch /* Unicode character */ |
|
1243 ); |
|
1244 |
|
1245 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( |
|
1246 Py_UNICODE ch /* Unicode character */ |
|
1247 ); |
|
1248 |
|
1249 PyAPI_FUNC(int) _PyUnicode_IsDigit( |
|
1250 Py_UNICODE ch /* Unicode character */ |
|
1251 ); |
|
1252 |
|
1253 PyAPI_FUNC(int) _PyUnicode_IsNumeric( |
|
1254 Py_UNICODE ch /* Unicode character */ |
|
1255 ); |
|
1256 |
|
1257 PyAPI_FUNC(int) _PyUnicode_IsAlpha( |
|
1258 Py_UNICODE ch /* Unicode character */ |
|
1259 ); |
|
1260 |
|
1261 #ifdef __cplusplus |
|
1262 } |
|
1263 #endif |
|
1264 #endif /* Py_USING_UNICODE */ |
|
1265 #endif /* !Py_UNICODEOBJECT_H */ |