Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(56)

Delta Between Two Patch Sets: Objects/unicodeobject.c

Issue 11828: startswith and endswith don't accept None as slice index
Left Patch Set: Created 2 years, 1 month ago
Right Patch Set: Created 2 years, 1 month ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Objects/stringlib/find.h ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT RIGHT
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt). 5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 6
7 Major speed upgrades to the method implementations at the Reykjavik 7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 9
10 Copyright (c) Corporation for National Research Initiatives. 10 Copyright (c) Corporation for National Research Initiatives.
(...skipping 23 matching lines...) Expand all
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 -------------------------------------------------------------------- 38 --------------------------------------------------------------------
39 39
40 */ 40 */
41 41
42 #define PY_SSIZE_T_CLEAN 42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h" 43 #include "Python.h"
44 #include "bytes_methods.h"
45
46 #include "unicodeobject.h"
44 #include "ucnhash.h" 47 #include "ucnhash.h"
45 48
46 #ifdef MS_WINDOWS 49 #ifdef MS_WINDOWS
47 #include <windows.h> 50 #include <windows.h>
48 #endif 51 #endif
49 52
50 /* Limit for the Unicode object free list */ 53 /* Limit for the Unicode object free list */
51 54
52 #define PyUnicode_MAXFREELIST 1024 55 #define PyUnicode_MAXFREELIST 1024
53 56
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 static PyUnicodeObject *free_list; 107 static PyUnicodeObject *free_list;
105 static int numfree; 108 static int numfree;
106 109
107 /* The empty Unicode object is shared to improve performance. */ 110 /* The empty Unicode object is shared to improve performance. */
108 static PyUnicodeObject *unicode_empty; 111 static PyUnicodeObject *unicode_empty;
109 112
110 /* Single character Unicode strings in the Latin-1 range are being 113 /* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */ 114 shared as well. */
112 static PyUnicodeObject *unicode_latin1[256]; 115 static PyUnicodeObject *unicode_latin1[256];
113 116
117 /* Default encoding to use and assume when NULL is passed as encoding
118 parameter; it is fixed to "utf-8". Always use the
119 PyUnicode_GetDefaultEncoding() API to access this global.
120
121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122 hard coded default!
123 */
124 static const char unicode_default_encoding[] = "utf-8";
125
114 /* Fast detection of the most frequent whitespace characters */ 126 /* Fast detection of the most frequent whitespace characters */
115 const unsigned char _Py_ascii_whitespace[] = { 127 const unsigned char _Py_ascii_whitespace[] = {
116 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0,
117 /* case 0x0009: * CHARACTER TABULATION */ 129 /* case 0x0009: * HORIZONTAL TABULATION */
118 /* case 0x000A: * LINE FEED */ 130 /* case 0x000A: * LINE FEED */
119 /* case 0x000B: * LINE TABULATION */ 131 /* case 0x000B: * VERTICAL TABULATION */
120 /* case 0x000C: * FORM FEED */ 132 /* case 0x000C: * FORM FEED */
121 /* case 0x000D: * CARRIAGE RETURN */ 133 /* case 0x000D: * CARRIAGE RETURN */
122 0, 1, 1, 1, 1, 1, 0, 0, 134 0, 1, 1, 1, 1, 1, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0,
124 /* case 0x001C: * FILE SEPARATOR */ 136 /* case 0x001C: * FILE SEPARATOR */
125 /* case 0x001D: * GROUP SEPARATOR */ 137 /* case 0x001D: * GROUP SEPARATOR */
126 /* case 0x001E: * RECORD SEPARATOR */ 138 /* case 0x001E: * RECORD SEPARATOR */
127 /* case 0x001F: * UNIT SEPARATOR */ 139 /* case 0x001F: * UNIT SEPARATOR */
128 0, 0, 0, 0, 1, 1, 1, 1, 140 0, 0, 0, 0, 1, 1, 1, 1,
129 /* case 0x0020: * SPACE */ 141 /* case 0x0020: * SPACE */
130 1, 0, 0, 0, 0, 0, 0, 0, 142 1, 0, 0, 0, 0, 0, 0, 0,
131 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0,
134 146
135 0, 0, 0, 0, 0, 0, 0, 0, 147 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0 154 0, 0, 0, 0, 0, 0, 0, 0
143 }; 155 };
144 156
145 static PyObject * 157 static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 unicode_encode_call_errorhandler(const char *errors,
147 PyObject **errorHandler,const char *encoding, const char *reason, 158 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150 161
151 static void 162 static void raise_encode_exception(PyObject **exceptionObject,
152 raise_encode_exception(PyObject **exceptionObject, 163 const char *encoding,
153 » » const char *encoding, 164 const Py_UNICODE *unicode, Py_ssize_t size,
154 » » const Py_UNICODE *unicode, Py_ssize_t size, 165 Py_ssize_t startpos, Py_ssize_t endpos,
155 » » Py_ssize_t startpos, Py_ssize_t endpos, 166 const char *reason);
156 » » const char *reason);
157 167
158 /* Same for linebreaks */ 168 /* Same for linebreaks */
159 static unsigned char ascii_linebreak[] = { 169 static unsigned char ascii_linebreak[] = {
160 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0,
161 /* 0x000A, * LINE FEED */ 171 /* 0x000A, * LINE FEED */
162 /* 0x000B, * LINE TABULATION */
163 /* 0x000C, * FORM FEED */
164 /* 0x000D, * CARRIAGE RETURN */ 172 /* 0x000D, * CARRIAGE RETURN */
165 0, 0, 1, 1, 1, 1, 0, 0, 173 0, 0, 1, 0, 0, 1, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0,
167 /* 0x001C, * FILE SEPARATOR */ 175 /* 0x001C, * FILE SEPARATOR */
168 /* 0x001D, * GROUP SEPARATOR */ 176 /* 0x001D, * GROUP SEPARATOR */
169 /* 0x001E, * RECORD SEPARATOR */ 177 /* 0x001E, * RECORD SEPARATOR */
170 0, 0, 0, 0, 1, 1, 1, 0, 178 0, 0, 0, 0, 1, 1, 1, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0,
175 183
(...skipping 21 matching lines...) Expand all
197 } 205 }
198 206
199 /* --- Bloom Filters ----------------------------------------------------- */ 207 /* --- Bloom Filters ----------------------------------------------------- */
200 208
201 /* stuff to implement simple "bloom filters" for Unicode characters. 209 /* stuff to implement simple "bloom filters" for Unicode characters.
202 to keep things simple, we use a single bitmask, using the least 5 210 to keep things simple, we use a single bitmask, using the least 5
203 bits from each unicode characters as the bit index. */ 211 bits from each unicode characters as the bit index. */
204 212
205 /* the linebreak mask is set up by Unicode_Init below */ 213 /* the linebreak mask is set up by Unicode_Init below */
206 214
207 #if LONG_BIT >= 128
208 #define BLOOM_WIDTH 128
209 #elif LONG_BIT >= 64
210 #define BLOOM_WIDTH 64
211 #elif LONG_BIT >= 32
212 #define BLOOM_WIDTH 32
213 #else
214 #error "LONG_BIT is smaller than 32"
215 #endif
216
217 #define BLOOM_MASK unsigned long 215 #define BLOOM_MASK unsigned long
218 216
219 static BLOOM_MASK bloom_linebreak; 217 static BLOOM_MASK bloom_linebreak;
220 218
221 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 219 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
222 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223 220
224 #define BLOOM_LINEBREAK(ch) \ 221 #define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 222 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 223 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
227 224
228 Py_LOCAL_INLINE(BLOOM_MASK) 225 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
229 make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 { 226 {
231 /* calculate simple bloom-style bitmask for a given unicode string */ 227 /* calculate simple bloom-style bitmask for a given unicode string */
232 228
233 BLOOM_MASK mask; 229 long mask;
234 Py_ssize_t i; 230 Py_ssize_t i;
235 231
236 mask = 0; 232 mask = 0;
237 for (i = 0; i < len; i++) 233 for (i = 0; i < len; i++)
238 BLOOM_ADD(mask, ptr[i]); 234 mask |= (1 << (ptr[i] & 0x1F));
239 235
240 return mask; 236 return mask;
241 } 237 }
242 238
243 Py_LOCAL_INLINE(int) 239 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
245 { 240 {
246 Py_ssize_t i; 241 Py_ssize_t i;
247 242
248 for (i = 0; i < setlen; i++) 243 for (i = 0; i < setlen; i++)
249 if (set[i] == chr) 244 if (set[i] == chr)
250 return 1; 245 return 1;
251 246
252 return 0; 247 return 0;
253 } 248 }
254 249
255 #define BLOOM_MEMBER(mask, chr, set, setlen) \ 250 #define BLOOM_MEMBER(mask, chr, set, setlen) \
256 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 251 BLOOM(mask, chr) && unicode_member(chr, set, setlen)
257 252
258 /* --- Unicode Object ----------------------------------------------------- */ 253 /* --- Unicode Object ----------------------------------------------------- */
259 254
260 static int 255 static
261 unicode_resize(register PyUnicodeObject *unicode, 256 int unicode_resize(register PyUnicodeObject *unicode,
262 » Py_ssize_t length) 257 Py_ssize_t length)
263 { 258 {
264 void *oldstr; 259 void *oldstr;
265 260
266 /* Shortcut if there's nothing much to do. */ 261 /* Shortcut if there's nothing much to do. */
267 if (unicode->length == length) 262 if (unicode->length == length)
268 goto reset; 263 goto reset;
269 264
270 /* Resizing shared object (unicode_empty or single character 265 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize() 266 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */ 267 instead ! */
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
308 303
309 /* We allocate one more byte to make sure the string is 304 /* We allocate one more byte to make sure the string is
310 Ux0000 terminated; some code (e.g. new_identifier) 305 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that. 306 relies on that.
312 307
313 XXX This allocator could further be enhanced by assuring that the 308 XXX This allocator could further be enhanced by assuring that the
314 free list never reduces its size below 1. 309 free list never reduces its size below 1.
315 310
316 */ 311 */
317 312
318 static PyUnicodeObject * 313 static
319 _PyUnicode_New(Py_ssize_t length) 314 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
320 { 315 {
321 register PyUnicodeObject *unicode; 316 register PyUnicodeObject *unicode;
322 317
323 /* Optimization for empty strings */ 318 /* Optimization for empty strings */
324 if (length == 0 && unicode_empty != NULL) { 319 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty); 320 Py_INCREF(unicode_empty);
326 return unicode_empty; 321 return unicode_empty;
327 } 322 }
328 323
329 /* Ensure we won't overflow the size. */ 324 /* Ensure we won't overflow the size. */
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
380 return unicode; 375 return unicode;
381 376
382 onError: 377 onError:
383 /* XXX UNREF/NEWREF interface should be more symmetrical */ 378 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL; 379 _Py_DEC_REFTOTAL;
385 _Py_ForgetReference((PyObject *)unicode); 380 _Py_ForgetReference((PyObject *)unicode);
386 PyObject_Del(unicode); 381 PyObject_Del(unicode);
387 return NULL; 382 return NULL;
388 } 383 }
389 384
390 static void 385 static
391 unicode_dealloc(register PyUnicodeObject *unicode) 386 void unicode_dealloc(register PyUnicodeObject *unicode)
392 { 387 {
393 switch (PyUnicode_CHECK_INTERNED(unicode)) { 388 switch (PyUnicode_CHECK_INTERNED(unicode)) {
394 case SSTATE_NOT_INTERNED: 389 case SSTATE_NOT_INTERNED:
395 break; 390 break;
396 391
397 case SSTATE_INTERNED_MORTAL: 392 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */ 393 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3; 394 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError( 396 Py_FatalError(
(...skipping 23 matching lines...) Expand all
425 free_list = unicode; 420 free_list = unicode;
426 numfree++; 421 numfree++;
427 } 422 }
428 else { 423 else {
429 PyObject_DEL(unicode->str); 424 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc); 425 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 426 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
432 } 427 }
433 } 428 }
434 429
435 static int 430 static
436 _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 431 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
437 { 432 {
438 register PyUnicodeObject *v; 433 register PyUnicodeObject *v;
439 434
440 /* Argument checks */ 435 /* Argument checks */
441 if (unicode == NULL) { 436 if (unicode == NULL) {
442 PyErr_BadInternalCall(); 437 PyErr_BadInternalCall();
443 return -1; 438 return -1;
444 } 439 }
445 v = *unicode; 440 v = *unicode;
446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 441 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
(...skipping 14 matching lines...) Expand all
461 Py_DECREF(*unicode); 456 Py_DECREF(*unicode);
462 *unicode = w; 457 *unicode = w;
463 return 0; 458 return 0;
464 } 459 }
465 460
466 /* Note that we don't have to modify *unicode for unshared Unicode 461 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */ 462 objects, since we can modify them in-place. */
468 return unicode_resize(v, length); 463 return unicode_resize(v, length);
469 } 464 }
470 465
471 int 466 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
472 PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
473 { 467 {
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 468 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475 } 469 }
476 470
477 PyObject * 471 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
478 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 472 Py_ssize_t size)
479 { 473 {
480 PyUnicodeObject *unicode; 474 PyUnicodeObject *unicode;
481 475
482 /* If the Unicode data is known at construction time, we can apply 476 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */ 477 some optimizations which share commonly used objects. */
484 if (u != NULL) { 478 if (u != NULL) {
485 479
486 /* Optimization for empty strings */ 480 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) { 481 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty); 482 Py_INCREF(unicode_empty);
(...skipping 20 matching lines...) Expand all
509 if (!unicode) 503 if (!unicode)
510 return NULL; 504 return NULL;
511 505
512 /* Copy the Unicode data into the new object */ 506 /* Copy the Unicode data into the new object */
513 if (u != NULL) 507 if (u != NULL)
514 Py_UNICODE_COPY(unicode->str, u, size); 508 Py_UNICODE_COPY(unicode->str, u, size);
515 509
516 return (PyObject *)unicode; 510 return (PyObject *)unicode;
517 } 511 }
518 512
519 PyObject * 513 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
520 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
521 { 514 {
522 PyUnicodeObject *unicode; 515 PyUnicodeObject *unicode;
523 516
524 if (size < 0) { 517 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError, 518 PyErr_SetString(PyExc_SystemError,
526 "Negative size passed to PyUnicode_FromStringAndSize"); 519 "Negative size passed to PyUnicode_FromStringAndSize");
527 return NULL; 520 return NULL;
528 } 521 }
529 522
530 /* If the Unicode data is known at construction time, we can apply 523 /* If the Unicode data is known at construction time, we can apply
(...skipping 26 matching lines...) Expand all
557 return PyUnicode_DecodeUTF8(u, size, NULL); 550 return PyUnicode_DecodeUTF8(u, size, NULL);
558 } 551 }
559 552
560 unicode = _PyUnicode_New(size); 553 unicode = _PyUnicode_New(size);
561 if (!unicode) 554 if (!unicode)
562 return NULL; 555 return NULL;
563 556
564 return (PyObject *)unicode; 557 return (PyObject *)unicode;
565 } 558 }
566 559
567 PyObject * 560 PyObject *PyUnicode_FromString(const char *u)
568 PyUnicode_FromString(const char *u)
569 { 561 {
570 size_t size = strlen(u); 562 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) { 563 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long"); 564 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL; 565 return NULL;
574 } 566 }
575 567
576 return PyUnicode_FromStringAndSize(u, size); 568 return PyUnicode_FromStringAndSize(u, size);
577 } 569 }
578 570
579 #ifdef HAVE_WCHAR_H 571 #ifdef HAVE_WCHAR_H
580 572
581 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 573 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582 # define CONVERT_WCHAR_TO_SURROGATES 574 # define CONVERT_WCHAR_TO_SURROGATES
583 #endif 575 #endif
584 576
585 #ifdef CONVERT_WCHAR_TO_SURROGATES 577 #ifdef CONVERT_WCHAR_TO_SURROGATES
586 578
587 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 579 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */ 580 to convert from UTF32 to UTF16. */
589 581
590 PyObject * 582 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
591 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 583 Py_ssize_t size)
592 { 584 {
593 PyUnicodeObject *unicode; 585 PyUnicodeObject *unicode;
594 register Py_ssize_t i; 586 register Py_ssize_t i;
595 Py_ssize_t alloc; 587 Py_ssize_t alloc;
596 const wchar_t *orig_w; 588 const wchar_t *orig_w;
597 589
598 if (w == NULL) { 590 if (w == NULL) {
599 if (size == 0) 591 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0); 592 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall(); 593 PyErr_BadInternalCall();
(...skipping 29 matching lines...) Expand all
631 } 623 }
632 else 624 else
633 *u++ = *w++; 625 *u++ = *w++;
634 } 626 }
635 } 627 }
636 return (PyObject *)unicode; 628 return (PyObject *)unicode;
637 } 629 }
638 630
639 #else 631 #else
640 632
641 PyObject * 633 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
642 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 634 Py_ssize_t size)
643 { 635 {
644 PyUnicodeObject *unicode; 636 PyUnicodeObject *unicode;
645 637
646 if (w == NULL) { 638 if (w == NULL) {
647 if (size == 0) 639 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0); 640 return PyUnicode_FromStringAndSize(NULL, 0);
649 PyErr_BadInternalCall(); 641 PyErr_BadInternalCall();
650 return NULL; 642 return NULL;
651 } 643 }
652 644
653 if (size == -1) { 645 if (size == -1) {
654 size = wcslen(w); 646 size = wcslen(w);
655 } 647 }
656 648
657 unicode = _PyUnicode_New(size); 649 unicode = _PyUnicode_New(size);
658 if (!unicode) 650 if (!unicode)
659 return NULL; 651 return NULL;
660 652
661 /* Copy the wchar_t data into the new object */ 653 /* Copy the wchar_t data into the new object */
662 #if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 654 #ifdef HAVE_USABLE_WCHAR_T
663 memcpy(unicode->str, w, size * sizeof(wchar_t)); 655 memcpy(unicode->str, w, size * sizeof(wchar_t));
664 #else 656 #else
665 { 657 {
666 register Py_UNICODE *u; 658 register Py_UNICODE *u;
667 register Py_ssize_t i; 659 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode); 660 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--) 661 for (i = size; i > 0; i--)
670 *u++ = *w++; 662 *u++ = *w++;
671 } 663 }
672 #endif 664 #endif
673 665
674 return (PyObject *)unicode; 666 return (PyObject *)unicode;
675 } 667 }
676 668
677 #endif /* CONVERT_WCHAR_TO_SURROGATES */ 669 #endif /* CONVERT_WCHAR_TO_SURROGATES */
678 670
679 #undef CONVERT_WCHAR_TO_SURROGATES 671 #undef CONVERT_WCHAR_TO_SURROGATES
680 672
681 static void 673 static void
682 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 674 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
683 int zeropad, int width, int precision, char c)
684 { 675 {
685 *fmt++ = '%'; 676 *fmt++ = '%';
686 if (width) { 677 if (width) {
687 if (zeropad) 678 if (zeropad)
688 *fmt++ = '0'; 679 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width); 680 fmt += sprintf(fmt, "%d", width);
690 } 681 }
691 if (precision) 682 if (precision)
692 fmt += sprintf(fmt, ".%d", precision); 683 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag) 684 if (longflag)
694 *fmt++ = 'l'; 685 *fmt++ = 'l';
695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698 #ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702 #else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706 #endif
707 }
708 else if (size_tflag) { 686 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T; 687 char *f = PY_FORMAT_SIZE_T;
710 while (*f) 688 while (*f)
711 *fmt++ = *f++; 689 *fmt++ = *f++;
712 } 690 }
713 *fmt++ = c; 691 *fmt++ = c;
714 *fmt = '\0'; 692 *fmt = '\0';
715 } 693 }
716 694
717 /* helper for PyUnicode_FromFormatV() */
718
719 static const char*
720 parse_format_flags(const char *f,
721 int *p_width, int *p_precision,
722 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
723 {
724 int width, precision, longflag, longlongflag, size_tflag;
725
726 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
727 f++;
728 width = 0;
729 while (Py_ISDIGIT((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 precision = 0;
732 if (*f == '.') {
733 f++;
734 while (Py_ISDIGIT((unsigned)*f))
735 precision = (precision*10) + *f++ - '0';
736 if (*f == '%') {
737 /* "%.3%s" => f points to "3" */
738 f--;
739 }
740 }
741 if (*f == '\0') {
742 /* bogus format "%.1" => go backward, f points to "1" */
743 f--;
744 }
745 if (p_width != NULL)
746 *p_width = width;
747 if (p_precision != NULL)
748 *p_precision = precision;
749
750 /* Handle %ld, %lu, %lld and %llu. */
751 longflag = 0;
752 longlongflag = 0;
753 size_tflag = 0;
754
755 if (*f == 'l') {
756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
757 longflag = 1;
758 ++f;
759 }
760 #ifdef HAVE_LONG_LONG
761 else if (f[1] == 'l' &&
762 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
763 longlongflag = 1;
764 f += 2;
765 }
766 #endif
767 }
768 /* handle the size_t flag. */
769 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
770 size_tflag = 1;
771 ++f;
772 }
773 if (p_longflag != NULL)
774 *p_longflag = longflag;
775 if (p_longlongflag != NULL)
776 *p_longlongflag = longlongflag;
777 if (p_size_tflag != NULL)
778 *p_size_tflag = size_tflag;
779 return f;
780 }
781
782 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 695 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
783
784 /* size of fixed-size buffer for formatting single arguments */
785 #define ITEM_BUFFER_LEN 21
786 /* maximum number of characters required for output of %ld. 21 characters
787 allows for 64-bit integers (in decimal) and an optional sign. */
788 #define MAX_LONG_CHARS 21
789 /* maximum number of characters required for output of %lld.
790 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
791 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
792 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793 696
794 PyObject * 697 PyObject *
795 PyUnicode_FromFormatV(const char *format, va_list vargs) 698 PyUnicode_FromFormatV(const char *format, va_list vargs)
796 { 699 {
797 va_list count; 700 va_list count;
798 Py_ssize_t callcount = 0; 701 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL; 702 PyObject **callresults = NULL;
800 PyObject **callresult = NULL; 703 PyObject **callresult = NULL;
801 Py_ssize_t n = 0; 704 Py_ssize_t n = 0;
802 int width = 0; 705 int width = 0;
803 int precision = 0; 706 int precision = 0;
804 int zeropad; 707 int zeropad;
805 const char* f; 708 const char* f;
806 Py_UNICODE *s; 709 Py_UNICODE *s;
807 PyObject *string; 710 PyObject *string;
808 /* used by sprintf */ 711 /* used by sprintf */
809 char buffer[ITEM_BUFFER_LEN+1]; 712 char buffer[21];
810 /* use abuffer instead of buffer, if we need more space 713 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */ 714 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL; 715 char *abuffer = NULL;
813 char *realbuffer; 716 char *realbuffer;
814 Py_ssize_t abuffersize = 0; 717 Py_ssize_t abuffersize = 0;
815 char fmt[61]; /* should be enough for %0width.precisionlld */ 718 char fmt[60]; /* should be enough for %0width.precisionld */
816 const char *copy; 719 const char *copy;
817 720
818 Py_VA_COPY(count, vargs); 721 #ifdef VA_LIST_IS_ARRAY
722 Py_MEMCPY(count, vargs, sizeof(va_list));
723 #else
724 #ifdef __va_copy
725 __va_copy(count, vargs);
726 #else
727 count = vargs;
728 #endif
729 #endif
819 /* step 1: count the number of %S/%R/%A/%s format specifications 730 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */ 733 * result in an array) */
823 for (f = format; *f; f++) { 734 for (f = format; *f; f++) {
824 if (*f == '%') { 735 if (*f == '%') {
825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 736 if (*(f+1)=='%')
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 737 continue;
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
828 ++callcount; 739 ++callcount;
829 } 740 while (ISDIGIT((unsigned)*f))
830 else if (128 <= (unsigned char)*f) { 741 width = (width*10) + *f++ - '0';
831 PyErr_Format(PyExc_ValueError, 742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 743 ;
833 "string, got a non-ASCII byte: 0x%02x", 744 if (*f == 's')
834 (unsigned char)*f); 745 ++callcount;
835 return NULL;
836 } 746 }
837 } 747 }
838 /* step 2: allocate memory for the results of 748 /* step 2: allocate memory for the results of
839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
840 if (callcount) { 750 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) { 752 if (!callresults) {
843 PyErr_NoMemory(); 753 PyErr_NoMemory();
844 return NULL; 754 return NULL;
845 } 755 }
846 callresult = callresults; 756 callresult = callresults;
847 } 757 }
848 /* step 3: figure out how large a buffer we need */ 758 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) { 759 for (f = format; *f; f++) {
850 if (*f == '%') { 760 if (*f == '%') {
851 #ifdef HAVE_LONG_LONG 761 const char* p = f;
852 int longlongflag; 762 width = 0;
853 #endif 763 while (ISDIGIT((unsigned)*f))
854 const char* p; 764 width = (width*10) + *f++ - '0';
855 765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
856 p = f; 766 ;
857 f = parse_format_flags(f, &width, NULL, 767
858 NULL, &longlongflag, NULL); 768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
769 * they don't affect the amount of space we reserve.
770 */
771 if ((*f == 'l' || *f == 'z') &&
772 (f[1] == 'd' || f[1] == 'u'))
773 ++f;
859 774
860 switch (*f) { 775 switch (*f) {
861 case 'c': 776 case 'c':
862 {
863 #ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869 #else
870 (void)va_arg(count, int); 777 (void)va_arg(count, int);
871 n++; 778 /* fall through... */
872 #endif
873 break;
874 }
875 case '%': 779 case '%':
876 n++; 780 n++;
877 break; 781 break;
878 case 'd': case 'u': case 'i': case 'x': 782 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int); 783 (void) va_arg(count, int);
880 #ifdef HAVE_LONG_LONG 784 /* 20 bytes is enough to hold a 64-bit
881 if (longlongflag) { 785 integer. Decimal takes the most space.
882 if (width < MAX_LONG_LONG_CHARS) 786 This isn't enough for octal.
883 width = MAX_LONG_LONG_CHARS; 787 If a width is specified we need more
884 } 788 (which we allocate later). */
885 else 789 if (width < 20)
886 #endif 790 width = 20;
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
893 n += width; 791 n += width;
894 /* XXX should allow for large precision here too. */
895 if (abuffersize < width) 792 if (abuffersize < width)
896 abuffersize = width; 793 abuffersize = width;
897 break; 794 break;
898 case 's': 795 case 's':
899 { 796 {
900 /* UTF-8 */ 797 /* UTF-8 */
901 const char *s = va_arg(count, const char*); 798 const char *s = va_arg(count, const char*);
902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 799 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str) 800 if (!str)
904 goto fail; 801 goto fail;
905 n += PyUnicode_GET_SIZE(str); 802 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */ 803 /* Remember the str and switch to the next slot */
907 *callresult++ = str; 804 *callresult++ = str;
908 break; 805 break;
909 } 806 }
910 case 'U': 807 case 'U':
911 { 808 {
912 PyObject *obj = va_arg(count, PyObject *); 809 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj)); 810 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj); 811 n += PyUnicode_GET_SIZE(obj);
915 break; 812 break;
916 } 813 }
917 case 'V': 814 case 'V':
918 { 815 {
919 PyObject *obj = va_arg(count, PyObject *); 816 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *); 817 const char *str = va_arg(count, const char *);
921 PyObject *str_obj;
922 assert(obj || str); 818 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj)); 819 assert(!obj || PyUnicode_Check(obj));
924 if (obj) { 820 if (obj)
925 n += PyUnicode_GET_SIZE(obj); 821 n += PyUnicode_GET_SIZE(obj);
926 *callresult++ = NULL; 822 else
927 } 823 n += strlen(str);
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
935 break; 824 break;
936 } 825 }
937 case 'S': 826 case 'S':
938 { 827 {
939 PyObject *obj = va_arg(count, PyObject *); 828 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str; 829 PyObject *str;
941 assert(obj); 830 assert(obj);
942 str = PyObject_Str(obj); 831 str = PyObject_Str(obj);
943 if (!str) 832 if (!str)
944 goto fail; 833 goto fail;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
989 string. (we cannot just skip the 878 string. (we cannot just skip the
990 code, since there's no way to know 879 code, since there's no way to know
991 what's in the argument list) */ 880 what's in the argument list) */
992 n += strlen(p); 881 n += strlen(p);
993 goto expand; 882 goto expand;
994 } 883 }
995 } else 884 } else
996 n++; 885 n++;
997 } 886 }
998 expand: 887 expand:
999 if (abuffersize > ITEM_BUFFER_LEN) { 888 if (abuffersize > 20) {
1000 /* add 1 for sprintf's trailing null byte */ 889 abuffer = PyObject_Malloc(abuffersize);
1001 abuffer = PyObject_Malloc(abuffersize + 1);
1002 if (!abuffer) { 890 if (!abuffer) {
1003 PyErr_NoMemory(); 891 PyErr_NoMemory();
1004 goto fail; 892 goto fail;
1005 } 893 }
1006 realbuffer = abuffer; 894 realbuffer = abuffer;
1007 } 895 }
1008 else 896 else
1009 realbuffer = buffer; 897 realbuffer = buffer;
1010 /* step 4: fill the buffer */ 898 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case, 899 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string. 900 we don't have to resize the string.
1013 There can be no errors beyond this point. */ 901 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n); 902 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string) 903 if (!string)
1016 goto fail; 904 goto fail;
1017 905
1018 s = PyUnicode_AS_UNICODE(string); 906 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults; 907 callresult = callresults;
1020 908
1021 for (f = format; *f; f++) { 909 for (f = format; *f; f++) {
1022 if (*f == '%') { 910 if (*f == '%') {
1023 const char* p; 911 const char* p = f++;
1024 int longflag; 912 int longflag = 0;
1025 int longlongflag; 913 int size_tflag = 0;
1026 int size_tflag; 914 zeropad = (*f == '0');
1027 915 /* parse the width.precision part */
1028 p = f; 916 width = 0;
1029 zeropad = (f[1] == '0'); 917 while (ISDIGIT((unsigned)*f))
1030 f = parse_format_flags(f, &width, &precision, 918 width = (width*10) + *f++ - '0';
1031 &longflag, &longlongflag, &size_tflag); 919 precision = 0;
920 if (*f == '.') {
921 f++;
922 while (ISDIGIT((unsigned)*f))
923 precision = (precision*10) + *f++ - '0';
924 }
925 /* handle the long flag, but only for %ld and %lu.
926 others can be added when necessary. */
927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
928 longflag = 1;
929 ++f;
930 }
931 /* handle the size_t flag. */
932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
933 size_tflag = 1;
934 ++f;
935 }
1032 936
1033 switch (*f) { 937 switch (*f) {
1034 case 'c': 938 case 'c':
1035 { 939 *s++ = va_arg(vargs, int);
1036 int ordinal = va_arg(vargs, int);
1037 #ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043 #endif
1044 *s++ = ordinal;
1045 break; 940 break;
1046 }
1047 case 'i':
1048 case 'd': 941 case 'd':
1049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 942 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd ');
1050 width, precision, *f);
1051 if (longflag) 943 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long)); 944 sprintf(realbuffer, fmt, va_arg(vargs, long));
1053 #ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056 #endif
1057 else if (size_tflag) 945 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 946 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else 947 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int)); 948 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer); 949 appendstring(realbuffer);
1062 break; 950 break;
1063 case 'u': 951 case 'u':
1064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u ');
1065 width, precision, 'u');
1066 if (longflag) 953 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1068 #ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072 #endif
1073 else if (size_tflag) 955 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 956 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else 957 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 958 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer); 959 appendstring(realbuffer);
1078 break; 960 break;
961 case 'i':
962 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
963 sprintf(realbuffer, fmt, va_arg(vargs, int));
964 appendstring(realbuffer);
965 break;
1079 case 'x': 966 case 'x':
1080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 967 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
1081 sprintf(realbuffer, fmt, va_arg(vargs, int)); 968 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer); 969 appendstring(realbuffer);
1083 break; 970 break;
1084 case 's': 971 case 's':
1085 { 972 {
1086 /* unused, since we already have the result */ 973 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *); 974 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 975 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult)); 976 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult); 977 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */ 978 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult); 979 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */ 980 /* switch to next unicode()/repr() result */
1094 ++callresult; 981 ++callresult;
1095 break; 982 break;
1096 } 983 }
1097 case 'U': 984 case 'U':
1098 { 985 {
1099 PyObject *obj = va_arg(vargs, PyObject *); 986 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 987 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 988 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size; 989 s += size;
1103 break; 990 break;
1104 } 991 }
1105 case 'V': 992 case 'V':
1106 { 993 {
1107 PyObject *obj = va_arg(vargs, PyObject *); 994 PyObject *obj = va_arg(vargs, PyObject *);
1108 va_arg(vargs, const char *); 995 const char *str = va_arg(vargs, const char *);
1109 if (obj) { 996 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size; 999 s += size;
1113 } else { 1000 } else {
1114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1001 appendstring(str);
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
1118 } 1002 }
1119 ++callresult;
1120 break; 1003 break;
1121 } 1004 }
1122 case 'S': 1005 case 'S':
1123 case 'R': 1006 case 'R':
1124 case 'A':
1125 { 1007 {
1126 Py_UNICODE *ucopy; 1008 Py_UNICODE *ucopy;
1127 Py_ssize_t usize; 1009 Py_ssize_t usize;
1128 Py_ssize_t upos; 1010 Py_ssize_t upos;
1129 /* unused, since we already have the result */ 1011 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *); 1012 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult); 1013 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult); 1014 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;) 1015 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++]; 1016 *s++ = ucopy[upos++];
(...skipping 15 matching lines...) Expand all
1150 } 1032 }
1151 appendstring(buffer); 1033 appendstring(buffer);
1152 break; 1034 break;
1153 case '%': 1035 case '%':
1154 *s++ = '%'; 1036 *s++ = '%';
1155 break; 1037 break;
1156 default: 1038 default:
1157 appendstring(p); 1039 appendstring(p);
1158 goto end; 1040 goto end;
1159 } 1041 }
1160 } 1042 } else
1161 else
1162 *s++ = *f; 1043 *s++ = *f;
1163 } 1044 }
1164 1045
1165 end: 1046 end:
1166 if (callresults) 1047 if (callresults)
1167 PyObject_Free(callresults); 1048 PyObject_Free(callresults);
1168 if (abuffer) 1049 if (abuffer)
1169 PyObject_Free(abuffer); 1050 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1051 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string; 1052 return string;
1172 fail: 1053 fail:
1173 if (callresults) { 1054 if (callresults) {
1174 PyObject **callresult2 = callresults; 1055 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) { 1056 while (callresult2 < callresult) {
1176 Py_XDECREF(*callresult2); 1057 Py_DECREF(*callresult2);
1177 ++callresult2; 1058 ++callresult2;
1178 } 1059 }
1179 PyObject_Free(callresults); 1060 PyObject_Free(callresults);
1180 } 1061 }
1181 if (abuffer) 1062 if (abuffer)
1182 PyObject_Free(abuffer); 1063 PyObject_Free(abuffer);
1183 return NULL; 1064 return NULL;
1184 } 1065 }
1185 1066
1186 #undef appendstring 1067 #undef appendstring
1187 1068
1188 PyObject * 1069 PyObject *
1189 PyUnicode_FromFormat(const char *format, ...) 1070 PyUnicode_FromFormat(const char *format, ...)
1190 { 1071 {
1191 PyObject* ret; 1072 PyObject* ret;
1192 va_list vargs; 1073 va_list vargs;
1193 1074
1194 #ifdef HAVE_STDARG_PROTOTYPES 1075 #ifdef HAVE_STDARG_PROTOTYPES
1195 va_start(vargs, format); 1076 va_start(vargs, format);
1196 #else 1077 #else
1197 va_start(vargs); 1078 va_start(vargs);
1198 #endif 1079 #endif
1199 ret = PyUnicode_FromFormatV(format, vargs); 1080 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs); 1081 va_end(vargs);
1201 return ret; 1082 return ret;
1202 } 1083 }
1203 1084
1204 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1085 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1205 convert a Unicode object to a wide character string. 1086 wchar_t *w,
1206 1087 Py_ssize_t size)
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213 static Py_ssize_t
1214 unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217 {
1218 #if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231 #elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
1237 u = PyUnicode_AS_UNICODE(unicode);
1238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271 #elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308 #else
1309 # error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1310 #endif
1311 }
1312
1313 Py_ssize_t
1314 PyUnicode_AsWideChar(PyObject *unicode,
1315 wchar_t *w,
1316 Py_ssize_t size)
1317 { 1088 {
1318 if (unicode == NULL) { 1089 if (unicode == NULL) {
1319 PyErr_BadInternalCall(); 1090 PyErr_BadInternalCall();
1320 return -1; 1091 return -1;
1321 } 1092 }
1322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 1093
1323 } 1094 /* If possible, try to copy the 0-termination as well */
1324 1095 if (size > PyUnicode_GET_SIZE(unicode))
1325 wchar_t* 1096 size = PyUnicode_GET_SIZE(unicode) + 1;
1326 PyUnicode_AsWideCharString(PyObject *unicode, 1097
1327 Py_ssize_t *size) 1098 #ifdef HAVE_USABLE_WCHAR_T
1328 { 1099 memcpy(w, unicode->str, size * sizeof(wchar_t));
1329 wchar_t* buffer; 1100 #else
1330 Py_ssize_t buflen; 1101 {
1331 1102 register Py_UNICODE *u;
1332 if (unicode == NULL) { 1103 register Py_ssize_t i;
1333 PyErr_BadInternalCall(); 1104 u = PyUnicode_AS_UNICODE(unicode);
1334 return NULL; 1105 for (i = size; i > 0; i--)
1335 } 1106 *w++ = *u++;
1336 1107 }
1337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
1343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
1348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1349 if (size != NULL)
1350 *size = buflen;
1351 return buffer;
1352 }
1353
1354 #endif 1108 #endif
1355 1109
1356 PyObject * 1110 if (size > PyUnicode_GET_SIZE(unicode))
1357 PyUnicode_FromOrdinal(int ordinal) 1111 return PyUnicode_GET_SIZE(unicode);
1112 else
1113 return size;
1114 }
1115
1116 #endif
1117
1118 PyObject *PyUnicode_FromOrdinal(int ordinal)
1358 { 1119 {
1359 Py_UNICODE s[2]; 1120 Py_UNICODE s[2];
1360 1121
1361 if (ordinal < 0 || ordinal > 0x10ffff) { 1122 if (ordinal < 0 || ordinal > 0x10ffff) {
1362 PyErr_SetString(PyExc_ValueError, 1123 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)"); 1124 "chr() arg not in range(0x110000)");
1364 return NULL; 1125 return NULL;
1365 } 1126 }
1366 1127
1367 #ifndef Py_UNICODE_WIDE 1128 #ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) { 1129 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000; 1130 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10); 1131 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF); 1132 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2); 1133 return PyUnicode_FromUnicode(s, 2);
1373 } 1134 }
1374 #endif 1135 #endif
1375 1136
1376 s[0] = (Py_UNICODE)ordinal; 1137 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1); 1138 return PyUnicode_FromUnicode(s, 1);
1378 } 1139 }
1379 1140
1380 PyObject * 1141 PyObject *PyUnicode_FromObject(register PyObject *obj)
1381 PyUnicode_FromObject(register PyObject *obj)
1382 { 1142 {
1383 /* XXX Perhaps we should make this API an alias of 1143 /* XXX Perhaps we should make this API an alias of
1384 PyObject_Str() instead ?! */ 1144 PyObject_Str() instead ?! */
1385 if (PyUnicode_CheckExact(obj)) { 1145 if (PyUnicode_CheckExact(obj)) {
1386 Py_INCREF(obj); 1146 Py_INCREF(obj);
1387 return obj; 1147 return obj;
1388 } 1148 }
1389 if (PyUnicode_Check(obj)) { 1149 if (PyUnicode_Check(obj)) {
1390 /* For a Unicode subtype that's not a Unicode object, 1150 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */ 1151 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj)); 1153 PyUnicode_GET_SIZE(obj));
1394 } 1154 }
1395 PyErr_Format(PyExc_TypeError, 1155 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly", 1156 "Can't convert '%.100s' object to str implicitly",
1397 Py_TYPE(obj)->tp_name); 1157 Py_TYPE(obj)->tp_name);
1398 return NULL; 1158 return NULL;
1399 } 1159 }
1400 1160
1401 PyObject * 1161 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1402 PyUnicode_FromEncodedObject(register PyObject *obj, 1162 const char *encoding,
1403 » » » const char *encoding, 1163 const char *errors)
1404 » » » const char *errors)
1405 { 1164 {
1406 Py_buffer buffer; 1165 Py_buffer buffer;
1407 PyObject *v; 1166 PyObject *v;
1408 1167
1409 if (obj == NULL) { 1168 if (obj == NULL) {
1410 PyErr_BadInternalCall(); 1169 PyErr_BadInternalCall();
1411 return NULL; 1170 return NULL;
1412 } 1171 }
1413 1172
1414 /* Decoding bytes objects is the most common case and should be fast */ 1173 /* Decoding bytes objects is the most common case and should be fast */
(...skipping 29 matching lines...) Expand all
1444 Py_INCREF(unicode_empty); 1203 Py_INCREF(unicode_empty);
1445 v = (PyObject *) unicode_empty; 1204 v = (PyObject *) unicode_empty;
1446 } 1205 }
1447 else 1206 else
1448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1207 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1449 1208
1450 PyBuffer_Release(&buffer); 1209 PyBuffer_Release(&buffer);
1451 return v; 1210 return v;
1452 } 1211 }
1453 1212
1454 /* Convert encoding to lower case and replace '_' with '-' in order to 1213 PyObject *PyUnicode_Decode(const char *s,
1455 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1214 Py_ssize_t size,
1456 1 on success. */ 1215 const char *encoding,
1457 static int 1216 const char *errors)
1458 normalize_encoding(const char *encoding, 1217 {
1459 char *lower, 1218 PyObject *buffer = NULL, *unicode;
1460 size_t lower_len) 1219 Py_buffer info;
1461 { 1220 char lower[20]; /* Enough for any encoding name we recognize */
1221 char *l;
1462 const char *e; 1222 const char *e;
1463 char *l; 1223
1464 char *l_end; 1224 if (encoding == NULL)
1465 1225 encoding = PyUnicode_GetDefaultEncoding();
1226
1227 /* Convert encoding to lower case and replace '_' with '-' in order to
1228 catch e.g. UTF_8 */
1466 e = encoding; 1229 e = encoding;
1467 l = lower; 1230 l = lower;
1468 l_end = &lower[lower_len - 1]; 1231 while (*e && l < &lower[(sizeof lower) - 2]) {
1469 while (*e) { 1232 if (ISUPPER(*e)) {
1470 if (l == l_end) 1233 *l++ = TOLOWER(*e++);
1471 return 0;
1472 if (Py_ISUPPER(*e)) {
1473 *l++ = Py_TOLOWER(*e++);
1474 } 1234 }
1475 else if (*e == '_') { 1235 else if (*e == '_') {
1476 *l++ = '-'; 1236 *l++ = '-';
1477 e++; 1237 e++;
1478 } 1238 }
1479 else { 1239 else {
1480 *l++ = *e++; 1240 *l++ = *e++;
1481 } 1241 }
1482 } 1242 }
1483 *l = '\0'; 1243 *l = '\0';
1484 return 1; 1244
1485 } 1245 /* Shortcuts for common default encodings */
1486 1246 if (strcmp(lower, "utf-8") == 0)
1487 PyObject *
1488 PyUnicode_Decode(const char *s,
1489 » » Py_ssize_t size,
1490 » » const char *encoding,
1491 » » const char *errors)
1492 {
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
1498 return PyUnicode_DecodeUTF8(s, size, errors); 1247 return PyUnicode_DecodeUTF8(s, size, errors);
1499 1248 else if ((strcmp(lower, "latin-1") == 0) ||
1500 /* Shortcuts for common default encodings */ 1249 (strcmp(lower, "iso-8859-1") == 0))
1501 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1250 return PyUnicode_DecodeLatin1(s, size, errors);
1502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
1504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
1506 (strcmp(lower, "latin1") == 0) ||
1507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
1509 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1510 else if (strcmp(lower, "mbcs") == 0) 1252 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors); 1253 return PyUnicode_DecodeMBCS(s, size, errors);
1512 #endif 1254 #endif
1513 else if (strcmp(lower, "ascii") == 0) 1255 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors); 1256 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0) 1257 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1258 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0) 1259 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1260 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
1520 1261
1521 /* Decode via the codec registry */ 1262 /* Decode via the codec registry */
1522 buffer = NULL; 1263 buffer = NULL;
1523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1264 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1524 goto onError; 1265 goto onError;
1525 buffer = PyMemoryView_FromBuffer(&info); 1266 buffer = PyMemoryView_FromBuffer(&info);
1526 if (buffer == NULL) 1267 if (buffer == NULL)
1527 goto onError; 1268 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors); 1269 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL) 1270 if (unicode == NULL)
1530 goto onError; 1271 goto onError;
1531 if (!PyUnicode_Check(unicode)) { 1272 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError, 1273 PyErr_Format(PyExc_TypeError,
1533 "decoder did not return a str object (type=%.400s)", 1274 "decoder did not return a str object (type=%.400s)",
1534 Py_TYPE(unicode)->tp_name); 1275 Py_TYPE(unicode)->tp_name);
1535 Py_DECREF(unicode); 1276 Py_DECREF(unicode);
1536 goto onError; 1277 goto onError;
1537 } 1278 }
1538 Py_DECREF(buffer); 1279 Py_DECREF(buffer);
1539 return unicode; 1280 return unicode;
1540 1281
1541 onError: 1282 onError:
1542 Py_XDECREF(buffer); 1283 Py_XDECREF(buffer);
1543 return NULL; 1284 return NULL;
1544 } 1285 }
1545 1286
1546 PyObject * 1287 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1547 PyUnicode_AsDecodedObject(PyObject *unicode, 1288 const char *encoding,
1548 » » » const char *encoding, 1289 const char *errors)
1549 » » » const char *errors)
1550 { 1290 {
1551 PyObject *v; 1291 PyObject *v;
1552 1292
1553 if (!PyUnicode_Check(unicode)) { 1293 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument(); 1294 PyErr_BadArgument();
1555 goto onError; 1295 goto onError;
1556 } 1296 }
1557 1297
1558 if (encoding == NULL) 1298 if (encoding == NULL)
1559 encoding = PyUnicode_GetDefaultEncoding(); 1299 encoding = PyUnicode_GetDefaultEncoding();
1560 1300
1561 /* Decode via the codec registry */ 1301 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors); 1302 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL) 1303 if (v == NULL)
1564 goto onError; 1304 goto onError;
1565 return v; 1305 return v;
1566 1306
1567 onError: 1307 onError:
1568 return NULL; 1308 return NULL;
1569 } 1309 }
1570 1310
1571 PyObject * 1311 PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1572 PyUnicode_AsDecodedUnicode(PyObject *unicode, 1312 const char *encoding,
1573 » » » const char *encoding, 1313 const char *errors)
1574 » » » const char *errors)
1575 { 1314 {
1576 PyObject *v; 1315 PyObject *v;
1577 1316
1578 if (!PyUnicode_Check(unicode)) { 1317 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument(); 1318 PyErr_BadArgument();
1580 goto onError; 1319 goto onError;
1581 } 1320 }
1582 1321
1583 if (encoding == NULL) 1322 if (encoding == NULL)
1584 encoding = PyUnicode_GetDefaultEncoding(); 1323 encoding = PyUnicode_GetDefaultEncoding();
1585 1324
1586 /* Decode via the codec registry */ 1325 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors); 1326 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL) 1327 if (v == NULL)
1589 goto onError; 1328 goto onError;
1590 if (!PyUnicode_Check(v)) { 1329 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError, 1330 PyErr_Format(PyExc_TypeError,
1592 "decoder did not return a str object (type=%.400s)", 1331 "decoder did not return a str object (type=%.400s)",
1593 Py_TYPE(v)->tp_name); 1332 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v); 1333 Py_DECREF(v);
1595 goto onError; 1334 goto onError;
1596 } 1335 }
1597 return v; 1336 return v;
1598 1337
1599 onError: 1338 onError:
1600 return NULL; 1339 return NULL;
1601 } 1340 }
1602 1341
1603 PyObject * 1342 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1604 PyUnicode_Encode(const Py_UNICODE *s, 1343 Py_ssize_t size,
1605 » » Py_ssize_t size, 1344 const char *encoding,
1606 » » const char *encoding, 1345 const char *errors)
1607 » » const char *errors)
1608 { 1346 {
1609 PyObject *v, *unicode; 1347 PyObject *v, *unicode;
1610 1348
1611 unicode = PyUnicode_FromUnicode(s, size); 1349 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL) 1350 if (unicode == NULL)
1613 return NULL; 1351 return NULL;
1614 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1352 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode); 1353 Py_DECREF(unicode);
1616 return v; 1354 return v;
1617 } 1355 }
1618 1356
1619 PyObject * 1357 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1620 PyUnicode_AsEncodedObject(PyObject *unicode, 1358 const char *encoding,
1621 » » » const char *encoding, 1359 const char *errors)
1622 » » » const char *errors)
1623 { 1360 {
1624 PyObject *v; 1361 PyObject *v;
1625 1362
1626 if (!PyUnicode_Check(unicode)) { 1363 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument(); 1364 PyErr_BadArgument();
1628 goto onError; 1365 goto onError;
1629 } 1366 }
1630 1367
1631 if (encoding == NULL) 1368 if (encoding == NULL)
1632 encoding = PyUnicode_GetDefaultEncoding(); 1369 encoding = PyUnicode_GetDefaultEncoding();
1633 1370
1634 /* Encode via the codec registry */ 1371 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors); 1372 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL) 1373 if (v == NULL)
1637 goto onError; 1374 goto onError;
1638 return v; 1375 return v;
1639 1376
1640 onError: 1377 onError:
1641 return NULL; 1378 return NULL;
1642 } 1379 }
1643 1380
1644 PyObject * 1381 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1645 PyUnicode_EncodeFSDefault(PyObject *unicode) 1382 const char *encoding,
1646 { 1383 const char *errors)
1647 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651 #elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655 #else
1656 if (Py_FileSystemDefaultEncoding) {
1657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
1660 }
1661 else {
1662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
1666 size_t error_pos;
1667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
1671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
1688 return NULL;
1689 }
1690 PyMem_Free(wchar);
1691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
1695 }
1696 #endif
1697 }
1698
1699 PyObject *
1700 PyUnicode_AsEncodedString(PyObject *unicode,
1701 » » » const char *encoding,
1702 » » » const char *errors)
1703 { 1384 {
1704 PyObject *v; 1385 PyObject *v;
1705 char lower[11]; /* Enough for any encoding shortcut */
1706 1386
1707 if (!PyUnicode_Check(unicode)) { 1387 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument(); 1388 PyErr_BadArgument();
1709 return NULL; 1389 return NULL;
1710 } 1390 }
1711 1391
1712 if (encoding == NULL) { 1392 if (encoding == NULL)
1713 if (errors == NULL || strcmp(errors, "strict") == 0) 1393 encoding = PyUnicode_GetDefaultEncoding();
1394
1395 /* Shortcuts for common default encodings */
1396 if (errors == NULL) {
1397 if (strcmp(encoding, "utf-8") == 0)
1714 return PyUnicode_AsUTF8String(unicode); 1398 return PyUnicode_AsUTF8String(unicode);
1715 else 1399 else if (strcmp(encoding, "latin-1") == 0)
1716 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1400 return PyUnicode_AsLatin1String(unicode);
1717 PyUnicode_GET_SIZE(unicode),
1718 errors);
1719 }
1720
1721 /* Shortcuts for common default encodings */
1722 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1723 if ((strcmp(lower, "utf-8") == 0) ||
1724 (strcmp(lower, "utf8") == 0))
1725 {
1726 if (errors == NULL || strcmp(errors, "strict") == 0)
1727 return PyUnicode_AsUTF8String(unicode);
1728 else
1729 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1730 PyUnicode_GET_SIZE(unicode),
1731 errors);
1732 }
1733 else if ((strcmp(lower, "latin-1") == 0) ||
1734 (strcmp(lower, "latin1") == 0) ||
1735 (strcmp(lower, "iso-8859-1") == 0))
1736 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1737 PyUnicode_GET_SIZE(unicode),
1738 errors);
1739 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1401 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1740 else if (strcmp(lower, "mbcs") == 0) 1402 else if (strcmp(encoding, "mbcs") == 0)
1741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1403 return PyUnicode_AsMBCSString(unicode);
1742 PyUnicode_GET_SIZE(unicode),
1743 errors);
1744 #endif 1404 #endif
1745 else if (strcmp(lower, "ascii") == 0) 1405 else if (strcmp(encoding, "ascii") == 0)
1746 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1406 return PyUnicode_AsASCIIString(unicode);
1747 PyUnicode_GET_SIZE(unicode), 1407 /* During bootstrap, we may need to find the encodings
1748 errors); 1408 package, to load the file system encoding, and require the
1409 file system encoding in order to load the encodings
1410 package.
1411
1412 Break out of this dependency by assuming that the path to
1413 the encodings module is ASCII-only. XXX could try wcstombs
1414 instead, if the file system encoding is the locale's
1415 encoding. */
1416 else if (Py_FileSystemDefaultEncoding &&
1417 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1418 !PyThreadState_GET()->interp->codecs_initialized)
1419 return PyUnicode_AsASCIIString(unicode);
1749 } 1420 }
1750 1421
1751 /* Encode via the codec registry */ 1422 /* Encode via the codec registry */
1752 v = PyCodec_Encode(unicode, encoding, errors); 1423 v = PyCodec_Encode(unicode, encoding, errors);
1753 if (v == NULL) 1424 if (v == NULL)
1754 return NULL; 1425 return NULL;
1755 1426
1756 /* The normal path */ 1427 /* The normal path */
1757 if (PyBytes_Check(v)) 1428 if (PyBytes_Check(v))
1758 return v; 1429 return v;
1759 1430
1760 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1431 /* If the codec returns a buffer, raise a warning and convert to bytes */
1761 if (PyByteArray_Check(v)) { 1432 if (PyByteArray_Check(v)) {
1762 int error; 1433 char msg[100];
1763 PyObject *b; 1434 PyObject *b;
1764 1435 PyOS_snprintf(msg, sizeof(msg),
1765 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1436 "encoder %s returned buffer instead of bytes",
1766 "encoder %s returned bytearray instead of bytes", 1437 encoding);
1767 encoding); 1438 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1768 if (error) {
1769 Py_DECREF(v); 1439 Py_DECREF(v);
1770 return NULL; 1440 return NULL;
1771 } 1441 }
1772 1442
1773 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1443 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1774 Py_DECREF(v); 1444 Py_DECREF(v);
1775 return b; 1445 return b;
1776 } 1446 }
1777 1447
1778 PyErr_Format(PyExc_TypeError, 1448 PyErr_Format(PyExc_TypeError,
1779 "encoder did not return a bytes object (type=%.400s)", 1449 "encoder did not return a bytes object (type=%.400s)",
1780 Py_TYPE(v)->tp_name); 1450 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v); 1451 Py_DECREF(v);
1782 return NULL; 1452 return NULL;
1783 } 1453 }
1784 1454
1785 PyObject * 1455 PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1786 PyUnicode_AsEncodedUnicode(PyObject *unicode, 1456 const char *encoding,
1787 » » » const char *encoding, 1457 const char *errors)
1788 » » » const char *errors)
1789 { 1458 {
1790 PyObject *v; 1459 PyObject *v;
1791 1460
1792 if (!PyUnicode_Check(unicode)) { 1461 if (!PyUnicode_Check(unicode)) {
1793 PyErr_BadArgument(); 1462 PyErr_BadArgument();
1794 goto onError; 1463 goto onError;
1795 } 1464 }
1796 1465
1797 if (encoding == NULL) 1466 if (encoding == NULL)
1798 encoding = PyUnicode_GetDefaultEncoding(); 1467 encoding = PyUnicode_GetDefaultEncoding();
1799 1468
1800 /* Encode via the codec registry */ 1469 /* Encode via the codec registry */
1801 v = PyCodec_Encode(unicode, encoding, errors); 1470 v = PyCodec_Encode(unicode, encoding, errors);
1802 if (v == NULL) 1471 if (v == NULL)
1803 goto onError; 1472 goto onError;
1804 if (!PyUnicode_Check(v)) { 1473 if (!PyUnicode_Check(v)) {
1805 PyErr_Format(PyExc_TypeError, 1474 PyErr_Format(PyExc_TypeError,
1806 "encoder did not return an str object (type=%.400s)", 1475 "encoder did not return an str object (type=%.400s)",
1807 Py_TYPE(v)->tp_name); 1476 Py_TYPE(v)->tp_name);
1808 Py_DECREF(v); 1477 Py_DECREF(v);
1809 goto onError; 1478 goto onError;
1810 } 1479 }
1811 return v; 1480 return v;
1812 1481
1813 onError: 1482 onError:
1814 return NULL; 1483 return NULL;
1815 } 1484 }
1816 1485
1817 PyObject * 1486 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1818 _PyUnicode_AsDefaultEncodedString(PyObject *unicode) 1487 const char *errors)
1819 { 1488 {
1820 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1489 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1821 if (v) 1490 if (v)
1822 return v; 1491 return v;
1492 if (errors != NULL)
1493 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1823 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1494 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1824 PyUnicode_GET_SIZE(unicode), 1495 PyUnicode_GET_SIZE(unicode),
1825 NULL); 1496 NULL);
1826 if (!v) 1497 if (!v)
1827 return NULL; 1498 return NULL;
1828 ((PyUnicodeObject *)unicode)->defenc = v; 1499 ((PyUnicodeObject *)unicode)->defenc = v;
1829 return v; 1500 return v;
1830 } 1501 }
1831 1502
1832 PyObject* 1503 PyObject*
1833 PyUnicode_DecodeFSDefault(const char *s) { 1504 PyUnicode_DecodeFSDefault(const char *s) {
1834 Py_ssize_t size = (Py_ssize_t)strlen(s); 1505 Py_ssize_t size = (Py_ssize_t)strlen(s);
1835 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1506 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1836 } 1507 }
1837 1508
1838 PyObject* 1509 PyObject*
1839 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1510 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1840 { 1511 {
1841 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1842 return PyUnicode_DecodeMBCS(s, size, NULL);
1843 #elif defined(__APPLE__)
1844 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1845 #else
1846 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1512 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1847 can be undefined. If it is case, decode using UTF-8. The following assume s 1513 can be undefined. If it is case, decode using UTF-8. The following assume s
1848 that Py_FileSystemDefaultEncoding is set to a built-in encoding during th e 1514 that Py_FileSystemDefaultEncoding is set to a built-in encoding during th e
1849 bootstrapping process where the codecs aren't ready yet. 1515 bootstrapping process where the codecs aren't ready yet.
1850 */ 1516 */
1851 if (Py_FileSystemDefaultEncoding) { 1517 if (Py_FileSystemDefaultEncoding) {
1518 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1519 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1520 return PyUnicode_DecodeMBCS(s, size, "replace");
1521 }
1522 #elif defined(__APPLE__)
1523 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1524 return PyUnicode_DecodeUTF8(s, size, "replace");
1525 }
1526 #endif
1852 return PyUnicode_Decode(s, size, 1527 return PyUnicode_Decode(s, size,
1853 Py_FileSystemDefaultEncoding, 1528 Py_FileSystemDefaultEncoding,
1854 "surrogateescape"); 1529 "replace");
1855 } 1530 }
1856 else { 1531 else {
1857 /* locale encoding with surrogateescape */ 1532 return PyUnicode_DecodeUTF8(s, size, "replace");
1858 wchar_t *wchar; 1533 }
1859 PyObject *unicode; 1534 }
1860 size_t len; 1535
1861 1536 /* Convert the argument to a bytes object, according to the file
1862 if (s[size] != '\0' || size != strlen(s)) { 1537 system encoding */
1863 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1864 return NULL;
1865 }
1866
1867 wchar = _Py_char2wchar(s, &len);
1868 if (wchar == NULL)
1869 return PyErr_NoMemory();
1870
1871 unicode = PyUnicode_FromWideChar(wchar, len);
1872 PyMem_Free(wchar);
1873 return unicode;
1874 }
1875 #endif
1876 }
1877
1878 1538
1879 int 1539 int
1880 PyUnicode_FSConverter(PyObject* arg, void* addr) 1540 PyUnicode_FSConverter(PyObject* arg, void* addr)
1881 { 1541 {
1882 PyObject *output = NULL; 1542 PyObject *output = NULL;
1883 Py_ssize_t size; 1543 Py_ssize_t size;
1884 void *data; 1544 void *data;
1885 if (arg == NULL) { 1545 if (arg == NULL) {
1886 Py_DECREF(*(PyObject**)addr); 1546 Py_DECREF(*(PyObject**)addr);
1887 return 1; 1547 return 1;
1888 } 1548 }
1889 if (PyBytes_Check(arg)) { 1549 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1890 output = arg; 1550 output = arg;
1891 Py_INCREF(output); 1551 Py_INCREF(output);
1892 } 1552 }
1893 else { 1553 else {
1894 arg = PyUnicode_FromObject(arg); 1554 arg = PyUnicode_FromObject(arg);
1895 if (!arg) 1555 if (!arg)
1896 return 0; 1556 return 0;
1897 output = PyUnicode_EncodeFSDefault(arg); 1557 output = PyUnicode_AsEncodedObject(arg,
1558 Py_FileSystemDefaultEncoding,
1559 "surrogateescape");
1898 Py_DECREF(arg); 1560 Py_DECREF(arg);
1899 if (!output) 1561 if (!output)
1900 return 0; 1562 return 0;
1901 if (!PyBytes_Check(output)) { 1563 if (!PyBytes_Check(output)) {
1902 Py_DECREF(output); 1564 Py_DECREF(output);
1903 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1565 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1904 return 0; 1566 return 0;
1905 } 1567 }
1906 } 1568 }
1907 size = PyBytes_GET_SIZE(output); 1569 if (PyBytes_Check(output)) {
1908 data = PyBytes_AS_STRING(output); 1570 size = PyBytes_GET_SIZE(output);
1571 data = PyBytes_AS_STRING(output);
1572 }
1573 else {
1574 size = PyByteArray_GET_SIZE(output);
1575 data = PyByteArray_AS_STRING(output);
1576 }
1909 if (size != strlen(data)) { 1577 if (size != strlen(data)) {
1910 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1578 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1911 Py_DECREF(output); 1579 Py_DECREF(output);
1912 return 0; 1580 return 0;
1913 } 1581 }
1914 *(PyObject**)addr = output; 1582 *(PyObject**)addr = output;
1915 return Py_CLEANUP_SUPPORTED; 1583 return Py_CLEANUP_SUPPORTED;
1916 } 1584 }
1917 1585
1918 1586
1919 int
1920 PyUnicode_FSDecoder(PyObject* arg, void* addr)
1921 {
1922 PyObject *output = NULL;
1923 Py_ssize_t size;
1924 void *data;
1925 if (arg == NULL) {
1926 Py_DECREF(*(PyObject**)addr);
1927 return 1;
1928 }
1929 if (PyUnicode_Check(arg)) {
1930 output = arg;
1931 Py_INCREF(output);
1932 }
1933 else {
1934 arg = PyBytes_FromObject(arg);
1935 if (!arg)
1936 return 0;
1937 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1938 PyBytes_GET_SIZE(arg));
1939 Py_DECREF(arg);
1940 if (!output)
1941 return 0;
1942 if (!PyUnicode_Check(output)) {
1943 Py_DECREF(output);
1944 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode") ;
1945 return 0;
1946 }
1947 }
1948 size = PyUnicode_GET_SIZE(output);
1949 data = PyUnicode_AS_UNICODE(output);
1950 if (size != Py_UNICODE_strlen(data)) {
1951 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1952 Py_DECREF(output);
1953 return 0;
1954 }
1955 *(PyObject**)addr = output;
1956 return Py_CLEANUP_SUPPORTED;
1957 }
1958
1959
1960 char* 1587 char*
1961 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1588 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1962 { 1589 {
1963 PyObject *bytes; 1590 PyObject *bytes;
1964 if (!PyUnicode_Check(unicode)) { 1591 if (!PyUnicode_Check(unicode)) {
1965 PyErr_BadArgument(); 1592 PyErr_BadArgument();
1966 return NULL; 1593 return NULL;
1967 } 1594 }
1968 bytes = _PyUnicode_AsDefaultEncodedString(unicode); 1595 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1969 if (bytes == NULL) 1596 if (bytes == NULL)
1970 return NULL; 1597 return NULL;
1971 if (psize != NULL) 1598 if (psize != NULL)
1972 *psize = PyBytes_GET_SIZE(bytes); 1599 *psize = PyBytes_GET_SIZE(bytes);
1973 return PyBytes_AS_STRING(bytes); 1600 return PyBytes_AS_STRING(bytes);
1974 } 1601 }
1975 1602
1976 char* 1603 char*
1977 _PyUnicode_AsString(PyObject *unicode) 1604 _PyUnicode_AsString(PyObject *unicode)
1978 { 1605 {
1979 return _PyUnicode_AsStringAndSize(unicode, NULL); 1606 return _PyUnicode_AsStringAndSize(unicode, NULL);
1980 } 1607 }
1981 1608
1982 Py_UNICODE * 1609 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1983 PyUnicode_AsUnicode(PyObject *unicode)
1984 { 1610 {
1985 if (!PyUnicode_Check(unicode)) { 1611 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument(); 1612 PyErr_BadArgument();
1987 goto onError; 1613 goto onError;
1988 } 1614 }
1989 return PyUnicode_AS_UNICODE(unicode); 1615 return PyUnicode_AS_UNICODE(unicode);
1990 1616
1991 onError: 1617 onError:
1992 return NULL; 1618 return NULL;
1993 } 1619 }
1994 1620
1995 Py_ssize_t 1621 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1996 PyUnicode_GetSize(PyObject *unicode)
1997 { 1622 {
1998 if (!PyUnicode_Check(unicode)) { 1623 if (!PyUnicode_Check(unicode)) {
1999 PyErr_BadArgument(); 1624 PyErr_BadArgument();
2000 goto onError; 1625 goto onError;
2001 } 1626 }
2002 return PyUnicode_GET_SIZE(unicode); 1627 return PyUnicode_GET_SIZE(unicode);
2003 1628
2004 onError: 1629 onError:
2005 return -1; 1630 return -1;
2006 } 1631 }
2007 1632
2008 const char * 1633 const char *PyUnicode_GetDefaultEncoding(void)
2009 PyUnicode_GetDefaultEncoding(void) 1634 {
2010 { 1635 return unicode_default_encoding;
2011 return "utf-8"; 1636 }
2012 } 1637
2013 1638 int PyUnicode_SetDefaultEncoding(const char *encoding)
2014 /* create or adjust a UnicodeDecodeError */ 1639 {
2015 static void 1640 if (strcmp(encoding, unicode_default_encoding) != 0) {
2016 make_decode_exception(PyObject **exceptionObject, 1641 PyErr_Format(PyExc_ValueError,
2017 const char *encoding, 1642 "Can only set default encoding to %s",
2018 const char *input, Py_ssize_t length, 1643 unicode_default_encoding);
2019 Py_ssize_t startpos, Py_ssize_t endpos, 1644 return -1;
2020 const char *reason) 1645 }
2021 { 1646 return 0;
2022 if (*exceptionObject == NULL) {
2023 *exceptionObject = PyUnicodeDecodeError_Create(
2024 encoding, input, length, startpos, endpos, reason);
2025 }
2026 else {
2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030 goto onError;
2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032 goto onError;
2033 }
2034 return;
2035
2036 onError:
2037 Py_DECREF(*exceptionObject);
2038 *exceptionObject = NULL;
2039 } 1647 }
2040 1648
2041 /* error handling callback helper: 1649 /* error handling callback helper:
2042 build arguments, call the callback and check the arguments, 1650 build arguments, call the callback and check the arguments,
2043 if no exception occurred, copy the replacement to the output 1651 if no exception occurred, copy the replacement to the output
2044 and adjust various state variables. 1652 and adjust various state variables.
2045 return 0 on success, -1 on error 1653 return 0 on success, -1 on error
2046 */ 1654 */
2047 1655
2048 static int 1656 static
2049 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1657 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler ,
2050 » » » » const char *encoding, const char *reason, 1658 const char *encoding, const char *reason,
2051 » » » » const char **input, const char **inend, Py_ssiz e_t *startinpos, 1659 const char **input, const char **inend, Py_ ssize_t *startinpos,
2052 » » » » Py_ssize_t *endinpos, PyObject **exceptionObjec t, const char **inptr, 1660 Py_ssize_t *endinpos, PyObject **exceptionO bject, const char **inptr,
2053 » » » » PyUnicodeObject **output, Py_ssize_t *outpos, P y_UNICODE **outptr) 1661 PyUnicodeObject **output, Py_ssize_t *outpo s, Py_UNICODE **outptr)
2054 { 1662 {
2055 static char *argparse = "O!n;decoding error handler must return (str, int) t uple"; 1663 static char *argparse = "O!n;decoding error handler must return (str, int) t uple";
2056 1664
2057 PyObject *restuple = NULL; 1665 PyObject *restuple = NULL;
2058 PyObject *repunicode = NULL; 1666 PyObject *repunicode = NULL;
2059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1667 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
2060 Py_ssize_t insize; 1668 Py_ssize_t insize;
2061 Py_ssize_t requiredsize; 1669 Py_ssize_t requiredsize;
2062 Py_ssize_t newpos; 1670 Py_ssize_t newpos;
2063 Py_UNICODE *repptr; 1671 Py_UNICODE *repptr;
2064 PyObject *inputobj = NULL; 1672 PyObject *inputobj = NULL;
2065 Py_ssize_t repsize; 1673 Py_ssize_t repsize;
2066 int res = -1; 1674 int res = -1;
2067 1675
2068 if (*errorHandler == NULL) { 1676 if (*errorHandler == NULL) {
2069 *errorHandler = PyCodec_LookupError(errors); 1677 *errorHandler = PyCodec_LookupError(errors);
2070 if (*errorHandler == NULL) 1678 if (*errorHandler == NULL)
2071 goto onError; 1679 goto onError;
2072 } 1680 }
2073 1681
2074 make_decode_exception(exceptionObject, 1682 if (*exceptionObject == NULL) {
2075 encoding, 1683 *exceptionObject = PyUnicodeDecodeError_Create(
2076 *input, *inend - *input, 1684 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
2077 *startinpos, *endinpos, 1685 if (*exceptionObject == NULL)
2078 reason); 1686 goto onError;
2079 if (*exceptionObject == NULL) 1687 }
2080 goto onError; 1688 else {
1689 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1690 goto onError;
1691 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1692 goto onError;
1693 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1694 goto onError;
1695 }
2081 1696
2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L); 1697 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L);
2083 if (restuple == NULL) 1698 if (restuple == NULL)
2084 goto onError; 1699 goto onError;
2085 if (!PyTuple_Check(restuple)) { 1700 if (!PyTuple_Check(restuple)) {
2086 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1701 PyErr_SetString(PyExc_TypeError, &argparse[4]);
2087 goto onError; 1702 goto onError;
2088 } 1703 }
2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos)) 1704 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos))
2090 goto onError; 1705 goto onError;
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
2213 * on whether we are encoding whitespace as itself. RFC2152 makes it 1828 * on whether we are encoding whitespace as itself. RFC2152 makes it
2214 * clear that the answers to these questions vary between 1829 * clear that the answers to these questions vary between
2215 * applications, so this code needs to be flexible. */ 1830 * applications, so this code needs to be flexible. */
2216 1831
2217 #define ENCODE_DIRECT(c, directO, directWS) \ 1832 #define ENCODE_DIRECT(c, directO, directWS) \
2218 ((c) < 128 && (c) > 0 && \ 1833 ((c) < 128 && (c) > 0 && \
2219 ((utf7_category[(c)] == 0) || \ 1834 ((utf7_category[(c)] == 0) || \
2220 (directWS && (utf7_category[(c)] == 2)) || \ 1835 (directWS && (utf7_category[(c)] == 2)) || \
2221 (directO && (utf7_category[(c)] == 1)))) 1836 (directO && (utf7_category[(c)] == 1))))
2222 1837
2223 PyObject * 1838 PyObject *PyUnicode_DecodeUTF7(const char *s,
2224 PyUnicode_DecodeUTF7(const char *s, 1839 Py_ssize_t size,
2225 » » Py_ssize_t size, 1840 const char *errors)
2226 » » const char *errors)
2227 { 1841 {
2228 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1842 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2229 } 1843 }
2230 1844
2231 /* The decoder. The only state we preserve is our read position, 1845 /* The decoder. The only state we preserve is our read position,
2232 * i.e. how many characters we have consumed. So if we end in the 1846 * i.e. how many characters we have consumed. So if we end in the
2233 * middle of a shift sequence we have to back off the read position 1847 * middle of a shift sequence we have to back off the read position
2234 * and the output to the beginning of the sequence, otherwise we lose 1848 * and the output to the beginning of the sequence, otherwise we lose
2235 * all the shift state (seen bits, number of bits seen, high 1849 * all the shift state (seen bits, number of bits seen, high
2236 * surrogate). */ 1850 * surrogate). */
2237 1851
2238 PyObject * 1852 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
2239 PyUnicode_DecodeUTF7Stateful(const char *s, 1853 Py_ssize_t size,
2240 » » » Py_ssize_t size, 1854 const char *errors,
2241 » » » const char *errors, 1855 Py_ssize_t *consumed)
2242 » » » Py_ssize_t *consumed)
2243 { 1856 {
2244 const char *starts = s; 1857 const char *starts = s;
2245 Py_ssize_t startinpos; 1858 Py_ssize_t startinpos;
2246 Py_ssize_t endinpos; 1859 Py_ssize_t endinpos;
2247 Py_ssize_t outpos; 1860 Py_ssize_t outpos;
2248 const char *e; 1861 const char *e;
2249 PyUnicodeObject *unicode; 1862 PyUnicodeObject *unicode;
2250 Py_UNICODE *p; 1863 Py_UNICODE *p;
2251 const char *errmsg = ""; 1864 const char *errmsg = "";
2252 int inShift = 0; 1865 int inShift = 0;
(...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after
2419 return (PyObject *)unicode; 2032 return (PyObject *)unicode;
2420 2033
2421 onError: 2034 onError:
2422 Py_XDECREF(errorHandler); 2035 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc); 2036 Py_XDECREF(exc);
2424 Py_DECREF(unicode); 2037 Py_DECREF(unicode);
2425 return NULL; 2038 return NULL;
2426 } 2039 }
2427 2040
2428 2041
2429 PyObject * 2042 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2430 PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2043 Py_ssize_t size,
2431 » » Py_ssize_t size, 2044 int base64SetO,
2432 » » int base64SetO, 2045 int base64WhiteSpace,
2433 » » int base64WhiteSpace, 2046 const char *errors)
2434 » » const char *errors)
2435 { 2047 {
2436 PyObject *v; 2048 PyObject *v;
2437 /* It might be possible to tighten this worst case */ 2049 /* It might be possible to tighten this worst case */
2438 Py_ssize_t allocated = 8 * size; 2050 Py_ssize_t allocated = 8 * size;
2439 int inShift = 0; 2051 int inShift = 0;
2440 Py_ssize_t i = 0; 2052 Py_ssize_t i = 0;
2441 unsigned int base64bits = 0; 2053 unsigned int base64bits = 0;
2442 unsigned long base64buffer = 0; 2054 unsigned long base64buffer = 0;
2443 char * out; 2055 char * out;
2444 char * start; 2056 char * start;
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
2529 #undef ENCODE_DIRECT 2141 #undef ENCODE_DIRECT
2530 2142
2531 /* --- UTF-8 Codec -------------------------------------------------------- */ 2143 /* --- UTF-8 Codec -------------------------------------------------------- */
2532 2144
2533 static 2145 static
2534 char utf8_code_length[256] = { 2146 char utf8_code_length[256] = {
2535 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2147 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
2536 illegal prefix. See RFC 3629 for details */ 2148 illegal prefix. See RFC 3629 for details */
2537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
2538 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2539 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2540 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2541 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2542 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2543 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2544 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
2545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2157 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
2546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2158 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
2549 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2161 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
2550 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2162 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
2551 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2163 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
2552 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2164 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
2553 }; 2165 };
2554 2166
2555 PyObject * 2167 PyObject *PyUnicode_DecodeUTF8(const char *s,
2556 PyUnicode_DecodeUTF8(const char *s, 2168 Py_ssize_t size,
2557 » » Py_ssize_t size, 2169 const char *errors)
2558 » » const char *errors)
2559 { 2170 {
2560 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2561 } 2172 }
2562 2173
2563 /* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2174 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
2564 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2175 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2565 2176
2566 /* Mask to quickly check whether a C 'long' contains a 2177 /* Mask to quickly check whether a C 'long' contains a
2567 non-ASCII, UTF8-encoded char. */ 2178 non-ASCII, UTF8-encoded char. */
2568 #if (SIZEOF_LONG == 8) 2179 #if (SIZEOF_LONG == 8)
2569 # define ASCII_CHAR_MASK 0x8080808080808080L 2180 # define ASCII_CHAR_MASK 0x8080808080808080L
2570 #elif (SIZEOF_LONG == 4) 2181 #elif (SIZEOF_LONG == 4)
2571 # define ASCII_CHAR_MASK 0x80808080L 2182 # define ASCII_CHAR_MASK 0x80808080L
2572 #else 2183 #else
2573 # error C 'long' size should be either 4 or 8! 2184 # error C 'long' size should be either 4 or 8!
2574 #endif 2185 #endif
2575 2186
2576 PyObject * 2187 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2577 PyUnicode_DecodeUTF8Stateful(const char *s, 2188 Py_ssize_t size,
2578 » » » Py_ssize_t size, 2189 const char *errors,
2579 » » » const char *errors, 2190 Py_ssize_t *consumed)
2580 » » » Py_ssize_t *consumed)
2581 { 2191 {
2582 const char *starts = s; 2192 const char *starts = s;
2583 int n; 2193 int n;
2584 int k; 2194 int k;
2585 Py_ssize_t startinpos; 2195 Py_ssize_t startinpos;
2586 Py_ssize_t endinpos; 2196 Py_ssize_t endinpos;
2587 Py_ssize_t outpos; 2197 Py_ssize_t outpos;
2588 const char *e, *aligned_end; 2198 const char *e, *aligned_end;
2589 PyUnicodeObject *unicode; 2199 PyUnicodeObject *unicode;
2590 Py_UNICODE *p; 2200 Py_UNICODE *p;
(...skipping 200 matching lines...) Expand 10 before | Expand all | Expand 10 after
2791 2401
2792 onError: 2402 onError:
2793 Py_XDECREF(errorHandler); 2403 Py_XDECREF(errorHandler);
2794 Py_XDECREF(exc); 2404 Py_XDECREF(exc);
2795 Py_DECREF(unicode); 2405 Py_DECREF(unicode);
2796 return NULL; 2406 return NULL;
2797 } 2407 }
2798 2408
2799 #undef ASCII_CHAR_MASK 2409 #undef ASCII_CHAR_MASK
2800 2410
2801 #ifdef __APPLE__
2802
2803 /* Simplified UTF-8 decoder using surrogateescape error handler,
2804 used to decode the command line arguments on Mac OS X. */
2805
2806 wchar_t*
2807 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2808 {
2809 int n;
2810 const char *e;
2811 wchar_t *unicode, *p;
2812
2813 /* Note: size will always be longer than the resulting Unicode
2814 character count */
2815 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2816 PyErr_NoMemory();
2817 return NULL;
2818 }
2819 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2820 if (!unicode)
2821 return NULL;
2822
2823 /* Unpack UTF-8 encoded data */
2824 p = unicode;
2825 e = s + size;
2826 while (s < e) {
2827 Py_UCS4 ch = (unsigned char)*s;
2828
2829 if (ch < 0x80) {
2830 *p++ = (wchar_t)ch;
2831 s++;
2832 continue;
2833 }
2834
2835 n = utf8_code_length[ch];
2836 if (s + n > e) {
2837 goto surrogateescape;
2838 }
2839
2840 switch (n) {
2841 case 0:
2842 case 1:
2843 goto surrogateescape;
2844
2845 case 2:
2846 if ((s[1] & 0xc0) != 0x80)
2847 goto surrogateescape;
2848 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2849 assert ((ch > 0x007F) && (ch <= 0x07FF));
2850 *p++ = (wchar_t)ch;
2851 break;
2852
2853 case 3:
2854 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2855 will result in surrogates in range d800-dfff. Surrogates are
2856 not valid UTF-8 so they are rejected.
2857 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2858 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2859 if ((s[1] & 0xc0) != 0x80 ||
2860 (s[2] & 0xc0) != 0x80 ||
2861 ((unsigned char)s[0] == 0xE0 &&
2862 (unsigned char)s[1] < 0xA0) ||
2863 ((unsigned char)s[0] == 0xED &&
2864 (unsigned char)s[1] > 0x9F)) {
2865
2866 goto surrogateescape;
2867 }
2868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2869 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2870 *p++ = (Py_UNICODE)ch;
2871 break;
2872
2873 case 4:
2874 if ((s[1] & 0xc0) != 0x80 ||
2875 (s[2] & 0xc0) != 0x80 ||
2876 (s[3] & 0xc0) != 0x80 ||
2877 ((unsigned char)s[0] == 0xF0 &&
2878 (unsigned char)s[1] < 0x90) ||
2879 ((unsigned char)s[0] == 0xF4 &&
2880 (unsigned char)s[1] > 0x8F)) {
2881 goto surrogateescape;
2882 }
2883 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2884 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2885 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2886
2887 #if SIZEOF_WCHAR_T == 4
2888 *p++ = (wchar_t)ch;
2889 #else
2890 /* compute and append the two surrogates: */
2891
2892 /* translate from 10000..10FFFF to 0..FFFF */
2893 ch -= 0x10000;
2894
2895 /* high surrogate = top 10 bits added to D800 */
2896 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2897
2898 /* low surrogate = bottom 10 bits added to DC00 */
2899 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2900 #endif
2901 break;
2902 }
2903 s += n;
2904 continue;
2905
2906 surrogateescape:
2907 *p++ = 0xDC00 + ch;
2908 s++;
2909 }
2910 *p = L'\0';
2911 return unicode;
2912 }
2913
2914 #endif /* __APPLE__ */
2915 2411
2916 /* Allocation strategy: if the string is short, convert into a stack buffer 2412 /* Allocation strategy: if the string is short, convert into a stack buffer
2917 and allocate exactly as much space needed at the end. Else allocate the 2413 and allocate exactly as much space needed at the end. Else allocate the
2918 maximum possible needed (4 result bytes per Unicode character), and return 2414 maximum possible needed (4 result bytes per Unicode character), and return
2919 the excess memory at the end. 2415 the excess memory at the end.
2920 */ 2416 */
2921 PyObject * 2417 PyObject *
2922 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2418 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2923 Py_ssize_t size, 2419 Py_ssize_t size,
2924 const char *errors) 2420 const char *errors)
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
3076 return result; 2572 return result;
3077 error: 2573 error:
3078 Py_XDECREF(errorHandler); 2574 Py_XDECREF(errorHandler);
3079 Py_XDECREF(exc); 2575 Py_XDECREF(exc);
3080 Py_XDECREF(result); 2576 Py_XDECREF(result);
3081 return NULL; 2577 return NULL;
3082 2578
3083 #undef MAX_SHORT_UNICHARS 2579 #undef MAX_SHORT_UNICHARS
3084 } 2580 }
3085 2581
3086 PyObject * 2582 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3087 PyUnicode_AsUTF8String(PyObject *unicode) 2583 {
3088 {
3089 PyObject *utf8;
3090 if (!PyUnicode_Check(unicode)) { 2584 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument(); 2585 PyErr_BadArgument();
3092 return NULL; 2586 return NULL;
3093 } 2587 }
3094 utf8 = _PyUnicode_AsDefaultEncodedString(unicode); 2588 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
3095 if (utf8 == NULL) 2589 PyUnicode_GET_SIZE(unicode),
3096 return NULL; 2590 NULL);
3097 Py_INCREF(utf8);
3098 return utf8;
3099 } 2591 }
3100 2592
3101 /* --- UTF-32 Codec ------------------------------------------------------- */ 2593 /* --- UTF-32 Codec ------------------------------------------------------- */
3102 2594
3103 PyObject * 2595 PyObject *
3104 PyUnicode_DecodeUTF32(const char *s, 2596 PyUnicode_DecodeUTF32(const char *s,
3105 Py_ssize_t size, 2597 Py_ssize_t size,
3106 const char *errors, 2598 const char *errors,
3107 int *byteorder) 2599 int *byteorder)
3108 { 2600 {
(...skipping 23 matching lines...) Expand all
3132 int bo = 0; /* assume native ordering by default */ 2624 int bo = 0; /* assume native ordering by default */
3133 const char *errmsg = ""; 2625 const char *errmsg = "";
3134 /* Offsets from q for retrieving bytes in the right order. */ 2626 /* Offsets from q for retrieving bytes in the right order. */
3135 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2627 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136 int iorder[] = {0, 1, 2, 3}; 2628 int iorder[] = {0, 1, 2, 3};
3137 #else 2629 #else
3138 int iorder[] = {3, 2, 1, 0}; 2630 int iorder[] = {3, 2, 1, 0};
3139 #endif 2631 #endif
3140 PyObject *errorHandler = NULL; 2632 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL; 2633 PyObject *exc = NULL;
3142 2634
3143 q = (unsigned char *)s; 2635 q = (unsigned char *)s;
3144 e = q + size; 2636 e = q + size;
3145 2637
3146 if (byteorder) 2638 if (byteorder)
3147 bo = *byteorder; 2639 bo = *byteorder;
3148 2640
3149 /* Check for BOM marks (U+FEFF) in the input and adjust current 2641 /* Check for BOM marks (U+FEFF) in the input and adjust current
3150 byte order setting accordingly. In native mode, the leading BOM 2642 byte order setting accordingly. In native mode, the leading BOM
3151 mark is skipped, in all other modes, it is copied to the output 2643 mark is skipped, in all other modes, it is copied to the output
3152 stream as-is (giving a ZWNBSP character). */ 2644 stream as-is (giving a ZWNBSP character). */
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after
3354 } 2846 }
3355 #endif 2847 #endif
3356 STORECHAR(ch); 2848 STORECHAR(ch);
3357 } 2849 }
3358 2850
3359 done: 2851 done:
3360 return v; 2852 return v;
3361 #undef STORECHAR 2853 #undef STORECHAR
3362 } 2854 }
3363 2855
3364 PyObject * 2856 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3365 PyUnicode_AsUTF32String(PyObject *unicode)
3366 { 2857 {
3367 if (!PyUnicode_Check(unicode)) { 2858 if (!PyUnicode_Check(unicode)) {
3368 PyErr_BadArgument(); 2859 PyErr_BadArgument();
3369 return NULL; 2860 return NULL;
3370 } 2861 }
3371 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2862 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3372 PyUnicode_GET_SIZE(unicode), 2863 PyUnicode_GET_SIZE(unicode),
3373 NULL, 2864 NULL,
3374 0); 2865 0);
3375 } 2866 }
(...skipping 369 matching lines...) Expand 10 before | Expand all | Expand 10 after
3745 STORECHAR(ch); 3236 STORECHAR(ch);
3746 if (ch2) 3237 if (ch2)
3747 STORECHAR(ch2); 3238 STORECHAR(ch2);
3748 } 3239 }
3749 3240
3750 done: 3241 done:
3751 return v; 3242 return v;
3752 #undef STORECHAR 3243 #undef STORECHAR
3753 } 3244 }
3754 3245
3755 PyObject * 3246 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3756 PyUnicode_AsUTF16String(PyObject *unicode)
3757 { 3247 {
3758 if (!PyUnicode_Check(unicode)) { 3248 if (!PyUnicode_Check(unicode)) {
3759 PyErr_BadArgument(); 3249 PyErr_BadArgument();
3760 return NULL; 3250 return NULL;
3761 } 3251 }
3762 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3252 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3763 PyUnicode_GET_SIZE(unicode), 3253 PyUnicode_GET_SIZE(unicode),
3764 NULL, 3254 NULL,
3765 0); 3255 0);
3766 } 3256 }
3767 3257
3768 /* --- Unicode Escape Codec ----------------------------------------------- */ 3258 /* --- Unicode Escape Codec ----------------------------------------------- */
3769 3259
3770 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3260 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3771 3261
3772 PyObject * 3262 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3773 PyUnicode_DecodeUnicodeEscape(const char *s, 3263 Py_ssize_t size,
3774 » » » Py_ssize_t size, 3264 const char *errors)
3775 » » » const char *errors)
3776 { 3265 {
3777 const char *starts = s; 3266 const char *starts = s;
3778 Py_ssize_t startinpos; 3267 Py_ssize_t startinpos;
3779 Py_ssize_t endinpos; 3268 Py_ssize_t endinpos;
3780 Py_ssize_t outpos; 3269 Py_ssize_t outpos;
3781 int i; 3270 int i;
3782 PyUnicodeObject *v; 3271 PyUnicodeObject *v;
3783 Py_UNICODE *p; 3272 Py_UNICODE *p;
3784 const char *end; 3273 const char *end;
3785 char* message; 3274 char* message;
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
3870 if (unicode_decode_call_errorhandler( 3359 if (unicode_decode_call_errorhandler(
3871 errors, &errorHandler, 3360 errors, &errorHandler,
3872 "unicodeescape", "end of string in escape sequence", 3361 "unicodeescape", "end of string in escape sequence",
3873 &starts, &end, &startinpos, &endinpos, &exc, &s, 3362 &starts, &end, &startinpos, &endinpos, &exc, &s,
3874 &v, &outpos, &p)) 3363 &v, &outpos, &p))
3875 goto onError; 3364 goto onError;
3876 goto nextByte; 3365 goto nextByte;
3877 } 3366 }
3878 for (i = 0; i < digits; ++i) { 3367 for (i = 0; i < digits; ++i) {
3879 c = (unsigned char) s[i]; 3368 c = (unsigned char) s[i];
3880 if (!Py_ISXDIGIT(c)) { 3369 if (!ISXDIGIT(c)) {
3881 endinpos = (s+i+1)-starts; 3370 endinpos = (s+i+1)-starts;
3882 if (unicode_decode_call_errorhandler( 3371 if (unicode_decode_call_errorhandler(
3883 errors, &errorHandler, 3372 errors, &errorHandler,
3884 "unicodeescape", message, 3373 "unicodeescape", message,
3885 &starts, &end, &startinpos, &endinpos, &exc, &s, 3374 &starts, &end, &startinpos, &endinpos, &exc, &s,
3886 &v, &outpos, &p)) 3375 &v, &outpos, &p))
3887 goto onError; 3376 goto onError;
3888 goto nextByte; 3377 goto nextByte;
3889 } 3378 }
3890 chr = (chr<<4) & ~0xF; 3379 chr = (chr<<4) & ~0xF;
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after
4021 if (*s == ch) 3510 if (*s == ch)
4022 return s; 3511 return s;
4023 s++; 3512 s++;
4024 } 3513 }
4025 3514
4026 return NULL; 3515 return NULL;
4027 } 3516 }
4028 3517
4029 static const char *hexdigits = "0123456789abcdef"; 3518 static const char *hexdigits = "0123456789abcdef";
4030 3519
4031 PyObject * 3520 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4032 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3521 Py_ssize_t size)
4033 » » » Py_ssize_t size)
4034 { 3522 {
4035 PyObject *repr; 3523 PyObject *repr;
4036 char *p; 3524 char *p;
4037 3525
4038 #ifdef Py_UNICODE_WIDE 3526 #ifdef Py_UNICODE_WIDE
4039 const Py_ssize_t expandsize = 10; 3527 const Py_ssize_t expandsize = 10;
4040 #else 3528 #else
4041 const Py_ssize_t expandsize = 6; 3529 const Py_ssize_t expandsize = 6;
4042 #endif 3530 #endif
4043 3531
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after
4163 else 3651 else
4164 *p++ = (char) ch; 3652 *p++ = (char) ch;
4165 } 3653 }
4166 3654
4167 assert(p - PyBytes_AS_STRING(repr) > 0); 3655 assert(p - PyBytes_AS_STRING(repr) > 0);
4168 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 3656 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4169 return NULL; 3657 return NULL;
4170 return repr; 3658 return repr;
4171 } 3659 }
4172 3660
4173 PyObject * 3661 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4174 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4175 { 3662 {
4176 PyObject *s; 3663 PyObject *s;
4177 if (!PyUnicode_Check(unicode)) { 3664 if (!PyUnicode_Check(unicode)) {
4178 PyErr_BadArgument(); 3665 PyErr_BadArgument();
4179 return NULL; 3666 return NULL;
4180 } 3667 }
4181 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4182 PyUnicode_GET_SIZE(unicode)); 3669 PyUnicode_GET_SIZE(unicode));
4183 return s; 3670 return s;
4184 } 3671 }
4185 3672
4186 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 3673 /* --- Raw Unicode Escape Codec ------------------------------------------- */
4187 3674
4188 PyObject * 3675 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
4189 PyUnicode_DecodeRawUnicodeEscape(const char *s, 3676 Py_ssize_t size,
4190 » » » » Py_ssize_t size, 3677 const char *errors)
4191 » » » » const char *errors)
4192 { 3678 {
4193 const char *starts = s; 3679 const char *starts = s;
4194 Py_ssize_t startinpos; 3680 Py_ssize_t startinpos;
4195 Py_ssize_t endinpos; 3681 Py_ssize_t endinpos;
4196 Py_ssize_t outpos; 3682 Py_ssize_t outpos;
4197 PyUnicodeObject *v; 3683 PyUnicodeObject *v;
4198 Py_UNICODE *p; 3684 Py_UNICODE *p;
4199 const char *end; 3685 const char *end;
4200 const char *bs; 3686 const char *bs;
4201 PyObject *errorHandler = NULL; 3687 PyObject *errorHandler = NULL;
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
4239 continue; 3725 continue;
4240 } 3726 }
4241 p--; 3727 p--;
4242 count = *s=='u' ? 4 : 8; 3728 count = *s=='u' ? 4 : 8;
4243 s++; 3729 s++;
4244 3730
4245 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3731 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4246 outpos = p-PyUnicode_AS_UNICODE(v); 3732 outpos = p-PyUnicode_AS_UNICODE(v);
4247 for (x = 0, i = 0; i < count; ++i, ++s) { 3733 for (x = 0, i = 0; i < count; ++i, ++s) {
4248 c = (unsigned char)*s; 3734 c = (unsigned char)*s;
4249 if (!Py_ISXDIGIT(c)) { 3735 if (!ISXDIGIT(c)) {
4250 endinpos = s-starts; 3736 endinpos = s-starts;
4251 if (unicode_decode_call_errorhandler( 3737 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler, 3738 errors, &errorHandler,
4253 "rawunicodeescape", "truncated \\uXXXX", 3739 "rawunicodeescape", "truncated \\uXXXX",
4254 &starts, &end, &startinpos, &endinpos, &exc, &s, 3740 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p)) 3741 &v, &outpos, &p))
4256 goto onError; 3742 goto onError;
4257 goto nextByte; 3743 goto nextByte;
4258 } 3744 }
4259 x = (x<<4) & ~0xF; 3745 x = (x<<4) & ~0xF;
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
4296 Py_XDECREF(exc); 3782 Py_XDECREF(exc);
4297 return (PyObject *)v; 3783 return (PyObject *)v;
4298 3784
4299 onError: 3785 onError:
4300 Py_XDECREF(v); 3786 Py_XDECREF(v);
4301 Py_XDECREF(errorHandler); 3787 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc); 3788 Py_XDECREF(exc);
4303 return NULL; 3789 return NULL;
4304 } 3790 }
4305 3791
4306 PyObject * 3792 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4307 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3793 Py_ssize_t size)
4308 » » » » Py_ssize_t size)
4309 { 3794 {
4310 PyObject *repr; 3795 PyObject *repr;
4311 char *p; 3796 char *p;
4312 char *q; 3797 char *q;
4313 3798
4314 #ifdef Py_UNICODE_WIDE 3799 #ifdef Py_UNICODE_WIDE
4315 const Py_ssize_t expandsize = 10; 3800 const Py_ssize_t expandsize = 10;
4316 #else 3801 #else
4317 const Py_ssize_t expandsize = 6; 3802 const Py_ssize_t expandsize = 6;
4318 #endif 3803 #endif
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
4385 *p++ = (char) ch; 3870 *p++ = (char) ch;
4386 } 3871 }
4387 size = p - q; 3872 size = p - q;
4388 3873
4389 assert(size > 0); 3874 assert(size > 0);
4390 if (_PyBytes_Resize(&repr, size) < 0) 3875 if (_PyBytes_Resize(&repr, size) < 0)
4391 return NULL; 3876 return NULL;
4392 return repr; 3877 return repr;
4393 } 3878 }
4394 3879
4395 PyObject * 3880 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4396 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4397 { 3881 {
4398 PyObject *s; 3882 PyObject *s;
4399 if (!PyUnicode_Check(unicode)) { 3883 if (!PyUnicode_Check(unicode)) {
4400 PyErr_BadArgument(); 3884 PyErr_BadArgument();
4401 return NULL; 3885 return NULL;
4402 } 3886 }
4403 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3887 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4404 PyUnicode_GET_SIZE(unicode)); 3888 PyUnicode_GET_SIZE(unicode));
4405 3889
4406 return s; 3890 return s;
4407 } 3891 }
4408 3892
4409 /* --- Unicode Internal Codec ------------------------------------------- */ 3893 /* --- Unicode Internal Codec ------------------------------------------- */
4410 3894
4411 PyObject * 3895 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4412 _PyUnicode_DecodeUnicodeInternal(const char *s, 3896 Py_ssize_t size,
4413 » » » » Py_ssize_t size, 3897 const char *errors)
4414 » » » » const char *errors)
4415 { 3898 {
4416 const char *starts = s; 3899 const char *starts = s;
4417 Py_ssize_t startinpos; 3900 Py_ssize_t startinpos;
4418 Py_ssize_t endinpos; 3901 Py_ssize_t endinpos;
4419 Py_ssize_t outpos; 3902 Py_ssize_t outpos;
4420 PyUnicodeObject *v; 3903 PyUnicodeObject *v;
4421 Py_UNICODE *p; 3904 Py_UNICODE *p;
4422 const char *end; 3905 const char *end;
4423 const char *reason; 3906 const char *reason;
4424 PyObject *errorHandler = NULL; 3907 PyObject *errorHandler = NULL;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
4480 3963
4481 onError: 3964 onError:
4482 Py_XDECREF(v); 3965 Py_XDECREF(v);
4483 Py_XDECREF(errorHandler); 3966 Py_XDECREF(errorHandler);
4484 Py_XDECREF(exc); 3967 Py_XDECREF(exc);
4485 return NULL; 3968 return NULL;
4486 } 3969 }
4487 3970
4488 /* --- Latin-1 Codec ------------------------------------------------------ */ 3971 /* --- Latin-1 Codec ------------------------------------------------------ */
4489 3972
4490 PyObject * 3973 PyObject *PyUnicode_DecodeLatin1(const char *s,
4491 PyUnicode_DecodeLatin1(const char *s, 3974 Py_ssize_t size,
4492 » » Py_ssize_t size, 3975 const char *errors)
4493 » » const char *errors)
4494 { 3976 {
4495 PyUnicodeObject *v; 3977 PyUnicodeObject *v;
4496 Py_UNICODE *p; 3978 Py_UNICODE *p;
4497 const char *e, *unrolled_end; 3979 const char *e, *unrolled_end;
4498 3980
4499 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4500 if (size == 1) { 3982 if (size == 1) {
4501 Py_UNICODE r = *(unsigned char*)s; 3983 Py_UNICODE r = *(unsigned char*)s;
4502 return PyUnicode_FromUnicode(&r, 1); 3984 return PyUnicode_FromUnicode(&r, 1);
4503 } 3985 }
(...skipping 19 matching lines...) Expand all
4523 while (s < e) 4005 while (s < e)
4524 *p++ = (unsigned char) *s++; 4006 *p++ = (unsigned char) *s++;
4525 return (PyObject *)v; 4007 return (PyObject *)v;
4526 4008
4527 onError: 4009 onError:
4528 Py_XDECREF(v); 4010 Py_XDECREF(v);
4529 return NULL; 4011 return NULL;
4530 } 4012 }
4531 4013
4532 /* create or adjust a UnicodeEncodeError */ 4014 /* create or adjust a UnicodeEncodeError */
4533 static void 4015 static void make_encode_exception(PyObject **exceptionObject,
4534 make_encode_exception(PyObject **exceptionObject, 4016 const char *encoding,
4535 » » const char *encoding, 4017 const Py_UNICODE *unicode, Py_ssize_t size,
4536 » » const Py_UNICODE *unicode, Py_ssize_t size, 4018 Py_ssize_t startpos, Py_ssize_t endpos,
4537 » » Py_ssize_t startpos, Py_ssize_t endpos, 4019 const char *reason)
4538 » » const char *reason)
4539 { 4020 {
4540 if (*exceptionObject == NULL) { 4021 if (*exceptionObject == NULL) {
4541 *exceptionObject = PyUnicodeEncodeError_Create( 4022 *exceptionObject = PyUnicodeEncodeError_Create(
4542 encoding, unicode, size, startpos, endpos, reason); 4023 encoding, unicode, size, startpos, endpos, reason);
4543 } 4024 }
4544 else { 4025 else {
4545 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4026 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4546 goto onError; 4027 goto onError;
4547 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4028 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4548 goto onError; 4029 goto onError;
4549 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4030 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4550 goto onError; 4031 goto onError;
4551 return; 4032 return;
4552 onError: 4033 onError:
4553 Py_DECREF(*exceptionObject); 4034 Py_DECREF(*exceptionObject);
4554 *exceptionObject = NULL; 4035 *exceptionObject = NULL;
4555 } 4036 }
4556 } 4037 }
4557 4038
4558 /* raises a UnicodeEncodeError */ 4039 /* raises a UnicodeEncodeError */
4559 static void 4040 static void raise_encode_exception(PyObject **exceptionObject,
4560 raise_encode_exception(PyObject **exceptionObject, 4041 const char *encoding,
4561 » » const char *encoding, 4042 const Py_UNICODE *unicode, Py_ssize_t size,
4562 » » const Py_UNICODE *unicode, Py_ssize_t size, 4043 Py_ssize_t startpos, Py_ssize_t endpos,
4563 » » Py_ssize_t startpos, Py_ssize_t endpos, 4044 const char *reason)
4564 » » const char *reason)
4565 { 4045 {
4566 make_encode_exception(exceptionObject, 4046 make_encode_exception(exceptionObject,
4567 encoding, unicode, size, startpos, endpos, reason); 4047 encoding, unicode, size, startpos, endpos, reason);
4568 if (*exceptionObject != NULL) 4048 if (*exceptionObject != NULL)
4569 PyCodec_StrictErrors(*exceptionObject); 4049 PyCodec_StrictErrors(*exceptionObject);
4570 } 4050 }
4571 4051
4572 /* error handling callback helper: 4052 /* error handling callback helper:
4573 build arguments, call the callback and check the arguments, 4053 build arguments, call the callback and check the arguments,
4574 put the result into newpos and return the replacement string, which 4054 put the result into newpos and return the replacement string, which
4575 has to be freed by the caller */ 4055 has to be freed by the caller */
4576 static PyObject * 4056 static PyObject *unicode_encode_call_errorhandler(const char *errors,
4577 unicode_encode_call_errorhandler(const char *errors, 4057 PyObject **errorHandler,
4578 » » » » PyObject **errorHandler, 4058 const char *encoding, const char *reason,
4579 » » » » const char *encoding, const char *reason, 4059 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4580 » » » » const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4060 Py_ssize_t startpos, Py_ssize_t endpos,
4581 » » » » Py_ssize_t startpos, Py_ssize_t endpos, 4061 Py_ssize_t *newpos)
4582 » » » » Py_ssize_t *newpos)
4583 { 4062 {
4584 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4063 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4585 4064
4586 PyObject *restuple; 4065 PyObject *restuple;
4587 PyObject *resunicode; 4066 PyObject *resunicode;
4588 4067
4589 if (*errorHandler == NULL) { 4068 if (*errorHandler == NULL) {
4590 *errorHandler = PyCodec_LookupError(errors); 4069 *errorHandler = PyCodec_LookupError(errors);
4591 if (*errorHandler == NULL) 4070 if (*errorHandler == NULL)
4592 return NULL; 4071 return NULL;
(...skipping 28 matching lines...) Expand all
4621 if (*newpos<0 || *newpos>size) { 4100 if (*newpos<0 || *newpos>size) {
4622 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4623 Py_DECREF(restuple); 4102 Py_DECREF(restuple);
4624 return NULL; 4103 return NULL;
4625 } 4104 }
4626 Py_INCREF(resunicode); 4105 Py_INCREF(resunicode);
4627 Py_DECREF(restuple); 4106 Py_DECREF(restuple);
4628 return resunicode; 4107 return resunicode;
4629 } 4108 }
4630 4109
4631 static PyObject * 4110 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4632 unicode_encode_ucs1(const Py_UNICODE *p, 4111 Py_ssize_t size,
4633 » » Py_ssize_t size, 4112 const char *errors,
4634 » » const char *errors, 4113 int limit)
4635 » » int limit)
4636 { 4114 {
4637 /* output object */ 4115 /* output object */
4638 PyObject *res; 4116 PyObject *res;
4639 /* pointers to the beginning and end+1 of input */ 4117 /* pointers to the beginning and end+1 of input */
4640 const Py_UNICODE *startp = p; 4118 const Py_UNICODE *startp = p;
4641 const Py_UNICODE *endp = p + size; 4119 const Py_UNICODE *endp = p + size;
4642 /* pointer to the beginning of the unencodable characters */ 4120 /* pointer to the beginning of the unencodable characters */
4643 /* const Py_UNICODE *badp = NULL; */ 4121 /* const Py_UNICODE *badp = NULL; */
4644 /* pointer into the output */ 4122 /* pointer into the output */
4645 char *str; 4123 char *str;
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
4818 Py_XDECREF(exc); 4296 Py_XDECREF(exc);
4819 return res; 4297 return res;
4820 4298
4821 onError: 4299 onError:
4822 Py_XDECREF(res); 4300 Py_XDECREF(res);
4823 Py_XDECREF(errorHandler); 4301 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc); 4302 Py_XDECREF(exc);
4825 return NULL; 4303 return NULL;
4826 } 4304 }
4827 4305
4828 PyObject * 4306 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4829 PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4307 Py_ssize_t size,
4830 » » Py_ssize_t size, 4308 const char *errors)
4831 » » const char *errors)
4832 { 4309 {
4833 return unicode_encode_ucs1(p, size, errors, 256); 4310 return unicode_encode_ucs1(p, size, errors, 256);
4834 } 4311 }
4835 4312
4836 PyObject * 4313 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4837 PyUnicode_AsLatin1String(PyObject *unicode)
4838 { 4314 {
4839 if (!PyUnicode_Check(unicode)) { 4315 if (!PyUnicode_Check(unicode)) {
4840 PyErr_BadArgument(); 4316 PyErr_BadArgument();
4841 return NULL; 4317 return NULL;
4842 } 4318 }
4843 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4319 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4844 PyUnicode_GET_SIZE(unicode), 4320 PyUnicode_GET_SIZE(unicode),
4845 NULL); 4321 NULL);
4846 } 4322 }
4847 4323
4848 /* --- 7-bit ASCII Codec -------------------------------------------------- */ 4324 /* --- 7-bit ASCII Codec -------------------------------------------------- */
4849 4325
4850 PyObject * 4326 PyObject *PyUnicode_DecodeASCII(const char *s,
4851 PyUnicode_DecodeASCII(const char *s, 4327 Py_ssize_t size,
4852 Py_ssize_t size, 4328 const char *errors)
4853 const char *errors)
4854 { 4329 {
4855 const char *starts = s; 4330 const char *starts = s;
4856 PyUnicodeObject *v; 4331 PyUnicodeObject *v;
4857 Py_UNICODE *p; 4332 Py_UNICODE *p;
4858 Py_ssize_t startinpos; 4333 Py_ssize_t startinpos;
4859 Py_ssize_t endinpos; 4334 Py_ssize_t endinpos;
4860 Py_ssize_t outpos; 4335 Py_ssize_t outpos;
4861 const char *e; 4336 const char *e;
4862 PyObject *errorHandler = NULL; 4337 PyObject *errorHandler = NULL;
4863 PyObject *exc = NULL; 4338 PyObject *exc = NULL;
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
4900 Py_XDECREF(exc); 4375 Py_XDECREF(exc);
4901 return (PyObject *)v; 4376 return (PyObject *)v;
4902 4377
4903 onError: 4378 onError:
4904 Py_XDECREF(v); 4379 Py_XDECREF(v);
4905 Py_XDECREF(errorHandler); 4380 Py_XDECREF(errorHandler);
4906 Py_XDECREF(exc); 4381 Py_XDECREF(exc);
4907 return NULL; 4382 return NULL;
4908 } 4383 }
4909 4384
4910 PyObject * 4385 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4911 PyUnicode_EncodeASCII(const Py_UNICODE *p, 4386 Py_ssize_t size,
4912 Py_ssize_t size, 4387 const char *errors)
4913 const char *errors)
4914 { 4388 {
4915 return unicode_encode_ucs1(p, size, errors, 128); 4389 return unicode_encode_ucs1(p, size, errors, 128);
4916 } 4390 }
4917 4391
4918 PyObject * 4392 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4919 PyUnicode_AsASCIIString(PyObject *unicode)
4920 { 4393 {
4921 if (!PyUnicode_Check(unicode)) { 4394 if (!PyUnicode_Check(unicode)) {
4922 PyErr_BadArgument(); 4395 PyErr_BadArgument();
4923 return NULL; 4396 return NULL;
4924 } 4397 }
4925 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4398 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4926 PyUnicode_GET_SIZE(unicode), 4399 PyUnicode_GET_SIZE(unicode),
4927 NULL); 4400 NULL);
4928 } 4401 }
4929 4402
4930 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4403 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4931 4404
4932 /* --- MBCS codecs for Windows -------------------------------------------- */ 4405 /* --- MBCS codecs for Windows -------------------------------------------- */
4933 4406
4934 #if SIZEOF_INT < SIZEOF_SIZE_T 4407 #if SIZEOF_INT < SIZEOF_SIZE_T
4935 #define NEED_RETRY 4408 #define NEED_RETRY
4936 #endif 4409 #endif
4937 4410
4938 /* XXX This code is limited to "true" double-byte encodings, as 4411 /* XXX This code is limited to "true" double-byte encodings, as
4939 a) it assumes an incomplete character consists of a single byte, and 4412 a) it assumes an incomplete character consists of a single byte, and
4940 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 4413 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4941 encodings, see IsDBCSLeadByteEx documentation. */ 4414 encodings, see IsDBCSLeadByteEx documentation. */
4942 4415
4943 static int 4416 static int is_dbcs_lead_byte(const char *s, int offset)
4944 is_dbcs_lead_byte(const char *s, int offset)
4945 { 4417 {
4946 const char *curr = s + offset; 4418 const char *curr = s + offset;
4947 4419
4948 if (IsDBCSLeadByte(*curr)) { 4420 if (IsDBCSLeadByte(*curr)) {
4949 const char *prev = CharPrev(s, curr); 4421 const char *prev = CharPrev(s, curr);
4950 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4422 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
4951 } 4423 }
4952 return 0; 4424 return 0;
4953 } 4425 }
4954 4426
4955 /* 4427 /*
4956 * Decode MBCS string into unicode object. If 'final' is set, converts 4428 * Decode MBCS string into unicode object. If 'final' is set, converts
4957 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4429 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4958 */ 4430 */
4959 static int 4431 static int decode_mbcs(PyUnicodeObject **v,
4960 decode_mbcs(PyUnicodeObject **v, 4432 const char *s, /* MBCS string */
4961 const char *s, /* MBCS string */ 4433 int size, /* sizeof MBCS string */
4962 int size, /* sizeof MBCS string */ 4434 int final)
4963 int final,
4964 const char *errors)
4965 { 4435 {
4966 Py_UNICODE *p; 4436 Py_UNICODE *p;
4967 Py_ssize_t n; 4437 Py_ssize_t n = 0;
4968 DWORD usize; 4438 int usize = 0;
4969 DWORD flags;
4970 4439
4971 assert(size >= 0); 4440 assert(size >= 0);
4972
4973 /* check and handle 'errors' arg */
4974 if (errors==NULL || strcmp(errors, "strict")==0)
4975 flags = MB_ERR_INVALID_CHARS;
4976 else if (strcmp(errors, "ignore")==0)
4977 flags = 0;
4978 else {
4979 PyErr_Format(PyExc_ValueError,
4980 "mbcs encoding does not support errors='%s'",
4981 errors);
4982 return -1;
4983 }
4984 4441
4985 /* Skip trailing lead-byte unless 'final' is set */ 4442 /* Skip trailing lead-byte unless 'final' is set */
4986 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4443 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4987 --size; 4444 --size;
4988 4445
4989 /* First get the size of the result */ 4446 /* First get the size of the result */
4990 if (size > 0) { 4447 if (size > 0) {
4991 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4448 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4992 if (usize==0) 4449 if (usize == 0) {
4993 goto mbcs_decode_error; 4450 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4994 } else 4451 return -1;
4995 usize = 0; 4452 }
4453 }
4996 4454
4997 if (*v == NULL) { 4455 if (*v == NULL) {
4998 /* Create unicode object */ 4456 /* Create unicode object */
4999 *v = _PyUnicode_New(usize); 4457 *v = _PyUnicode_New(usize);
5000 if (*v == NULL) 4458 if (*v == NULL)
5001 return -1; 4459 return -1;
5002 n = 0;
5003 } 4460 }
5004 else { 4461 else {
5005 /* Extend unicode object */ 4462 /* Extend unicode object */
5006 n = PyUnicode_GET_SIZE(*v); 4463 n = PyUnicode_GET_SIZE(*v);
5007 if (_PyUnicode_Resize(v, n + usize) < 0) 4464 if (_PyUnicode_Resize(v, n + usize) < 0)
5008 return -1; 4465 return -1;
5009 } 4466 }
5010 4467
5011 /* Do the conversion */ 4468 /* Do the conversion */
5012 if (usize > 0) { 4469 if (size > 0) {
5013 p = PyUnicode_AS_UNICODE(*v) + n; 4470 p = PyUnicode_AS_UNICODE(*v) + n;
5014 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 4471 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
5015 goto mbcs_decode_error; 4472 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5016 } 4473 return -1;
5017 } 4474 }
4475 }
4476
5018 return size; 4477 return size;
5019 4478 }
5020 mbcs_decode_error: 4479
5021 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 4480 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
5022 we raise a UnicodeDecodeError - else it is a 'generic' 4481 Py_ssize_t size,
5023 windows error 4482 const char *errors,
5024 */ 4483 Py_ssize_t *consumed)
5025 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5026 /* Ideally, we should get reason from FormatMessage - this
5027 is the Windows 2000 English version of the message
5028 */
5029 PyObject *exc = NULL;
5030 const char *reason = "No mapping for the Unicode character exists "
5031 "in the target multi-byte code page.";
5032 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5033 if (exc != NULL) {
5034 PyCodec_StrictErrors(exc);
5035 Py_DECREF(exc);
5036 }
5037 } else {
5038 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5039 }
5040 return -1;
5041 }
5042
5043 PyObject *
5044 PyUnicode_DecodeMBCSStateful(const char *s,
5045 Py_ssize_t size,
5046 const char *errors,
5047 Py_ssize_t *consumed)
5048 { 4484 {
5049 PyUnicodeObject *v = NULL; 4485 PyUnicodeObject *v = NULL;
5050 int done; 4486 int done;
5051 4487
5052 if (consumed) 4488 if (consumed)
5053 *consumed = 0; 4489 *consumed = 0;
5054 4490
5055 #ifdef NEED_RETRY 4491 #ifdef NEED_RETRY
5056 retry: 4492 retry:
5057 if (size > INT_MAX) 4493 if (size > INT_MAX)
5058 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 4494 done = decode_mbcs(&v, s, INT_MAX, 0);
5059 else 4495 else
5060 #endif 4496 #endif
5061 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 4497 done = decode_mbcs(&v, s, (int)size, !consumed);
5062 4498
5063 if (done < 0) { 4499 if (done < 0) {
5064 Py_XDECREF(v); 4500 Py_XDECREF(v);
5065 return NULL; 4501 return NULL;
5066 } 4502 }
5067 4503
5068 if (consumed) 4504 if (consumed)
5069 *consumed += done; 4505 *consumed += done;
5070 4506
5071 #ifdef NEED_RETRY 4507 #ifdef NEED_RETRY
5072 if (size > INT_MAX) { 4508 if (size > INT_MAX) {
5073 s += done; 4509 s += done;
5074 size -= done; 4510 size -= done;
5075 goto retry; 4511 goto retry;
5076 } 4512 }
5077 #endif 4513 #endif
5078 4514
5079 return (PyObject *)v; 4515 return (PyObject *)v;
5080 } 4516 }
5081 4517
5082 PyObject * 4518 PyObject *PyUnicode_DecodeMBCS(const char *s,
5083 PyUnicode_DecodeMBCS(const char *s, 4519 Py_ssize_t size,
5084 Py_ssize_t size, 4520 const char *errors)
5085 const char *errors)
5086 { 4521 {
5087 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4522 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5088 } 4523 }
5089 4524
5090 /* 4525 /*
5091 * Convert unicode into string object (MBCS). 4526 * Convert unicode into string object (MBCS).
5092 * Returns 0 if succeed, -1 otherwise. 4527 * Returns 0 if succeed, -1 otherwise.
5093 */ 4528 */
5094 static int 4529 static int encode_mbcs(PyObject **repr,
5095 encode_mbcs(PyObject **repr, 4530 const Py_UNICODE *p, /* unicode */
5096 const Py_UNICODE *p, /* unicode */ 4531 int size) /* size of unicode */
5097 int size, /* size of unicode */ 4532 {
5098 const char* errors) 4533 int mbcssize = 0;
5099 { 4534 Py_ssize_t n = 0;
5100 BOOL usedDefaultChar = FALSE;
5101 BOOL *pusedDefaultChar;
5102 int mbcssize;
5103 Py_ssize_t n;
5104 PyObject *exc = NULL;
5105 DWORD flags;
5106 4535
5107 assert(size >= 0); 4536 assert(size >= 0);
5108
5109 /* check and handle 'errors' arg */
5110 if (errors==NULL || strcmp(errors, "strict")==0) {
5111 flags = WC_NO_BEST_FIT_CHARS;
5112 pusedDefaultChar = &usedDefaultChar;
5113 } else if (strcmp(errors, "replace")==0) {
5114 flags = 0;
5115 pusedDefaultChar = NULL;
5116 } else {
5117 PyErr_Format(PyExc_ValueError,
5118 "mbcs encoding does not support errors='%s'",
5119 errors);
5120 return -1;
5121 }
5122 4537
5123 /* First get the size of the result */ 4538 /* First get the size of the result */
5124 if (size > 0) { 4539 if (size > 0) {
5125 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 4540 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
5126 NULL, pusedDefaultChar);
5127 if (mbcssize == 0) { 4541 if (mbcssize == 0) {
5128 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4542 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5129 return -1; 4543 return -1;
5130 } 4544 }
5131 /* If we used a default char, then we failed! */
5132 if (pusedDefaultChar && *pusedDefaultChar)
5133 goto mbcs_encode_error;
5134 } else {
5135 mbcssize = 0;
5136 } 4545 }
5137 4546
5138 if (*repr == NULL) { 4547 if (*repr == NULL) {
5139 /* Create string object */ 4548 /* Create string object */
5140 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4549 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5141 if (*repr == NULL) 4550 if (*repr == NULL)
5142 return -1; 4551 return -1;
5143 n = 0;
5144 } 4552 }
5145 else { 4553 else {
5146 /* Extend string object */ 4554 /* Extend string object */
5147 n = PyBytes_Size(*repr); 4555 n = PyBytes_Size(*repr);
5148 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4556 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5149 return -1; 4557 return -1;
5150 } 4558 }
5151 4559
5152 /* Do the conversion */ 4560 /* Do the conversion */
5153 if (size > 0) { 4561 if (size > 0) {
5154 char *s = PyBytes_AS_STRING(*repr) + n; 4562 char *s = PyBytes_AS_STRING(*repr) + n;
5155 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 4563 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL )) {
5156 NULL, pusedDefaultChar)) {
5157 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4564 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5158 return -1; 4565 return -1;
5159 } 4566 }
5160 if (pusedDefaultChar && *pusedDefaultChar) 4567 }
5161 goto mbcs_encode_error; 4568
5162 }
5163 return 0; 4569 return 0;
5164 4570 }
5165 mbcs_encode_error: 4571
5166 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 4572 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5167 Py_XDECREF(exc); 4573 Py_ssize_t size,
5168 return -1; 4574 const char *errors)
5169 }
5170
5171 PyObject *
5172 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5173 Py_ssize_t size,
5174 const char *errors)
5175 { 4575 {
5176 PyObject *repr = NULL; 4576 PyObject *repr = NULL;
5177 int ret; 4577 int ret;
5178 4578
5179 #ifdef NEED_RETRY 4579 #ifdef NEED_RETRY
5180 retry: 4580 retry:
5181 if (size > INT_MAX) 4581 if (size > INT_MAX)
5182 ret = encode_mbcs(&repr, p, INT_MAX, errors); 4582 ret = encode_mbcs(&repr, p, INT_MAX);
5183 else 4583 else
5184 #endif 4584 #endif
5185 ret = encode_mbcs(&repr, p, (int)size, errors); 4585 ret = encode_mbcs(&repr, p, (int)size);
5186 4586
5187 if (ret < 0) { 4587 if (ret < 0) {
5188 Py_XDECREF(repr); 4588 Py_XDECREF(repr);
5189 return NULL; 4589 return NULL;
5190 } 4590 }
5191 4591
5192 #ifdef NEED_RETRY 4592 #ifdef NEED_RETRY
5193 if (size > INT_MAX) { 4593 if (size > INT_MAX) {
5194 p += INT_MAX; 4594 p += INT_MAX;
5195 size -= INT_MAX; 4595 size -= INT_MAX;
5196 goto retry; 4596 goto retry;
5197 } 4597 }
5198 #endif 4598 #endif
5199 4599
5200 return repr; 4600 return repr;
5201 } 4601 }
5202 4602
5203 PyObject * 4603 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5204 PyUnicode_AsMBCSString(PyObject *unicode)
5205 { 4604 {
5206 if (!PyUnicode_Check(unicode)) { 4605 if (!PyUnicode_Check(unicode)) {
5207 PyErr_BadArgument(); 4606 PyErr_BadArgument();
5208 return NULL; 4607 return NULL;
5209 } 4608 }
5210 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5211 PyUnicode_GET_SIZE(unicode), 4610 PyUnicode_GET_SIZE(unicode),
5212 NULL); 4611 NULL);
5213 } 4612 }
5214 4613
5215 #undef NEED_RETRY 4614 #undef NEED_RETRY
5216 4615
5217 #endif /* MS_WINDOWS */ 4616 #endif /* MS_WINDOWS */
5218 4617
5219 /* --- Character Mapping Codec -------------------------------------------- */ 4618 /* --- Character Mapping Codec -------------------------------------------- */
5220 4619
5221 PyObject * 4620 PyObject *PyUnicode_DecodeCharmap(const char *s,
5222 PyUnicode_DecodeCharmap(const char *s, 4621 Py_ssize_t size,
5223 Py_ssize_t size, 4622 PyObject *mapping,
5224 PyObject *mapping, 4623 const char *errors)
5225 const char *errors)
5226 { 4624 {
5227 const char *starts = s; 4625 const char *starts = s;
5228 Py_ssize_t startinpos; 4626 Py_ssize_t startinpos;
5229 Py_ssize_t endinpos; 4627 Py_ssize_t endinpos;
5230 Py_ssize_t outpos; 4628 Py_ssize_t outpos;
5231 const char *e; 4629 const char *e;
5232 PyUnicodeObject *v; 4630 PyUnicodeObject *v;
5233 Py_UNICODE *p; 4631 Py_UNICODE *p;
5234 Py_ssize_t extrachars = 0; 4632 Py_ssize_t extrachars = 0;
5235 PyObject *errorHandler = NULL; 4633 PyObject *errorHandler = NULL;
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after
5375 4773
5376 onError: 4774 onError:
5377 Py_XDECREF(errorHandler); 4775 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc); 4776 Py_XDECREF(exc);
5379 Py_XDECREF(v); 4777 Py_XDECREF(v);
5380 return NULL; 4778 return NULL;
5381 } 4779 }
5382 4780
5383 /* Charmap encoding: the lookup table */ 4781 /* Charmap encoding: the lookup table */
5384 4782
5385 struct encoding_map { 4783 struct encoding_map{
5386 PyObject_HEAD 4784 PyObject_HEAD
5387 unsigned char level1[32]; 4785 unsigned char level1[32];
5388 int count2, count3; 4786 int count2, count3;
5389 unsigned char level23[1]; 4787 unsigned char level23[1];
5390 }; 4788 };
5391 4789
5392 static PyObject* 4790 static PyObject*
5393 encoding_map_size(PyObject *obj, PyObject* args) 4791 encoding_map_size(PyObject *obj, PyObject* args)
5394 { 4792 {
5395 struct encoding_map *map = (struct encoding_map*)obj; 4793 struct encoding_map *map = (struct encoding_map*)obj;
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after
5502 4900
5503 if (count2 >= 0xFF || count3 >= 0xFF) 4901 if (count2 >= 0xFF || count3 >= 0xFF)
5504 need_dict = 1; 4902 need_dict = 1;
5505 4903
5506 if (need_dict) { 4904 if (need_dict) {
5507 PyObject *result = PyDict_New(); 4905 PyObject *result = PyDict_New();
5508 PyObject *key, *value; 4906 PyObject *key, *value;
5509 if (!result) 4907 if (!result)
5510 return NULL; 4908 return NULL;
5511 for (i = 0; i < 256; i++) { 4909 for (i = 0; i < 256; i++) {
4910 key = value = NULL;
5512 key = PyLong_FromLong(decode[i]); 4911 key = PyLong_FromLong(decode[i]);
5513 value = PyLong_FromLong(i); 4912 value = PyLong_FromLong(i);
5514 if (!key || !value) 4913 if (!key || !value)
5515 goto failed1; 4914 goto failed1;
5516 if (PyDict_SetItem(result, key, value) == -1) 4915 if (PyDict_SetItem(result, key, value) == -1)
5517 goto failed1; 4916 goto failed1;
5518 Py_DECREF(key); 4917 Py_DECREF(key);
5519 Py_DECREF(value); 4918 Py_DECREF(value);
5520 } 4919 }
5521 return result; 4920 return result;
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
5589 i = map->level23[16*map->count2 + 128*i + l3]; 4988 i = map->level23[16*map->count2 + 128*i + l3];
5590 if (i == 0) { 4989 if (i == 0) {
5591 return -1; 4990 return -1;
5592 } 4991 }
5593 return i; 4992 return i;
5594 } 4993 }
5595 4994
5596 /* Lookup the character ch in the mapping. If the character 4995 /* Lookup the character ch in the mapping. If the character
5597 can't be found, Py_None is returned (or NULL, if another 4996 can't be found, Py_None is returned (or NULL, if another
5598 error occurred). */ 4997 error occurred). */
5599 static PyObject * 4998 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5600 charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5601 { 4999 {
5602 PyObject *w = PyLong_FromLong((long)c); 5000 PyObject *w = PyLong_FromLong((long)c);
5603 PyObject *x; 5001 PyObject *x;
5604 5002
5605 if (w == NULL) 5003 if (w == NULL)
5606 return NULL; 5004 return NULL;
5607 x = PyObject_GetItem(mapping, w); 5005 x = PyObject_GetItem(mapping, w);
5608 Py_DECREF(w); 5006 Py_DECREF(w);
5609 if (x == NULL) { 5007 if (x == NULL) {
5610 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5008 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
5647 /* exponentially overallocate to minimize reallocations */ 5045 /* exponentially overallocate to minimize reallocations */
5648 if (requiredsize < 2*outsize) 5046 if (requiredsize < 2*outsize)
5649 requiredsize = 2*outsize; 5047 requiredsize = 2*outsize;
5650 if (_PyBytes_Resize(outobj, requiredsize)) 5048 if (_PyBytes_Resize(outobj, requiredsize))
5651 return -1; 5049 return -1;
5652 return 0; 5050 return 0;
5653 } 5051 }
5654 5052
5655 typedef enum charmapencode_result { 5053 typedef enum charmapencode_result {
5656 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5054 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
5657 } charmapencode_result; 5055 }charmapencode_result;
5658 /* lookup the character, put the result in the output string and adjust 5056 /* lookup the character, put the result in the output string and adjust
5659 various state variables. Resize the output bytes object if not enough 5057 various state variables. Resize the output bytes object if not enough
5660 space is available. Return a new reference to the object that 5058 space is available. Return a new reference to the object that
5661 was put in the output buffer, or Py_None, if the mapping was undefined 5059 was put in the output buffer, or Py_None, if the mapping was undefined
5662 (in which case no character was written) or NULL, if a 5060 (in which case no character was written) or NULL, if a
5663 reallocation error occurred. The caller must decref the result */ 5061 reallocation error occurred. The caller must decref the result */
5664 static charmapencode_result 5062 static
5665 charmapencode_output(Py_UNICODE c, PyObject *mapping, 5063 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5666 PyObject **outobj, Py_ssize_t *outpos) 5064 PyObject **outobj, Py_ssize_t *outpos)
5667 { 5065 {
5668 PyObject *rep; 5066 PyObject *rep;
5669 char *outstart; 5067 char *outstart;
5670 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5671 5069
5672 if (Py_TYPE(mapping) == &EncodingMapType) { 5070 if (Py_TYPE(mapping) == &EncodingMapType) {
5673 int res = encoding_map_lookup(c, mapping); 5071 int res = encoding_map_lookup(c, mapping);
5674 Py_ssize_t requiredsize = *outpos+1; 5072 Py_ssize_t requiredsize = *outpos+1;
5675 if (res == -1) 5073 if (res == -1)
5676 return enc_FAILED; 5074 return enc_FAILED;
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
5712 memcpy(outstart + *outpos, repchars, repsize); 5110 memcpy(outstart + *outpos, repchars, repsize);
5713 *outpos += repsize; 5111 *outpos += repsize;
5714 } 5112 }
5715 } 5113 }
5716 Py_DECREF(rep); 5114 Py_DECREF(rep);
5717 return enc_SUCCESS; 5115 return enc_SUCCESS;
5718 } 5116 }
5719 5117
5720 /* handle an error in PyUnicode_EncodeCharmap 5118 /* handle an error in PyUnicode_EncodeCharmap
5721 Return 0 on success, -1 on error */ 5119 Return 0 on success, -1 on error */
5722 static int 5120 static
5723 charmap_encoding_error( 5121 int charmap_encoding_error(
5724 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5122 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5725 PyObject **exceptionObject, 5123 PyObject **exceptionObject,
5726 int *known_errorHandler, PyObject **errorHandler, const char *errors, 5124 int *known_errorHandler, PyObject **errorHandler, const char *errors,
5727 PyObject **res, Py_ssize_t *respos) 5125 PyObject **res, Py_ssize_t *respos)
5728 { 5126 {
5729 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5127 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5730 Py_ssize_t repsize; 5128 Py_ssize_t repsize;
5731 Py_ssize_t newpos; 5129 Py_ssize_t newpos;
5732 Py_UNICODE *uni2; 5130 Py_UNICODE *uni2;
5733 /* startpos for collecting unencodable chars */ 5131 /* startpos for collecting unencodable chars */
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
5847 raise_encode_exception(exceptionObject, encoding, p, size, colls tartpos, collendpos, reason); 5245 raise_encode_exception(exceptionObject, encoding, p, size, colls tartpos, collendpos, reason);
5848 return -1; 5246 return -1;
5849 } 5247 }
5850 } 5248 }
5851 *inpos = newpos; 5249 *inpos = newpos;
5852 Py_DECREF(repunicode); 5250 Py_DECREF(repunicode);
5853 } 5251 }
5854 return 0; 5252 return 0;
5855 } 5253 }
5856 5254
5857 PyObject * 5255 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5858 PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5256 Py_ssize_t size,
5859 Py_ssize_t size, 5257 PyObject *mapping,
5860 PyObject *mapping, 5258 const char *errors)
5861 const char *errors)
5862 { 5259 {
5863 /* output object */ 5260 /* output object */
5864 PyObject *res = NULL; 5261 PyObject *res = NULL;
5865 /* current input position */ 5262 /* current input position */
5866 Py_ssize_t inpos = 0; 5263 Py_ssize_t inpos = 0;
5867 /* current output position */ 5264 /* current output position */
5868 Py_ssize_t respos = 0; 5265 Py_ssize_t respos = 0;
5869 PyObject *errorHandler = NULL; 5266 PyObject *errorHandler = NULL;
5870 PyObject *exc = NULL; 5267 PyObject *exc = NULL;
5871 /* the following variable is used for caching string comparisons 5268 /* the following variable is used for caching string comparisons
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
5912 Py_XDECREF(errorHandler); 5309 Py_XDECREF(errorHandler);
5913 return res; 5310 return res;
5914 5311
5915 onError: 5312 onError:
5916 Py_XDECREF(res); 5313 Py_XDECREF(res);
5917 Py_XDECREF(exc); 5314 Py_XDECREF(exc);
5918 Py_XDECREF(errorHandler); 5315 Py_XDECREF(errorHandler);
5919 return NULL; 5316 return NULL;
5920 } 5317 }
5921 5318
5922 PyObject * 5319 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5923 PyUnicode_AsCharmapString(PyObject *unicode, 5320 PyObject *mapping)
5924 PyObject *mapping)
5925 { 5321 {
5926 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5322 if (!PyUnicode_Check(unicode) || mapping == NULL) {
5927 PyErr_BadArgument(); 5323 PyErr_BadArgument();
5928 return NULL; 5324 return NULL;
5929 } 5325 }
5930 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5326 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5931 PyUnicode_GET_SIZE(unicode), 5327 PyUnicode_GET_SIZE(unicode),
5932 mapping, 5328 mapping,
5933 NULL); 5329 NULL);
5934 } 5330 }
5935 5331
5936 /* create or adjust a UnicodeTranslateError */ 5332 /* create or adjust a UnicodeTranslateError */
5937 static void 5333 static void make_translate_exception(PyObject **exceptionObject,
5938 make_translate_exception(PyObject **exceptionObject, 5334 const Py_UNICODE *unicode, Py_ssize_t size,
5939 const Py_UNICODE *unicode, Py_ssize_t size, 5335 Py_ssize_t startpos, Py_ssize_t endpos,
5940 Py_ssize_t startpos, Py_ssize_t endpos, 5336 const char *reason)
5941 const char *reason)
5942 { 5337 {
5943 if (*exceptionObject == NULL) { 5338 if (*exceptionObject == NULL) {
5944 *exceptionObject = PyUnicodeTranslateError_Create( 5339 *exceptionObject = PyUnicodeTranslateError_Create(
5945 unicode, size, startpos, endpos, reason); 5340 unicode, size, startpos, endpos, reason);
5946 } 5341 }
5947 else { 5342 else {
5948 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5343 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5949 goto onError; 5344 goto onError;
5950 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5345 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5951 goto onError; 5346 goto onError;
5952 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5347 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5953 goto onError; 5348 goto onError;
5954 return; 5349 return;
5955 onError: 5350 onError:
5956 Py_DECREF(*exceptionObject); 5351 Py_DECREF(*exceptionObject);
5957 *exceptionObject = NULL; 5352 *exceptionObject = NULL;
5958 } 5353 }
5959 } 5354 }
5960 5355
5961 /* raises a UnicodeTranslateError */ 5356 /* raises a UnicodeTranslateError */
5962 static void 5357 static void raise_translate_exception(PyObject **exceptionObject,
5963 raise_translate_exception(PyObject **exceptionObject, 5358 const Py_UNICODE *unicode, Py_ssize_t size ,
5964 const Py_UNICODE *unicode, Py_ssize_t size, 5359 Py_ssize_t startpos, Py_ssize_t endpos,
5965 Py_ssize_t startpos, Py_ssize_t endpos, 5360 const char *reason)
5966 const char *reason)
5967 { 5361 {
5968 make_translate_exception(exceptionObject, 5362 make_translate_exception(exceptionObject,
5969 unicode, size, startpos, endpos, reason); 5363 unicode, size, startpos, endpos, reason);
5970 if (*exceptionObject != NULL) 5364 if (*exceptionObject != NULL)
5971 PyCodec_StrictErrors(*exceptionObject); 5365 PyCodec_StrictErrors(*exceptionObject);
5972 } 5366 }
5973 5367
5974 /* error handling callback helper: 5368 /* error handling callback helper:
5975 build arguments, call the callback and check the arguments, 5369 build arguments, call the callback and check the arguments,
5976 put the result into newpos and return the replacement string, which 5370 put the result into newpos and return the replacement string, which
5977 has to be freed by the caller */ 5371 has to be freed by the caller */
5978 static PyObject * 5372 static PyObject *unicode_translate_call_errorhandler(const char *errors,
5979 unicode_translate_call_errorhandler(const char *errors, 5373 PyObject **errorHandler,
5980 PyObject **errorHandler, 5374 const char *reason,
5981 const char *reason, 5375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5982 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5376 Py_ssize_t startpos, Py_ssi ze_t endpos,
5983 Py_ssize_t startpos, Py_ssize_t endpos, 5377 Py_ssize_t *newpos)
5984 Py_ssize_t *newpos)
5985 { 5378 {
5986 static char *argparse = "O!n;translating error handler must return (str, int ) tuple"; 5379 static char *argparse = "O!n;translating error handler must return (str, int ) tuple";
5987 5380
5988 Py_ssize_t i_newpos; 5381 Py_ssize_t i_newpos;
5989 PyObject *restuple; 5382 PyObject *restuple;
5990 PyObject *resunicode; 5383 PyObject *resunicode;
5991 5384
5992 if (*errorHandler == NULL) { 5385 if (*errorHandler == NULL) {
5993 *errorHandler = PyCodec_LookupError(errors); 5386 *errorHandler = PyCodec_LookupError(errors);
5994 if (*errorHandler == NULL) 5387 if (*errorHandler == NULL)
(...skipping 29 matching lines...) Expand all
6024 return NULL; 5417 return NULL;
6025 } 5418 }
6026 Py_INCREF(resunicode); 5419 Py_INCREF(resunicode);
6027 Py_DECREF(restuple); 5420 Py_DECREF(restuple);
6028 return resunicode; 5421 return resunicode;
6029 } 5422 }
6030 5423
6031 /* Lookup the character ch in the mapping and put the result in result, 5424 /* Lookup the character ch in the mapping and put the result in result,
6032 which must be decrefed by the caller. 5425 which must be decrefed by the caller.
6033 Return 0 on success, -1 on error */ 5426 Return 0 on success, -1 on error */
6034 static int 5427 static
6035 charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5428 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
6036 { 5429 {
6037 PyObject *w = PyLong_FromLong((long)c); 5430 PyObject *w = PyLong_FromLong((long)c);
6038 PyObject *x; 5431 PyObject *x;
6039 5432
6040 if (w == NULL) 5433 if (w == NULL)
6041 return -1; 5434 return -1;
6042 x = PyObject_GetItem(mapping, w); 5435 x = PyObject_GetItem(mapping, w);
6043 Py_DECREF(w); 5436 Py_DECREF(w);
6044 if (x == NULL) { 5437 if (x == NULL) {
6045 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5438 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
(...skipping 28 matching lines...) Expand all
6074 /* wrong return value */ 5467 /* wrong return value */
6075 PyErr_SetString(PyExc_TypeError, 5468 PyErr_SetString(PyExc_TypeError,
6076 "character mapping must return integer, None or str"); 5469 "character mapping must return integer, None or str");
6077 Py_DECREF(x); 5470 Py_DECREF(x);
6078 return -1; 5471 return -1;
6079 } 5472 }
6080 } 5473 }
6081 /* ensure that *outobj is at least requiredsize characters long, 5474 /* ensure that *outobj is at least requiredsize characters long,
6082 if not reallocate and adjust various state variables. 5475 if not reallocate and adjust various state variables.
6083 Return 0 on success, -1 on error */ 5476 Return 0 on success, -1 on error */
6084 static int 5477 static
6085 charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5478 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
6086 Py_ssize_t requiredsize) 5479 Py_ssize_t requiredsize)
6087 { 5480 {
6088 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 5481 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6089 if (requiredsize > oldsize) { 5482 if (requiredsize > oldsize) {
6090 /* remember old output position */ 5483 /* remember old output position */
6091 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 5484 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6092 /* exponentially overallocate to minimize reallocations */ 5485 /* exponentially overallocate to minimize reallocations */
6093 if (requiredsize < 2 * oldsize) 5486 if (requiredsize < 2 * oldsize)
6094 requiredsize = 2 * oldsize; 5487 requiredsize = 2 * oldsize;
6095 if (PyUnicode_Resize(outobj, requiredsize) < 0) 5488 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6096 return -1; 5489 return -1;
6097 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5490 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6098 } 5491 }
6099 return 0; 5492 return 0;
6100 } 5493 }
6101 /* lookup the character, put the result in the output string and adjust 5494 /* lookup the character, put the result in the output string and adjust
6102 various state variables. Return a new reference to the object that 5495 various state variables. Return a new reference to the object that
6103 was put in the output buffer in *result, or Py_None, if the mapping was 5496 was put in the output buffer in *result, or Py_None, if the mapping was
6104 undefined (in which case no character was written). 5497 undefined (in which case no character was written).
6105 The called must decref result. 5498 The called must decref result.
6106 Return 0 on success, -1 on error. */ 5499 Return 0 on success, -1 on error. */
6107 static int 5500 static
6108 charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5501 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp ,
6109 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5502 Py_ssize_t insize, PyObject *mapping, PyObject **out obj, Py_UNICODE **outp,
6110 PyObject **res) 5503 PyObject **res)
6111 { 5504 {
6112 if (charmaptranslate_lookup(*curinp, mapping, res)) 5505 if (charmaptranslate_lookup(*curinp, mapping, res))
6113 return -1; 5506 return -1;
6114 if (*res==NULL) { 5507 if (*res==NULL) {
6115 /* not found => default to 1:1 mapping */ 5508 /* not found => default to 1:1 mapping */
6116 *(*outp)++ = *curinp; 5509 *(*outp)++ = *curinp;
6117 } 5510 }
6118 else if (*res==Py_None) 5511 else if (*res==Py_None)
6119 ; 5512 ;
6120 else if (PyLong_Check(*res)) { 5513 else if (PyLong_Check(*res)) {
(...skipping 15 matching lines...) Expand all
6136 return -1; 5529 return -1;
6137 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize ); 5530 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize );
6138 *outp += repsize; 5531 *outp += repsize;
6139 } 5532 }
6140 } 5533 }
6141 else 5534 else
6142 return -1; 5535 return -1;
6143 return 0; 5536 return 0;
6144 } 5537 }
6145 5538
6146 PyObject * 5539 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6147 PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5540 Py_ssize_t size,
6148 Py_ssize_t size, 5541 PyObject *mapping,
6149 PyObject *mapping, 5542 const char *errors)
6150 const char *errors)
6151 { 5543 {
6152 /* output object */ 5544 /* output object */
6153 PyObject *res = NULL; 5545 PyObject *res = NULL;
6154 /* pointers to the beginning and end+1 of input */ 5546 /* pointers to the beginning and end+1 of input */
6155 const Py_UNICODE *startp = p; 5547 const Py_UNICODE *startp = p;
6156 const Py_UNICODE *endp = p + size; 5548 const Py_UNICODE *endp = p + size;
6157 /* pointer into the output */ 5549 /* pointer into the output */
6158 Py_UNICODE *str; 5550 Py_UNICODE *str;
6159 /* current output position */ 5551 /* current output position */
6160 Py_ssize_t respos = 0; 5552 Py_ssize_t respos = 0;
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
6279 Py_XDECREF(errorHandler); 5671 Py_XDECREF(errorHandler);
6280 return res; 5672 return res;
6281 5673
6282 onError: 5674 onError:
6283 Py_XDECREF(res); 5675 Py_XDECREF(res);
6284 Py_XDECREF(exc); 5676 Py_XDECREF(exc);
6285 Py_XDECREF(errorHandler); 5677 Py_XDECREF(errorHandler);
6286 return NULL; 5678 return NULL;
6287 } 5679 }
6288 5680
6289 PyObject * 5681 PyObject *PyUnicode_Translate(PyObject *str,
6290 PyUnicode_Translate(PyObject *str, 5682 PyObject *mapping,
6291 PyObject *mapping, 5683 const char *errors)
6292 const char *errors)
6293 { 5684 {
6294 PyObject *result; 5685 PyObject *result;
6295 5686
6296 str = PyUnicode_FromObject(str); 5687 str = PyUnicode_FromObject(str);
6297 if (str == NULL) 5688 if (str == NULL)
6298 goto onError; 5689 goto onError;
6299 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5690 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6300 PyUnicode_GET_SIZE(str), 5691 PyUnicode_GET_SIZE(str),
6301 mapping, 5692 mapping,
6302 errors); 5693 errors);
6303 Py_DECREF(str); 5694 Py_DECREF(str);
6304 return result; 5695 return result;
6305 5696
6306 onError: 5697 onError:
6307 Py_XDECREF(str); 5698 Py_XDECREF(str);
6308 return NULL; 5699 return NULL;
6309 } 5700 }
6310 5701
6311 PyObject *
6312 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6313 Py_ssize_t length)
6314 {
6315 PyObject *result;
6316 Py_UNICODE *p; /* write pointer into result */
6317 Py_ssize_t i;
6318 /* Copy to a new string */
6319 result = (PyObject *)_PyUnicode_New(length);
6320 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6321 if (result == NULL)
6322 return result;
6323 p = PyUnicode_AS_UNICODE(result);
6324 /* Iterate over code points */
6325 for (i = 0; i < length; i++) {
6326 Py_UNICODE ch =s[i];
6327 if (ch > 127) {
6328 int decimal = Py_UNICODE_TODECIMAL(ch);
6329 if (decimal >= 0)
6330 p[i] = '0' + decimal;
6331 }
6332 }
6333 return result;
6334 }
6335 /* --- Decimal Encoder ---------------------------------------------------- */ 5702 /* --- Decimal Encoder ---------------------------------------------------- */
6336 5703
6337 int 5704 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
6338 PyUnicode_EncodeDecimal(Py_UNICODE *s, 5705 Py_ssize_t length,
6339 Py_ssize_t length, 5706 char *output,
6340 char *output, 5707 const char *errors)
6341 const char *errors)
6342 { 5708 {
6343 Py_UNICODE *p, *end; 5709 Py_UNICODE *p, *end;
6344 PyObject *errorHandler = NULL; 5710 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL; 5711 PyObject *exc = NULL;
6346 const char *encoding = "decimal"; 5712 const char *encoding = "decimal";
6347 const char *reason = "invalid decimal Unicode string"; 5713 const char *reason = "invalid decimal Unicode string";
6348 /* the following variable is used for caching string comparisons 5714 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharre freplace */ 5715 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharre freplace */
6350 int known_errorHandler = -1; 5716 int known_errorHandler = -1;
6351 5717
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
6467 onError: 5833 onError:
6468 Py_XDECREF(exc); 5834 Py_XDECREF(exc);
6469 Py_XDECREF(errorHandler); 5835 Py_XDECREF(errorHandler);
6470 return -1; 5836 return -1;
6471 } 5837 }
6472 5838
6473 /* --- Helpers ------------------------------------------------------------ */ 5839 /* --- Helpers ------------------------------------------------------------ */
6474 5840
6475 #include "stringlib/unicodedefs.h" 5841 #include "stringlib/unicodedefs.h"
6476 #include "stringlib/fastsearch.h" 5842 #include "stringlib/fastsearch.h"
6477
6478 #include "stringlib/count.h" 5843 #include "stringlib/count.h"
5844 /* Include _ParseTupleFinds from find.h */
5845 #define FROM_UNICODE
6479 #include "stringlib/find.h" 5846 #include "stringlib/find.h"
6480 #include "stringlib/partition.h" 5847 #include "stringlib/partition.h"
6481 #include "stringlib/split.h"
6482 5848
6483 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5849 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6484 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLoca le 5850 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLoca le
6485 #include "stringlib/localeutil.h" 5851 #include "stringlib/localeutil.h"
6486 5852
6487 /* helper macro to fixup start/end slice values */ 5853 /* helper macro to fixup start/end slice values */
6488 #define ADJUST_INDICES(start, end, len) \ 5854 #define FIX_START_END(obj) \
6489 if (end > len) \ 5855 if (start < 0) \
6490 end = len; \ 5856 start += (obj)->length; \
6491 else if (end < 0) { \ 5857 if (start < 0) \
6492 end += len; \ 5858 start = 0; \
6493 if (end < 0) \ 5859 if (end > (obj)->length) \
6494 end = 0; \ 5860 end = (obj)->length; \
6495 } \ 5861 if (end < 0) \
6496 if (start < 0) { \ 5862 end += (obj)->length; \
6497 start += len; \ 5863 if (end < 0) \
6498 if (start < 0) \ 5864 end = 0;
6499 start = 0; \ 5865
6500 } 5866 Py_ssize_t PyUnicode_Count(PyObject *str,
6501 5867 PyObject *substr,
6502 Py_ssize_t 5868 Py_ssize_t start,
6503 PyUnicode_Count(PyObject *str, 5869 Py_ssize_t end)
6504 PyObject *substr,
6505 Py_ssize_t start,
6506 Py_ssize_t end)
6507 { 5870 {
6508 Py_ssize_t result; 5871 Py_ssize_t result;
6509 PyUnicodeObject* str_obj; 5872 PyUnicodeObject* str_obj;
6510 PyUnicodeObject* sub_obj; 5873 PyUnicodeObject* sub_obj;
6511 5874
6512 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6513 if (!str_obj) 5876 if (!str_obj)
6514 return -1; 5877 return -1;
6515 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6516 if (!sub_obj) { 5879 if (!sub_obj) {
6517 Py_DECREF(str_obj); 5880 Py_DECREF(str_obj);
6518 return -1; 5881 return -1;
6519 } 5882 }
6520 5883
6521 ADJUST_INDICES(start, end, str_obj->length); 5884 FIX_START_END(str_obj);
5885
6522 result = stringlib_count( 5886 result = stringlib_count(
6523 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 5887 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
6524 PY_SSIZE_T_MAX
6525 ); 5888 );
6526 5889
6527 Py_DECREF(sub_obj); 5890 Py_DECREF(sub_obj);
6528 Py_DECREF(str_obj); 5891 Py_DECREF(str_obj);
6529 5892
6530 return result; 5893 return result;
6531 } 5894 }
6532 5895
6533 Py_ssize_t 5896 Py_ssize_t PyUnicode_Find(PyObject *str,
6534 PyUnicode_Find(PyObject *str, 5897 PyObject *sub,
6535 PyObject *sub, 5898 Py_ssize_t start,
6536 Py_ssize_t start, 5899 Py_ssize_t end,
6537 Py_ssize_t end, 5900 int direction)
6538 int direction)
6539 { 5901 {
6540 Py_ssize_t result; 5902 Py_ssize_t result;
6541 5903
6542 str = PyUnicode_FromObject(str); 5904 str = PyUnicode_FromObject(str);
6543 if (!str) 5905 if (!str)
6544 return -2; 5906 return -2;
6545 sub = PyUnicode_FromObject(sub); 5907 sub = PyUnicode_FromObject(sub);
6546 if (!sub) { 5908 if (!sub) {
6547 Py_DECREF(str); 5909 Py_DECREF(str);
6548 return -2; 5910 return -2;
(...skipping 11 matching lines...) Expand all
6560 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5922 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6561 start, end 5923 start, end
6562 ); 5924 );
6563 5925
6564 Py_DECREF(str); 5926 Py_DECREF(str);
6565 Py_DECREF(sub); 5927 Py_DECREF(sub);
6566 5928
6567 return result; 5929 return result;
6568 } 5930 }
6569 5931
6570 static int 5932 static
6571 tailmatch(PyUnicodeObject *self, 5933 int tailmatch(PyUnicodeObject *self,
6572 PyUnicodeObject *substring, 5934 PyUnicodeObject *substring,
6573 Py_ssize_t start, 5935 Py_ssize_t start,
6574 Py_ssize_t end, 5936 Py_ssize_t end,
6575 int direction) 5937 int direction)
6576 { 5938 {
6577 if (substring->length == 0) 5939 if (substring->length == 0)
6578 return 1; 5940 return 1;
6579 5941
6580 ADJUST_INDICES(start, end, self->length); 5942 FIX_START_END(self);
5943
6581 end -= substring->length; 5944 end -= substring->length;
6582 if (end < start) 5945 if (end < start)
6583 return 0; 5946 return 0;
6584 5947
6585 if (direction > 0) { 5948 if (direction > 0) {
6586 if (Py_UNICODE_MATCH(self, end, substring)) 5949 if (Py_UNICODE_MATCH(self, end, substring))
6587 return 1; 5950 return 1;
6588 } else { 5951 } else {
6589 if (Py_UNICODE_MATCH(self, start, substring)) 5952 if (Py_UNICODE_MATCH(self, start, substring))
6590 return 1; 5953 return 1;
6591 } 5954 }
6592 5955
6593 return 0; 5956 return 0;
6594 } 5957 }
6595 5958
6596 Py_ssize_t 5959 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6597 PyUnicode_Tailmatch(PyObject *str, 5960 PyObject *substr,
6598 PyObject *substr, 5961 Py_ssize_t start,
6599 Py_ssize_t start, 5962 Py_ssize_t end,
6600 Py_ssize_t end, 5963 int direction)
6601 int direction)
6602 { 5964 {
6603 Py_ssize_t result; 5965 Py_ssize_t result;
6604 5966
6605 str = PyUnicode_FromObject(str); 5967 str = PyUnicode_FromObject(str);
6606 if (str == NULL) 5968 if (str == NULL)
6607 return -1; 5969 return -1;
6608 substr = PyUnicode_FromObject(substr); 5970 substr = PyUnicode_FromObject(substr);
6609 if (substr == NULL) { 5971 if (substr == NULL) {
6610 Py_DECREF(str); 5972 Py_DECREF(str);
6611 return -1; 5973 return -1;
6612 } 5974 }
6613 5975
6614 result = tailmatch((PyUnicodeObject *)str, 5976 result = tailmatch((PyUnicodeObject *)str,
6615 (PyUnicodeObject *)substr, 5977 (PyUnicodeObject *)substr,
6616 start, end, direction); 5978 start, end, direction);
6617 Py_DECREF(str); 5979 Py_DECREF(str);
6618 Py_DECREF(substr); 5980 Py_DECREF(substr);
6619 return result; 5981 return result;
6620 } 5982 }
6621 5983
6622 /* Apply fixfct filter to the Unicode object self and return a 5984 /* Apply fixfct filter to the Unicode object self and return a
6623 reference to the modified object */ 5985 reference to the modified object */
6624 5986
6625 static PyObject * 5987 static
6626 fixup(PyUnicodeObject *self, 5988 PyObject *fixup(PyUnicodeObject *self,
6627 int (*fixfct)(PyUnicodeObject *s)) 5989 int (*fixfct)(PyUnicodeObject *s))
6628 { 5990 {
6629 5991
6630 PyUnicodeObject *u; 5992 PyUnicodeObject *u;
6631 5993
6632 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5994 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6633 if (u == NULL) 5995 if (u == NULL)
6634 return NULL; 5996 return NULL;
6635 5997
6636 Py_UNICODE_COPY(u->str, self->str, self->length); 5998 Py_UNICODE_COPY(u->str, self->str, self->length);
6637 5999
6638 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6000 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6639 /* fixfct should return TRUE if it modified the buffer. If 6001 /* fixfct should return TRUE if it modified the buffer. If
6640 FALSE, return a reference to the original buffer instead 6002 FALSE, return a reference to the original buffer instead
6641 (to save space, not time) */ 6003 (to save space, not time) */
6642 Py_INCREF(self); 6004 Py_INCREF(self);
6643 Py_DECREF(u); 6005 Py_DECREF(u);
6644 return (PyObject*) self; 6006 return (PyObject*) self;
6645 } 6007 }
6646 return (PyObject*) u; 6008 return (PyObject*) u;
6647 } 6009 }
6648 6010
6649 static int 6011 static
6650 fixupper(PyUnicodeObject *self) 6012 int fixupper(PyUnicodeObject *self)
6651 { 6013 {
6652 Py_ssize_t len = self->length; 6014 Py_ssize_t len = self->length;
6653 Py_UNICODE *s = self->str; 6015 Py_UNICODE *s = self->str;
6654 int status = 0; 6016 int status = 0;
6655 6017
6656 while (len-- > 0) { 6018 while (len-- > 0) {
6657 register Py_UNICODE ch; 6019 register Py_UNICODE ch;
6658 6020
6659 ch = Py_UNICODE_TOUPPER(*s); 6021 ch = Py_UNICODE_TOUPPER(*s);
6660 if (ch != *s) { 6022 if (ch != *s) {
6661 status = 1; 6023 status = 1;
6662 *s = ch; 6024 *s = ch;
6663 } 6025 }
6664 s++; 6026 s++;
6665 } 6027 }
6666 6028
6667 return status; 6029 return status;
6668 } 6030 }
6669 6031
6670 static int 6032 static
6671 fixlower(PyUnicodeObject *self) 6033 int fixlower(PyUnicodeObject *self)
6672 { 6034 {
6673 Py_ssize_t len = self->length; 6035 Py_ssize_t len = self->length;
6674 Py_UNICODE *s = self->str; 6036 Py_UNICODE *s = self->str;
6675 int status = 0; 6037 int status = 0;
6676 6038
6677 while (len-- > 0) { 6039 while (len-- > 0) {
6678 register Py_UNICODE ch; 6040 register Py_UNICODE ch;
6679 6041
6680 ch = Py_UNICODE_TOLOWER(*s); 6042 ch = Py_UNICODE_TOLOWER(*s);
6681 if (ch != *s) { 6043 if (ch != *s) {
6682 status = 1; 6044 status = 1;
6683 *s = ch; 6045 *s = ch;
6684 } 6046 }
6685 s++; 6047 s++;
6686 } 6048 }
6687 6049
6688 return status; 6050 return status;
6689 } 6051 }
6690 6052
6691 static int 6053 static
6692 fixswapcase(PyUnicodeObject *self) 6054 int fixswapcase(PyUnicodeObject *self)
6693 { 6055 {
6694 Py_ssize_t len = self->length; 6056 Py_ssize_t len = self->length;
6695 Py_UNICODE *s = self->str; 6057 Py_UNICODE *s = self->str;
6696 int status = 0; 6058 int status = 0;
6697 6059
6698 while (len-- > 0) { 6060 while (len-- > 0) {
6699 if (Py_UNICODE_ISUPPER(*s)) { 6061 if (Py_UNICODE_ISUPPER(*s)) {
6700 *s = Py_UNICODE_TOLOWER(*s); 6062 *s = Py_UNICODE_TOLOWER(*s);
6701 status = 1; 6063 status = 1;
6702 } else if (Py_UNICODE_ISLOWER(*s)) { 6064 } else if (Py_UNICODE_ISLOWER(*s)) {
6703 *s = Py_UNICODE_TOUPPER(*s); 6065 *s = Py_UNICODE_TOUPPER(*s);
6704 status = 1; 6066 status = 1;
6705 } 6067 }
6706 s++; 6068 s++;
6707 } 6069 }
6708 6070
6709 return status; 6071 return status;
6710 } 6072 }
6711 6073
6712 static int 6074 static
6713 fixcapitalize(PyUnicodeObject *self) 6075 int fixcapitalize(PyUnicodeObject *self)
6714 { 6076 {
6715 Py_ssize_t len = self->length; 6077 Py_ssize_t len = self->length;
6716 Py_UNICODE *s = self->str; 6078 Py_UNICODE *s = self->str;
6717 int status = 0; 6079 int status = 0;
6718 6080
6719 if (len == 0) 6081 if (len == 0)
6720 return 0; 6082 return 0;
6721 if (Py_UNICODE_ISLOWER(*s)) { 6083 if (Py_UNICODE_ISLOWER(*s)) {
6722 *s = Py_UNICODE_TOUPPER(*s); 6084 *s = Py_UNICODE_TOUPPER(*s);
6723 status = 1; 6085 status = 1;
6724 } 6086 }
6725 s++; 6087 s++;
6726 while (--len > 0) { 6088 while (--len > 0) {
6727 if (Py_UNICODE_ISUPPER(*s)) { 6089 if (Py_UNICODE_ISUPPER(*s)) {
6728 *s = Py_UNICODE_TOLOWER(*s); 6090 *s = Py_UNICODE_TOLOWER(*s);
6729 status = 1; 6091 status = 1;
6730 } 6092 }
6731 s++; 6093 s++;
6732 } 6094 }
6733 return status; 6095 return status;
6734 } 6096 }
6735 6097
6736 static int 6098 static
6737 fixtitle(PyUnicodeObject *self) 6099 int fixtitle(PyUnicodeObject *self)
6738 { 6100 {
6739 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6101 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register Py_UNICODE *e; 6102 register Py_UNICODE *e;
6741 int previous_is_cased; 6103 int previous_is_cased;
6742 6104
6743 /* Shortcut for single character strings */ 6105 /* Shortcut for single character strings */
6744 if (PyUnicode_GET_SIZE(self) == 1) { 6106 if (PyUnicode_GET_SIZE(self) == 1) {
6745 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6107 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6746 if (*p != ch) { 6108 if (*p != ch) {
6747 *p = ch; 6109 *p = ch;
(...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after
6877 Done: 6239 Done:
6878 Py_DECREF(fseq); 6240 Py_DECREF(fseq);
6879 return (PyObject *)res; 6241 return (PyObject *)res;
6880 6242
6881 onError: 6243 onError:
6882 Py_DECREF(fseq); 6244 Py_DECREF(fseq);
6883 Py_XDECREF(res); 6245 Py_XDECREF(res);
6884 return NULL; 6246 return NULL;
6885 } 6247 }
6886 6248
6887 static PyUnicodeObject * 6249 static
6888 pad(PyUnicodeObject *self, 6250 PyUnicodeObject *pad(PyUnicodeObject *self,
6889 Py_ssize_t left, 6251 Py_ssize_t left,
6890 Py_ssize_t right, 6252 Py_ssize_t right,
6891 Py_UNICODE fill) 6253 Py_UNICODE fill)
6892 { 6254 {
6893 PyUnicodeObject *u; 6255 PyUnicodeObject *u;
6894 6256
6895 if (left < 0) 6257 if (left < 0)
6896 left = 0; 6258 left = 0;
6897 if (right < 0) 6259 if (right < 0)
6898 right = 0; 6260 right = 0;
6899 6261
6900 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6262 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6901 Py_INCREF(self); 6263 Py_INCREF(self);
(...skipping 10 matching lines...)