Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(4391)

Delta Between Two Patch Sets: Objects/unicodeobject.c

Issue 15027: Faster UTF-32 encoding
Left Patch Set: Created 6 years, 7 months ago
Right Patch Set: Created 5 years, 5 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« Objects/stringlib/codecs.h ('K') | « Objects/stringlib/codecs.h ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT RIGHT
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>. 4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 5
6 Major speed upgrades to the method implementations at the Reykjavik 6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 8
9 Copyright (c) Corporation for National Research Initiatives. 9 Copyright (c) Corporation for National Research Initiatives.
10 10
(...skipping 29 matching lines...) Expand all
40 40
41 #define PY_SSIZE_T_CLEAN 41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h" 42 #include "Python.h"
43 #include "ucnhash.h" 43 #include "ucnhash.h"
44 #include "bytes_methods.h" 44 #include "bytes_methods.h"
45 45
46 #ifdef MS_WINDOWS 46 #ifdef MS_WINDOWS
47 #include <windows.h> 47 #include <windows.h>
48 #endif 48 #endif
49 49
50 /*[clinic]
51 class str
52 [clinic]*/
53 /*[clinic checksum: da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
54
50 /* --- Globals ------------------------------------------------------------ 55 /* --- Globals ------------------------------------------------------------
51 56
52 The globals are initialized by the _PyUnicode_Init() API and should 57 NOTE: In the interpreter's initialization phase, some globals are currently
53 not be used before calling that API. 58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
54 60
55 */ 61 */
56 62
57 63
58 #ifdef __cplusplus 64 #ifdef __cplusplus
59 extern "C" { 65 extern "C" {
60 #endif 66 #endif
61 67
62 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 68 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
63 #define MAX_UNICODE 0x10ffff 69 #define MAX_UNICODE 0x10ffff
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
96 (((PyASCIIObject *)(op))->hash) 102 (((PyASCIIObject *)(op))->hash)
97 #define _PyUnicode_KIND(op) \ 103 #define _PyUnicode_KIND(op) \
98 (assert(_PyUnicode_CHECK(op)), \ 104 (assert(_PyUnicode_CHECK(op)), \
99 ((PyASCIIObject *)(op))->state.kind) 105 ((PyASCIIObject *)(op))->state.kind)
100 #define _PyUnicode_GET_LENGTH(op) \ 106 #define _PyUnicode_GET_LENGTH(op) \
101 (assert(_PyUnicode_CHECK(op)), \ 107 (assert(_PyUnicode_CHECK(op)), \
102 ((PyASCIIObject *)(op))->length) 108 ((PyASCIIObject *)(op))->length)
103 #define _PyUnicode_DATA_ANY(op) \ 109 #define _PyUnicode_DATA_ANY(op) \
104 (((PyUnicodeObject*)(op))->data.any) 110 (((PyUnicodeObject*)(op))->data.any)
105 111
106 /* Optimized version of Py_MAX() to compute the maximum character:
107 use it when you are computing the second argument of PyUnicode_New() */
108 #define MAX_MAXCHAR(maxchar1, maxchar2) \
109 ((maxchar1) | (maxchar2))
110
111 #undef PyUnicode_READY 112 #undef PyUnicode_READY
112 #define PyUnicode_READY(op) \ 113 #define PyUnicode_READY(op) \
113 (assert(_PyUnicode_CHECK(op)), \ 114 (assert(_PyUnicode_CHECK(op)), \
114 (PyUnicode_IS_READY(op) ? \ 115 (PyUnicode_IS_READY(op) ? \
115 0 : \ 116 0 : \
116 _PyUnicode_Ready(op))) 117 _PyUnicode_Ready(op)))
117 118
118 #define _PyUnicode_SHARE_UTF8(op) \ 119 #define _PyUnicode_SHARE_UTF8(op) \
119 (assert(_PyUnicode_CHECK(op)), \ 120 (assert(_PyUnicode_CHECK(op)), \
120 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
121 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
122 #define _PyUnicode_SHARE_WSTR(op) \ 123 #define _PyUnicode_SHARE_WSTR(op) \
123 (assert(_PyUnicode_CHECK(op)), \ 124 (assert(_PyUnicode_CHECK(op)), \
124 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
125 126
126 /* true if the Unicode object has an allocated UTF-8 memory block 127 /* true if the Unicode object has an allocated UTF-8 memory block
127 (not shared with other data) */ 128 (not shared with other data) */
128 #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 129 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
129 (assert(_PyUnicode_CHECK(op)), \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \
130 (!PyUnicode_IS_COMPACT_ASCII(op) \
131 && _PyUnicode_UTF8(op) \ 131 && _PyUnicode_UTF8(op) \
132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133 133
134 /* true if the Unicode object has an allocated wstr memory block 134 /* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */ 135 (not shared with other data) */
136 #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 136 #define _PyUnicode_HAS_WSTR_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \ 137 ((_PyUnicode_WSTR(op) && \
138 (_PyUnicode_WSTR(op) && \
139 (!PyUnicode_IS_READY(op) || \ 138 (!PyUnicode_IS_READY(op) || \
140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141 140
142 /* Generic helper macro to convert characters of different types. 141 /* Generic helper macro to convert characters of different types.
143 from_type and to_type have to be valid type names, begin and end 142 from_type and to_type have to be valid type names, begin and end
144 are pointers to the source characters which should be of type 143 are pointers to the source characters which should be of type
145 "from_type *". to is a pointer of type "to_type *" and points to the 144 "from_type *". to is a pointer of type "to_type *" and points to the
146 buffer where the result characters are written to. */ 145 buffer where the result characters are written to. */
147 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 146 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
148 do { \ 147 do { \
149 to_type *_to = (to_type *) to; \ 148 to_type *_to = (to_type *)(to); \
150 const from_type *_iter = (begin); \ 149 const from_type *_iter = (from_type *)(begin); \
151 const from_type *_end = (end); \ 150 const from_type *_end = (from_type *)(end); \
152 Py_ssize_t n = (_end) - (_iter); \ 151 Py_ssize_t n = (_end) - (_iter); \
153 const from_type *_unrolled_end = \ 152 const from_type *_unrolled_end = \
154 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 153 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
155 while (_iter < (_unrolled_end)) { \ 154 while (_iter < (_unrolled_end)) { \
156 _to[0] = (to_type) _iter[0]; \ 155 _to[0] = (to_type) _iter[0]; \
157 _to[1] = (to_type) _iter[1]; \ 156 _to[1] = (to_type) _iter[1]; \
158 _to[2] = (to_type) _iter[2]; \ 157 _to[2] = (to_type) _iter[2]; \
159 _to[3] = (to_type) _iter[3]; \ 158 _to[3] = (to_type) _iter[3]; \
160 _iter += 4; _to += 4; \ 159 _iter += 4; _to += 4; \
161 } \ 160 } \
162 while (_iter < (_end)) \ 161 while (_iter < (_end)) \
163 *_to++ = (to_type) *_iter++; \ 162 *_to++ = (to_type) *_iter++; \
164 } while (0) 163 } while (0)
165 164
166 /* This dictionary holds all interned unicode strings. Note that references 165 /* This dictionary holds all interned unicode strings. Note that references
167 to strings in this dictionary are *not* counted in the string's ob_refcnt. 166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
168 When the interned string reaches a refcnt of 0 the string deallocation 167 When the interned string reaches a refcnt of 0 the string deallocation
169 function will delete the reference from this dictionary. 168 function will delete the reference from this dictionary.
170 169
171 Another way to look at this is to say that the actual reference 170 Another way to look at this is to say that the actual reference
172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
173 */ 172 */
174 static PyObject *interned; 173 static PyObject *interned = NULL;
175 174
176 /* The empty Unicode object is shared to improve performance. */ 175 /* The empty Unicode object is shared to improve performance. */
177 static PyObject *unicode_empty; 176 static PyObject *unicode_empty = NULL;
177
178 #define _Py_INCREF_UNICODE_EMPTY() \
179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
185 Py_INCREF(unicode_empty); \
186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
188 } \
189 } while (0)
190
191 #define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
196
197 /* Forward declaration */
198 Py_LOCAL_INLINE(int)
199 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
178 200
179 /* List of static strings. */ 201 /* List of static strings. */
180 static _Py_Identifier *static_strings; 202 static _Py_Identifier *static_strings = NULL;
181 203
182 /* Single character Unicode strings in the Latin-1 range are being 204 /* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */ 205 shared as well. */
184 static PyObject *unicode_latin1[256]; 206 static PyObject *unicode_latin1[256] = {NULL};
185 207
186 /* Fast detection of the most frequent whitespace characters */ 208 /* Fast detection of the most frequent whitespace characters */
187 const unsigned char _Py_ascii_whitespace[] = { 209 const unsigned char _Py_ascii_whitespace[] = {
188 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0,
189 /* case 0x0009: * CHARACTER TABULATION */ 211 /* case 0x0009: * CHARACTER TABULATION */
190 /* case 0x000A: * LINE FEED */ 212 /* case 0x000A: * LINE FEED */
191 /* case 0x000B: * LINE TABULATION */ 213 /* case 0x000B: * LINE TABULATION */
192 /* case 0x000C: * FORM FEED */ 214 /* case 0x000C: * FORM FEED */
193 /* case 0x000D: * CARRIAGE RETURN */ 215 /* case 0x000D: * CARRIAGE RETURN */
194 0, 1, 1, 1, 1, 1, 0, 0, 216 0, 1, 1, 1, 1, 1, 0, 0,
(...skipping 19 matching lines...) Expand all
214 0, 0, 0, 0, 0, 0, 0, 0 236 0, 0, 0, 0, 0, 0, 0, 0
215 }; 237 };
216 238
217 /* forward */ 239 /* forward */
218 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 240 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
219 static PyObject* get_latin1_char(unsigned char ch); 241 static PyObject* get_latin1_char(unsigned char ch);
220 static int unicode_modifiable(PyObject *unicode); 242 static int unicode_modifiable(PyObject *unicode);
221 243
222 244
223 static PyObject * 245 static PyObject *
224 _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 246 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
225 static PyObject * 247 static PyObject *
226 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 248 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227 static PyObject * 249 static PyObject *
228 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 250 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229 251
230 static PyObject * 252 static PyObject *
231 unicode_encode_call_errorhandler(const char *errors, 253 unicode_encode_call_errorhandler(const char *errors,
232 PyObject **errorHandler,const char *encoding, const char *reason, 254 PyObject **errorHandler,const char *encoding, const char *reason,
233 PyObject *unicode, PyObject **exceptionObject, 255 PyObject *unicode, PyObject **exceptionObject,
234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
399 #endif 421 #endif
400 422
401 static PyObject* 423 static PyObject*
402 unicode_result_wchar(PyObject *unicode) 424 unicode_result_wchar(PyObject *unicode)
403 { 425 {
404 #ifndef Py_DEBUG 426 #ifndef Py_DEBUG
405 Py_ssize_t len; 427 Py_ssize_t len;
406 428
407 len = _PyUnicode_WSTR_LENGTH(unicode); 429 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) { 430 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode); 431 Py_DECREF(unicode);
411 return unicode_empty; 432 _Py_RETURN_UNICODE_EMPTY();
412 } 433 }
413 434
414 if (len == 1) { 435 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) { 437 if ((Py_UCS4)ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode); 439 Py_DECREF(unicode);
419 return latin1_char; 440 return latin1_char;
420 } 441 }
421 } 442 }
422 443
423 if (_PyUnicode_Ready(unicode) < 0) { 444 if (_PyUnicode_Ready(unicode) < 0) {
424 Py_DECREF(unicode); 445 Py_DECREF(unicode);
425 return NULL; 446 return NULL;
426 } 447 }
427 #else 448 #else
428 assert(Py_REFCNT(unicode) == 1); 449 assert(Py_REFCNT(unicode) == 1);
429 450
430 /* don't make the result ready in debug mode to ensure that the caller 451 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */ 452 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1)); 453 assert(_PyUnicode_CheckConsistency(unicode, 1));
433 #endif 454 #endif
434 return unicode; 455 return unicode;
435 } 456 }
436 457
437 static PyObject* 458 static PyObject*
438 unicode_result_ready(PyObject *unicode) 459 unicode_result_ready(PyObject *unicode)
439 { 460 {
440 Py_ssize_t length; 461 Py_ssize_t length;
441 462
442 length = PyUnicode_GET_LENGTH(unicode); 463 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) { 464 if (length == 0) {
444 if (unicode != unicode_empty) { 465 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode); 466 Py_DECREF(unicode);
467 _Py_RETURN_UNICODE_EMPTY();
447 } 468 }
448 return unicode_empty; 469 return unicode_empty;
449 } 470 }
450 471
451 if (length == 1) { 472 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
453 if (ch < 256) { 476 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch]; 477 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) { 478 if (latin1_char != NULL) {
456 if (unicode != latin1_char) { 479 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char); 480 Py_INCREF(latin1_char);
458 Py_DECREF(unicode); 481 Py_DECREF(unicode);
459 } 482 }
460 return latin1_char; 483 return latin1_char;
461 } 484 }
462 else { 485 else {
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
513 #elif LONG_BIT >= 64 536 #elif LONG_BIT >= 64
514 #define BLOOM_WIDTH 64 537 #define BLOOM_WIDTH 64
515 #elif LONG_BIT >= 32 538 #elif LONG_BIT >= 32
516 #define BLOOM_WIDTH 32 539 #define BLOOM_WIDTH 32
517 #else 540 #else
518 #error "LONG_BIT is smaller than 32" 541 #error "LONG_BIT is smaller than 32"
519 #endif 542 #endif
520 543
521 #define BLOOM_MASK unsigned long 544 #define BLOOM_MASK unsigned long
522 545
523 static BLOOM_MASK bloom_linebreak; 546 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
524 547
525 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
526 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 548 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
527 549
528 #define BLOOM_LINEBREAK(ch) \ 550 #define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
531 553
532 Py_LOCAL_INLINE(BLOOM_MASK) 554 Py_LOCAL_INLINE(BLOOM_MASK)
533 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 555 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
534 { 556 {
557 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
535 /* calculate simple bloom-style bitmask for a given unicode string */ 569 /* calculate simple bloom-style bitmask for a given unicode string */
536 570
537 BLOOM_MASK mask; 571 BLOOM_MASK mask;
538 Py_ssize_t i;
539 572
540 mask = 0; 573 mask = 0;
541 for (i = 0; i < len; i++) 574 switch (kind) {
542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 575 case PyUnicode_1BYTE_KIND:
543 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
544 return mask; 587 return mask;
545 } 588
546 589 #undef BLOOM_UPDATE
547 #define BLOOM_MEMBER(mask, chr, str) \ 590 }
548 (BLOOM(mask, chr) \
549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
550 591
551 /* Compilation of templated routines */ 592 /* Compilation of templated routines */
552 593
553 #include "stringlib/asciilib.h" 594 #include "stringlib/asciilib.h"
554 #include "stringlib/fastsearch.h" 595 #include "stringlib/fastsearch.h"
555 #include "stringlib/partition.h" 596 #include "stringlib/partition.h"
556 #include "stringlib/split.h" 597 #include "stringlib/split.h"
557 #include "stringlib/count.h" 598 #include "stringlib/count.h"
558 #include "stringlib/find.h" 599 #include "stringlib/find.h"
559 #include "stringlib/find_max_char.h" 600 #include "stringlib/find_max_char.h"
560 #include "stringlib/localeutil.h" 601 #include "stringlib/localeutil.h"
561 #include "stringlib/undef.h" 602 #include "stringlib/undef.h"
562 603
563 #include "stringlib/ucs1lib.h" 604 #include "stringlib/ucs1lib.h"
564 #include "stringlib/fastsearch.h" 605 #include "stringlib/fastsearch.h"
565 #include "stringlib/partition.h" 606 #include "stringlib/partition.h"
566 #include "stringlib/split.h" 607 #include "stringlib/split.h"
567 #include "stringlib/count.h" 608 #include "stringlib/count.h"
568 #include "stringlib/find.h" 609 #include "stringlib/find.h"
610 #include "stringlib/replace.h"
569 #include "stringlib/find_max_char.h" 611 #include "stringlib/find_max_char.h"
570 #include "stringlib/localeutil.h" 612 #include "stringlib/localeutil.h"
571 #include "stringlib/undef.h" 613 #include "stringlib/undef.h"
572 614
573 #include "stringlib/ucs2lib.h" 615 #include "stringlib/ucs2lib.h"
574 #include "stringlib/fastsearch.h" 616 #include "stringlib/fastsearch.h"
575 #include "stringlib/partition.h" 617 #include "stringlib/partition.h"
576 #include "stringlib/split.h" 618 #include "stringlib/split.h"
577 #include "stringlib/count.h" 619 #include "stringlib/count.h"
578 #include "stringlib/find.h" 620 #include "stringlib/find.h"
621 #include "stringlib/replace.h"
579 #include "stringlib/find_max_char.h" 622 #include "stringlib/find_max_char.h"
580 #include "stringlib/localeutil.h" 623 #include "stringlib/localeutil.h"
581 #include "stringlib/undef.h" 624 #include "stringlib/undef.h"
582 625
583 #include "stringlib/ucs4lib.h" 626 #include "stringlib/ucs4lib.h"
584 #include "stringlib/fastsearch.h" 627 #include "stringlib/fastsearch.h"
585 #include "stringlib/partition.h" 628 #include "stringlib/partition.h"
586 #include "stringlib/split.h" 629 #include "stringlib/split.h"
587 #include "stringlib/count.h" 630 #include "stringlib/count.h"
588 #include "stringlib/find.h" 631 #include "stringlib/find.h"
632 #include "stringlib/replace.h"
589 #include "stringlib/find_max_char.h" 633 #include "stringlib/find_max_char.h"
590 #include "stringlib/localeutil.h" 634 #include "stringlib/localeutil.h"
591 #include "stringlib/undef.h" 635 #include "stringlib/undef.h"
592 636
593 #include "stringlib/unicodedefs.h" 637 #include "stringlib/unicodedefs.h"
594 #include "stringlib/fastsearch.h" 638 #include "stringlib/fastsearch.h"
595 #include "stringlib/count.h" 639 #include "stringlib/count.h"
596 #include "stringlib/find.h" 640 #include "stringlib/find.h"
597 #include "stringlib/undef.h" 641 #include "stringlib/undef.h"
598 642
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after
691 } 735 }
692 unicode = new_unicode; 736 unicode = new_unicode;
693 _Py_NewReference(unicode); 737 _Py_NewReference(unicode);
694 738
695 _PyUnicode_LENGTH(unicode) = length; 739 _PyUnicode_LENGTH(unicode) = length;
696 if (share_wstr) { 740 if (share_wstr) {
697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
698 if (!PyUnicode_IS_ASCII(unicode)) 742 if (!PyUnicode_IS_ASCII(unicode))
699 _PyUnicode_WSTR_LENGTH(unicode) = length; 743 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 } 744 }
745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
701 #ifdef Py_DEBUG 749 #ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length); 750 unicode_fill_invalid(unicode, old_length);
703 #endif 751 #endif
704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0); 753 length, 0);
706 assert(_PyUnicode_CheckConsistency(unicode, 0)); 754 assert(_PyUnicode_CheckConsistency(unicode, 0));
707 return unicode; 755 return unicode;
708 } 756 }
709 757
710 static int 758 static int
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
805 return copy; 853 return copy;
806 } 854 }
807 else { 855 else {
808 PyObject *w; 856 PyObject *w;
809 857
810 w = (PyObject*)_PyUnicode_New(length); 858 w = (PyObject*)_PyUnicode_New(length);
811 if (w == NULL) 859 if (w == NULL)
812 return NULL; 860 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length); 862 copy_length = Py_MIN(copy_length, length);
815 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length); 864 copy_length * sizeof(wchar_t));
817 return w; 865 return w;
818 } 866 }
819 } 867 }
820 868
821 /* We allocate one more byte to make sure the string is 869 /* We allocate one more byte to make sure the string is
822 Ux0000 terminated; some code (e.g. new_identifier) 870 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that. 871 relies on that.
824 872
825 XXX This allocator could further be enhanced by assuring that the 873 XXX This allocator could further be enhanced by assuring that the
826 free list never reduces its size below 1. 874 free list never reduces its size below 1.
827 875
828 */ 876 */
829 877
830 static PyUnicodeObject * 878 static PyUnicodeObject *
831 _PyUnicode_New(Py_ssize_t length) 879 _PyUnicode_New(Py_ssize_t length)
832 { 880 {
833 register PyUnicodeObject *unicode; 881 PyUnicodeObject *unicode;
834 size_t new_size; 882 size_t new_size;
835 883
836 /* Optimization for empty strings */ 884 /* Optimization for empty strings */
837 if (length == 0 && unicode_empty != NULL) { 885 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty); 886 Py_INCREF(unicode_empty);
839 return (PyUnicodeObject*)unicode_empty; 887 return (PyUnicodeObject*)unicode_empty;
840 } 888 }
841 889
842 /* Ensure we won't overflow the size. */ 890 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory(); 892 return (PyUnicodeObject *)PyErr_NoMemory();
845 } 893 }
846 if (length < 0) { 894 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError, 895 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New"); 896 "Negative size passed to _PyUnicode_New");
849 return NULL; 897 return NULL;
850 } 898 }
851 899
852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL) 901 if (unicode == NULL)
854 return NULL; 902 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 904
857 if (!_PyUnicode_WSTR(unicode)) {
858 Py_DECREF(unicode);
859 PyErr_NoMemory();
860 return NULL;
861 }
862
863 /* Initialize the first element to guard against cases where
864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length; 905 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1; 906 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0; 907 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0; 908 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0; 909 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0; 910 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0; 911 _PyUnicode_STATE(unicode).ascii = 0;
879 _PyUnicode_DATA_ANY(unicode) = NULL; 912 _PyUnicode_DATA_ANY(unicode) = NULL;
880 _PyUnicode_LENGTH(unicode) = 0; 913 _PyUnicode_LENGTH(unicode) = 0;
881 _PyUnicode_UTF8(unicode) = NULL; 914 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
919 Py_DECREF(unicode);
920 PyErr_NoMemory();
921 return NULL;
922 }
923
924 /* Initialize the first element to guard against cases where
925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
933
883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
884 return unicode; 935 return unicode;
885 } 936 }
886 937
887 static const char* 938 static const char*
888 unicode_kind_name(PyObject *unicode) 939 unicode_kind_name(PyObject *unicode)
889 { 940 {
890 /* don't check consistency: unicode_kind_name() is called from 941 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */ 942 _PyUnicode_Dump() */
892 if (!PyUnicode_IS_COMPACT(unicode)) 943 if (!PyUnicode_IS_COMPACT(unicode))
(...skipping 614 matching lines...) Expand 10 before | Expand all | Expand 10 after
1507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508 #endif 1559 #endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 } 1561 }
1511 _PyUnicode_STATE(unicode).ready = 1; 1562 _PyUnicode_STATE(unicode).ready = 1;
1512 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1563 assert(_PyUnicode_CheckConsistency(unicode, 1));
1513 return 0; 1564 return 0;
1514 } 1565 }
1515 1566
1516 static void 1567 static void
1517 unicode_dealloc(register PyObject *unicode) 1568 unicode_dealloc(PyObject *unicode)
1518 { 1569 {
1519 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1520 case SSTATE_NOT_INTERNED: 1571 case SSTATE_NOT_INTERNED:
1521 break; 1572 break;
1522 1573
1523 case SSTATE_INTERNED_MORTAL: 1574 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */ 1575 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3; 1576 Py_REFCNT(unicode) = 3;
1526 if (PyDict_DelItem(interned, unicode) != 0) 1577 if (PyDict_DelItem(interned, unicode) != 0)
1527 Py_FatalError( 1578 Py_FatalError(
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1595 assert(0 <= length); 1646 assert(0 <= length);
1596 1647
1597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1598 old_length = PyUnicode_WSTR_LENGTH(unicode); 1649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else 1650 else
1600 old_length = PyUnicode_GET_LENGTH(unicode); 1651 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length) 1652 if (old_length == length)
1602 return 0; 1653 return 0;
1603 1654
1604 if (length == 0) { 1655 if (length == 0) {
1656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
1658 return -1;
1605 Py_DECREF(*p_unicode); 1659 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty; 1660 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0; 1661 return 0;
1609 } 1662 }
1610 1663
1611 if (!unicode_modifiable(unicode)) { 1664 if (!unicode_modifiable(unicode)) {
1612 PyObject *copy = resize_copy(unicode, length); 1665 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL) 1666 if (copy == NULL)
1614 return -1; 1667 return -1;
1615 Py_DECREF(*p_unicode); 1668 Py_DECREF(*p_unicode);
1616 *p_unicode = copy; 1669 *p_unicode = copy;
1617 return 0; 1670 return 0;
(...skipping 17 matching lines...) Expand all
1635 PyErr_BadInternalCall(); 1688 PyErr_BadInternalCall();
1636 return -1; 1689 return -1;
1637 } 1690 }
1638 unicode = *p_unicode; 1691 unicode = *p_unicode;
1639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1640 { 1693 {
1641 PyErr_BadInternalCall(); 1694 PyErr_BadInternalCall();
1642 return -1; 1695 return -1;
1643 } 1696 }
1644 return unicode_resize(p_unicode, length); 1697 return unicode_resize(p_unicode, length);
1645 }
1646
1647 static int
1648 unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
1650 {
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
1653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
1660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664 }
1665
1666 static int
1667 unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669 {
1670 assert(ch <= MAX_UNICODE);
1671 if (unicode_widen(p_unicode, *pos, ch) < 0)
1672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677 } 1698 }
1678 1699
1679 /* Copy a ASCII or latin1 char* string into a Python Unicode string. 1700 /* Copy a ASCII or latin1 char* string into a Python Unicode string.
1680 1701
1681 WARNING: The function doesn't copy the terminating null character and 1702 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an 1703 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */ 1704 ASCII string). */
1684 static void 1705 static void
1685 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1706 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len) 1707 const char *str, Py_ssize_t len)
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
1752 Py_UCS4 maxchar = 0; 1773 Py_UCS4 maxchar = 0;
1753 Py_ssize_t num_surrogates; 1774 Py_ssize_t num_surrogates;
1754 1775
1755 if (u == NULL) 1776 if (u == NULL)
1756 return (PyObject*)_PyUnicode_New(size); 1777 return (PyObject*)_PyUnicode_New(size);
1757 1778
1758 /* If the Unicode data is known at construction time, we can apply 1779 /* If the Unicode data is known at construction time, we can apply
1759 some optimizations which share commonly used objects. */ 1780 some optimizations which share commonly used objects. */
1760 1781
1761 /* Optimization for empty strings */ 1782 /* Optimization for empty strings */
1762 if (size == 0 && unicode_empty != NULL) { 1783 if (size == 0)
1763 Py_INCREF(unicode_empty); 1784 _Py_RETURN_UNICODE_EMPTY();
1764 return unicode_empty;
1765 }
1766 1785
1767 /* Single character Unicode objects in the Latin-1 range are 1786 /* Single character Unicode objects in the Latin-1 range are
1768 shared when using this constructor */ 1787 shared when using this constructor */
1769 if (size == 1 && *u < 256) 1788 if (size == 1 && (Py_UCS4)*u < 256)
1770 return get_latin1_char((unsigned char)*u); 1789 return get_latin1_char((unsigned char)*u);
1771 1790
1772 /* If not empty and not single character, copy the Unicode data 1791 /* If not empty and not single character, copy the Unicode data
1773 into the new object */ 1792 into the new object */
1774 if (find_maxchar_surrogates(u, u + size, 1793 if (find_maxchar_surrogates(u, u + size,
1775 &maxchar, &num_surrogates) == -1) 1794 &maxchar, &num_surrogates) == -1)
1776 return NULL; 1795 return NULL;
1777 1796
1778 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1797 unicode = PyUnicode_New(size - num_surrogates, maxchar);
1779 if (!unicode) 1798 if (!unicode)
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1847 assert(!id->next); 1866 assert(!id->next);
1848 id->next = static_strings; 1867 id->next = static_strings;
1849 static_strings = id; 1868 static_strings = id;
1850 } 1869 }
1851 return id->object; 1870 return id->object;
1852 } 1871 }
1853 1872
1854 void 1873 void
1855 _PyUnicode_ClearStaticStrings() 1874 _PyUnicode_ClearStaticStrings()
1856 { 1875 {
1857 _Py_Identifier *i; 1876 _Py_Identifier *tmp, *s = static_strings;
1858 for (i = static_strings; i; i = i->next) { 1877 while (s) {
1859 Py_DECREF(i->object); 1878 Py_DECREF(s->object);
1860 i->object = NULL; 1879 s->object = NULL;
1861 i->next = NULL; 1880 tmp = s->next;
1862 } 1881 s->next = NULL;
1882 s = tmp;
1883 }
1884 static_strings = NULL;
1863 } 1885 }
1864 1886
1865 /* Internal function, doesn't check maximum character */ 1887 /* Internal function, doesn't check maximum character */
1866 1888
1867 PyObject* 1889 PyObject*
1868 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1890 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1869 { 1891 {
1870 const unsigned char *s = (const unsigned char *)buffer; 1892 const unsigned char *s = (const unsigned char *)buffer;
1871 PyObject *unicode; 1893 PyObject *unicode;
1872 if (size == 1) { 1894 if (size == 1) {
1873 #ifdef Py_DEBUG 1895 #ifdef Py_DEBUG
1874 assert(s[0] < 128); 1896 assert((unsigned char)s[0] < 128);
1875 #endif 1897 #endif
1876 return get_latin1_char(s[0]); 1898 return get_latin1_char(s[0]);
1877 } 1899 }
1878 unicode = PyUnicode_New(size, 127); 1900 unicode = PyUnicode_New(size, 127);
1879 if (!unicode) 1901 if (!unicode)
1880 return NULL; 1902 return NULL;
1881 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1903 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1882 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1904 assert(_PyUnicode_CheckConsistency(unicode, 1));
1883 return unicode; 1905 return unicode;
1884 } 1906 }
(...skipping 21 matching lines...) Expand all
1906 return 127; 1928 return 127;
1907 else if (maxchar <= 255) 1929 else if (maxchar <= 255)
1908 return 255; 1930 return 255;
1909 else if (maxchar <= 65535) 1931 else if (maxchar <= 65535)
1910 return 65535; 1932 return 65535;
1911 else 1933 else
1912 return MAX_UNICODE; 1934 return MAX_UNICODE;
1913 } 1935 }
1914 1936
1915 static PyObject* 1937 static PyObject*
1916 _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1938 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1917 { 1939 {
1918 PyObject *res; 1940 PyObject *res;
1919 unsigned char max_char; 1941 unsigned char max_char;
1920 1942
1921 if (size == 0) { 1943 if (size == 0)
1922 Py_INCREF(unicode_empty); 1944 _Py_RETURN_UNICODE_EMPTY();
1923 return unicode_empty;
1924 }
1925 assert(size > 0); 1945 assert(size > 0);
1926 if (size == 1) 1946 if (size == 1)
1927 return get_latin1_char(u[0]); 1947 return get_latin1_char(u[0]);
1928 1948
1929 max_char = ucs1lib_find_max_char(u, u + size); 1949 max_char = ucs1lib_find_max_char(u, u + size);
1930 res = PyUnicode_New(size, max_char); 1950 res = PyUnicode_New(size, max_char);
1931 if (!res) 1951 if (!res)
1932 return NULL; 1952 return NULL;
1933 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1953 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1934 assert(_PyUnicode_CheckConsistency(res, 1)); 1954 assert(_PyUnicode_CheckConsistency(res, 1));
1935 return res; 1955 return res;
1936 } 1956 }
1937 1957
1938 static PyObject* 1958 static PyObject*
1939 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1959 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1940 { 1960 {
1941 PyObject *res; 1961 PyObject *res;
1942 Py_UCS2 max_char; 1962 Py_UCS2 max_char;
1943 1963
1944 if (size == 0) { 1964 if (size == 0)
1945 Py_INCREF(unicode_empty); 1965 _Py_RETURN_UNICODE_EMPTY();
1946 return unicode_empty;
1947 }
1948 assert(size > 0); 1966 assert(size > 0);
1949 if (size == 1) { 1967 if (size == 1) {
1950 Py_UCS4 ch = u[0]; 1968 Py_UCS4 ch = u[0];
1969 int kind;
1970 void *data;
1951 if (ch < 256) 1971 if (ch < 256)
1952 return get_latin1_char((unsigned char)ch); 1972 return get_latin1_char((unsigned char)ch);
1953 1973
1954 res = PyUnicode_New(1, ch); 1974 res = PyUnicode_New(1, ch);
1955 if (res == NULL) 1975 if (res == NULL)
1956 return NULL; 1976 return NULL;
1957 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1977 kind = PyUnicode_KIND(res);
1978 data = PyUnicode_DATA(res);
1979 PyUnicode_WRITE(kind, data, 0, ch);
1958 assert(_PyUnicode_CheckConsistency(res, 1)); 1980 assert(_PyUnicode_CheckConsistency(res, 1));
1959 return res; 1981 return res;
1960 } 1982 }
1961 1983
1962 max_char = ucs2lib_find_max_char(u, u + size); 1984 max_char = ucs2lib_find_max_char(u, u + size);
1963 res = PyUnicode_New(size, max_char); 1985 res = PyUnicode_New(size, max_char);
1964 if (!res) 1986 if (!res)
1965 return NULL; 1987 return NULL;
1966 if (max_char >= 256) 1988 if (max_char >= 256)
1967 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1989 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1968 else { 1990 else {
1969 _PyUnicode_CONVERT_BYTES( 1991 _PyUnicode_CONVERT_BYTES(
1970 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1992 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1971 } 1993 }
1972 assert(_PyUnicode_CheckConsistency(res, 1)); 1994 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res; 1995 return res;
1974 } 1996 }
1975 1997
1976 static PyObject* 1998 static PyObject*
1977 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1999 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1978 { 2000 {
1979 PyObject *res; 2001 PyObject *res;
1980 Py_UCS4 max_char; 2002 Py_UCS4 max_char;
1981 2003
1982 if (size == 0) { 2004 if (size == 0)
1983 Py_INCREF(unicode_empty); 2005 _Py_RETURN_UNICODE_EMPTY();
1984 return unicode_empty;
1985 }
1986 assert(size > 0); 2006 assert(size > 0);
1987 if (size == 1) { 2007 if (size == 1) {
1988 Py_UCS4 ch = u[0]; 2008 Py_UCS4 ch = u[0];
2009 int kind;
2010 void *data;
1989 if (ch < 256) 2011 if (ch < 256)
1990 return get_latin1_char((unsigned char)ch); 2012 return get_latin1_char((unsigned char)ch);
1991 2013
1992 res = PyUnicode_New(1, ch); 2014 res = PyUnicode_New(1, ch);
1993 if (res == NULL) 2015 if (res == NULL)
1994 return NULL; 2016 return NULL;
1995 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 2017 kind = PyUnicode_KIND(res);
2018 data = PyUnicode_DATA(res);
2019 PyUnicode_WRITE(kind, data, 0, ch);
1996 assert(_PyUnicode_CheckConsistency(res, 1)); 2020 assert(_PyUnicode_CheckConsistency(res, 1));
1997 return res; 2021 return res;
1998 } 2022 }
1999 2023
2000 max_char = ucs4lib_find_max_char(u, u + size); 2024 max_char = ucs4lib_find_max_char(u, u + size);
2001 res = PyUnicode_New(size, max_char); 2025 res = PyUnicode_New(size, max_char);
2002 if (!res) 2026 if (!res)
2003 return NULL; 2027 return NULL;
2004 if (max_char < 256) 2028 if (max_char < 256)
2005 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
(...skipping 258 matching lines...) Expand 10 before | Expand all | Expand 10 after
2264 2288
2265 Py_UCS4* 2289 Py_UCS4*
2266 PyUnicode_AsUCS4Copy(PyObject *string) 2290 PyUnicode_AsUCS4Copy(PyObject *string)
2267 { 2291 {
2268 return as_ucs4(string, NULL, 0, 1); 2292 return as_ucs4(string, NULL, 0, 1);
2269 } 2293 }
2270 2294
2271 #ifdef HAVE_WCHAR_H 2295 #ifdef HAVE_WCHAR_H
2272 2296
2273 PyObject * 2297 PyObject *
2274 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2298 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2275 { 2299 {
2276 if (w == NULL) { 2300 if (w == NULL) {
2277 if (size == 0) { 2301 if (size == 0)
2278 Py_INCREF(unicode_empty); 2302 _Py_RETURN_UNICODE_EMPTY();
2279 return unicode_empty;
2280 }
2281 PyErr_BadInternalCall(); 2303 PyErr_BadInternalCall();
2282 return NULL; 2304 return NULL;
2283 } 2305 }
2284 2306
2285 if (size == -1) { 2307 if (size == -1) {
2286 size = wcslen(w); 2308 size = wcslen(w);
2287 } 2309 }
2288 2310
2289 return PyUnicode_FromUnicode(w, size); 2311 return PyUnicode_FromUnicode(w, size);
2290 } 2312 }
(...skipping 27 matching lines...) Expand all
2318 } 2340 }
2319 *fmt++ = c; 2341 *fmt++ = c;
2320 *fmt = '\0'; 2342 *fmt = '\0';
2321 } 2343 }
2322 2344
2323 /* maximum number of characters required for output of %lld or %p. 2345 /* maximum number of characters required for output of %lld or %p.
2324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2326 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2348 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2327 2349
2350 static int
2351 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352 Py_ssize_t width, Py_ssize_t precision)
2353 {
2354 Py_ssize_t length, fill, arglen;
2355 Py_UCS4 maxchar;
2356
2357 if (PyUnicode_READY(str) == -1)
2358 return -1;
2359
2360 length = PyUnicode_GET_LENGTH(str);
2361 if ((precision == -1 || precision >= length)
2362 && width <= length)
2363 return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365 if (precision != -1)
2366 length = Py_MIN(precision, length);
2367
2368 arglen = Py_MAX(length, width);
2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371 else
2372 maxchar = writer->maxchar;
2373
2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375 return -1;
2376
2377 if (width > length) {
2378 fill = width - length;
2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380 return -1;
2381 writer->pos += fill;
2382 }
2383
2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385 str, 0, length);
2386 writer->pos += length;
2387 return 0;
2388 }
2389
2390 static int
2391 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392 Py_ssize_t width, Py_ssize_t precision)
2393 {
2394 /* UTF-8 */
2395 Py_ssize_t length;
2396 PyObject *unicode;
2397 int res;
2398
2399 length = strlen(str);
2400 if (precision != -1)
2401 length = Py_MIN(length, precision);
2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403 if (unicode == NULL)
2404 return -1;
2405
2406 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407 Py_DECREF(unicode);
2408 return res;
2409 }
2410
2328 static const char* 2411 static const char*
2329 unicode_fromformat_arg(_PyUnicodeWriter *writer, 2412 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2330 const char *f, va_list *vargs) 2413 const char *f, va_list *vargs)
2331 { 2414 {
2332 const char *p; 2415 const char *p;
2333 Py_ssize_t len; 2416 Py_ssize_t len;
2334 int zeropad; 2417 int zeropad;
2335 int width; 2418 Py_ssize_t width;
2336 int precision; 2419 Py_ssize_t precision;
2337 int longflag; 2420 int longflag;
2338 int longlongflag; 2421 int longlongflag;
2339 int size_tflag; 2422 int size_tflag;
2340 int fill; 2423 Py_ssize_t fill;
2341 2424
2342 p = f; 2425 p = f;
2343 f++; 2426 f++;
2344 zeropad = 0; 2427 zeropad = 0;
2345 if (*f == '0') { 2428 if (*f == '0') {
2346 zeropad = 1; 2429 zeropad = 1;
2347 f++; 2430 f++;
2348 } 2431 }
2349 2432
2350 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2433 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2351 width = 0; 2434 width = -1;
2352 while (Py_ISDIGIT((unsigned)*f)) { 2435 if (Py_ISDIGIT((unsigned)*f)) {
2353 if (width > (INT_MAX - ((int)*f - '0')) / 10) { 2436 width = *f - '0';
2354 PyErr_SetString(PyExc_ValueError,
2355 "width too big");
2356 return NULL;
2357 }
2358 width = (width*10) + (*f - '0');
2359 f++; 2437 f++;
2360 } 2438 while (Py_ISDIGIT((unsigned)*f)) {
2361 precision = 0; 2439 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2440 PyErr_SetString(PyExc_ValueError,
2441 "width too big");
2442 return NULL;
2443 }
2444 width = (width * 10) + (*f - '0');
2445 f++;
2446 }
2447 }
2448 precision = -1;
2362 if (*f == '.') { 2449 if (*f == '.') {
2363 f++; 2450 f++;
2364 while (Py_ISDIGIT((unsigned)*f)) { 2451 if (Py_ISDIGIT((unsigned)*f)) {
2365 if (precision > (INT_MAX - ((int)*f - '0')) / 10) { 2452 precision = (*f - '0');
2366 PyErr_SetString(PyExc_ValueError, 2453 f++;
2367 "precision too big"); 2454 while (Py_ISDIGIT((unsigned)*f)) {
2368 return NULL; 2455 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2456 PyErr_SetString(PyExc_ValueError,
2457 "precision too big");
2458 return NULL;
2459 }
2460 precision = (precision * 10) + (*f - '0');
2461 f++;
2369 } 2462 }
2370 precision = (precision*10) + (*f - '0');
2371 f++;
2372 } 2463 }
2373 if (*f == '%') { 2464 if (*f == '%') {
2374 /* "%.3%s" => f points to "3" */ 2465 /* "%.3%s" => f points to "3" */
2375 f--; 2466 f--;
2376 } 2467 }
2377 } 2468 }
2378 if (*f == '\0') { 2469 if (*f == '\0') {
2379 /* bogus format "%.123" => go backward, f points to "3" */ 2470 /* bogus format "%.123" => go backward, f points to "3" */
2380 f--; 2471 f--;
2381 } 2472 }
(...skipping 22 matching lines...) Expand all
2404 } 2495 }
2405 2496
2406 if (f[1] == '\0') 2497 if (f[1] == '\0')
2407 writer->overallocate = 0; 2498 writer->overallocate = 0;
2408 2499
2409 switch (*f) { 2500 switch (*f) {
2410 case 'c': 2501 case 'c':
2411 { 2502 {
2412 int ordinal = va_arg(*vargs, int); 2503 int ordinal = va_arg(*vargs, int);
2413 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2504 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2414 PyErr_SetString(PyExc_ValueError, 2505 PyErr_SetString(PyExc_OverflowError,
2415 "character argument not in range(0x110000)"); 2506 "character argument not in range(0x110000)");
2416 return NULL; 2507 return NULL;
2417 } 2508 }
2418 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1) 2509 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2419 return NULL; 2510 return NULL;
2420 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2421 writer->pos++;
2422 break; 2511 break;
2423 } 2512 }
2424 2513
2425 case 'i': 2514 case 'i':
2426 case 'd': 2515 case 'd':
2427 case 'u': 2516 case 'u':
2428 case 'x': 2517 case 'x':
2429 { 2518 {
2430 /* used by sprintf */ 2519 /* used by sprintf */
2431 char fmt[10]; /* should be enough for "%0lld\0" */ 2520 char fmt[10]; /* should be enough for "%0lld\0" */
2432 char buffer[MAX_LONG_LONG_CHARS]; 2521 char buffer[MAX_LONG_LONG_CHARS];
2522 Py_ssize_t arglen;
2433 2523
2434 if (*f == 'u') { 2524 if (*f == 'u') {
2435 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2525 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2436 2526
2437 if (longflag) 2527 if (longflag)
2438 len = sprintf(buffer, fmt, 2528 len = sprintf(buffer, fmt,
2439 va_arg(*vargs, unsigned long)); 2529 va_arg(*vargs, unsigned long));
2440 #ifdef HAVE_LONG_LONG 2530 #ifdef HAVE_LONG_LONG
2441 else if (longlongflag) 2531 else if (longlongflag)
2442 len = sprintf(buffer, fmt, 2532 len = sprintf(buffer, fmt,
(...skipping 25 matching lines...) Expand all
2468 len = sprintf(buffer, fmt, 2558 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, Py_ssize_t)); 2559 va_arg(*vargs, Py_ssize_t));
2470 else 2560 else
2471 len = sprintf(buffer, fmt, 2561 len = sprintf(buffer, fmt,
2472 va_arg(*vargs, int)); 2562 va_arg(*vargs, int));
2473 } 2563 }
2474 assert(len >= 0); 2564 assert(len >= 0);
2475 2565
2476 if (precision < len) 2566 if (precision < len)
2477 precision = len; 2567 precision = len;
2568
2569 arglen = Py_MAX(precision, width);
2570 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2571 return NULL;
2572
2478 if (width > precision) { 2573 if (width > precision) {
2479 Py_UCS4 fillchar; 2574 Py_UCS4 fillchar;
2480 fill = width - precision; 2575 fill = width - precision;
2481 fillchar = zeropad?'0':' '; 2576 fillchar = zeropad?'0':' ';
2482 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2483 return NULL;
2484 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2577 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2485 return NULL; 2578 return NULL;
2486 writer->pos += fill; 2579 writer->pos += fill;
2487 } 2580 }
2488 if (precision > len) { 2581 if (precision > len) {
2489 fill = precision - len; 2582 fill = precision - len;
2490 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2491 return NULL;
2492 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2583 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2493 return NULL; 2584 return NULL;
2494 writer->pos += fill; 2585 writer->pos += fill;
2495 } 2586 }
2496 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) 2587
2588 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2497 return NULL; 2589 return NULL;
2498 break; 2590 break;
2499 } 2591 }
2500 2592
2501 case 'p': 2593 case 'p':
2502 { 2594 {
2503 char number[MAX_LONG_LONG_CHARS]; 2595 char number[MAX_LONG_LONG_CHARS];
2504 2596
2505 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2597 len = sprintf(number, "%p", va_arg(*vargs, void*));
2506 assert(len >= 0); 2598 assert(len >= 0);
2507 2599
2508 /* %p is ill-defined: ensure leading 0x. */ 2600 /* %p is ill-defined: ensure leading 0x. */
2509 if (number[1] == 'X') 2601 if (number[1] == 'X')
2510 number[1] = 'x'; 2602 number[1] = 'x';
2511 else if (number[1] != 'x') { 2603 else if (number[1] != 'x') {
2512 memmove(number + 2, number, 2604 memmove(number + 2, number,
2513 strlen(number) + 1); 2605 strlen(number) + 1);
2514 number[0] = '0'; 2606 number[0] = '0';
2515 number[1] = 'x'; 2607 number[1] = 'x';
2516 len += 2; 2608 len += 2;
2517 } 2609 }
2518 2610
2519 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) 2611 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2520 return NULL; 2612 return NULL;
2521 break; 2613 break;
2522 } 2614 }
2523 2615
2524 case 's': 2616 case 's':
2525 { 2617 {
2526 /* UTF-8 */ 2618 /* UTF-8 */
2527 const char *s = va_arg(*vargs, const char*); 2619 const char *s = va_arg(*vargs, const char*);
2528 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2620 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2529 if (!str)
2530 return NULL; 2621 return NULL;
2531 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2532 Py_DECREF(str);
2533 return NULL;
2534 }
2535 Py_DECREF(str);
2536 break; 2622 break;
2537 } 2623 }
2538 2624
2539 case 'U': 2625 case 'U':
2540 { 2626 {
2541 PyObject *obj = va_arg(*vargs, PyObject *); 2627 PyObject *obj = va_arg(*vargs, PyObject *);
2542 assert(obj && _PyUnicode_CHECK(obj)); 2628 assert(obj && _PyUnicode_CHECK(obj));
2543 2629
2544 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2630 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2545 return NULL; 2631 return NULL;
2546 break; 2632 break;
2547 } 2633 }
2548 2634
2549 case 'V': 2635 case 'V':
2550 { 2636 {
2551 PyObject *obj = va_arg(*vargs, PyObject *); 2637 PyObject *obj = va_arg(*vargs, PyObject *);
2552 const char *str = va_arg(*vargs, const char *); 2638 const char *str = va_arg(*vargs, const char *);
2553 PyObject *str_obj;
2554 assert(obj || str);
2555 if (obj) { 2639 if (obj) {
2556 assert(_PyUnicode_CHECK(obj)); 2640 assert(_PyUnicode_CHECK(obj));
2557 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2641 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2558 return NULL; 2642 return NULL;
2559 } 2643 }
2560 else { 2644 else {
2561 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2645 assert(str != NULL);
2562 if (!str_obj) 2646 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2563 return NULL; 2647 return NULL;
2564 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2565 Py_DECREF(str_obj);
2566 return NULL;
2567 }
2568 Py_DECREF(str_obj);
2569 } 2648 }
2570 break; 2649 break;
2571 } 2650 }
2572 2651
2573 case 'S': 2652 case 'S':
2574 { 2653 {
2575 PyObject *obj = va_arg(*vargs, PyObject *); 2654 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *str; 2655 PyObject *str;
2577 assert(obj); 2656 assert(obj);
2578 str = PyObject_Str(obj); 2657 str = PyObject_Str(obj);
2579 if (!str) 2658 if (!str)
2580 return NULL; 2659 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { 2660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2582 Py_DECREF(str); 2661 Py_DECREF(str);
2583 return NULL; 2662 return NULL;
2584 } 2663 }
2585 Py_DECREF(str); 2664 Py_DECREF(str);
2586 break; 2665 break;
2587 } 2666 }
2588 2667
2589 case 'R': 2668 case 'R':
2590 { 2669 {
2591 PyObject *obj = va_arg(*vargs, PyObject *); 2670 PyObject *obj = va_arg(*vargs, PyObject *);
2592 PyObject *repr; 2671 PyObject *repr;
2593 assert(obj); 2672 assert(obj);
2594 repr = PyObject_Repr(obj); 2673 repr = PyObject_Repr(obj);
2595 if (!repr) 2674 if (!repr)
2596 return NULL; 2675 return NULL;
2597 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { 2676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2598 Py_DECREF(repr); 2677 Py_DECREF(repr);
2599 return NULL; 2678 return NULL;
2600 } 2679 }
2601 Py_DECREF(repr); 2680 Py_DECREF(repr);
2602 break; 2681 break;
2603 } 2682 }
2604 2683
2605 case 'A': 2684 case 'A':
2606 { 2685 {
2607 PyObject *obj = va_arg(*vargs, PyObject *); 2686 PyObject *obj = va_arg(*vargs, PyObject *);
2608 PyObject *ascii; 2687 PyObject *ascii;
2609 assert(obj); 2688 assert(obj);
2610 ascii = PyObject_ASCII(obj); 2689 ascii = PyObject_ASCII(obj);
2611 if (!ascii) 2690 if (!ascii)
2612 return NULL; 2691 return NULL;
2613 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { 2692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2614 Py_DECREF(ascii); 2693 Py_DECREF(ascii);
2615 return NULL; 2694 return NULL;
2616 } 2695 }
2617 Py_DECREF(ascii); 2696 Py_DECREF(ascii);
2618 break; 2697 break;
2619 } 2698 }
2620 2699
2621 case '%': 2700 case '%':
2622 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1) 2701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2623 return NULL; 2702 return NULL;
2624 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2625 writer->pos++;
2626 break; 2703 break;
2627 2704
2628 default: 2705 default:
2629 /* if we stumble upon an unknown formatting code, copy the rest 2706 /* if we stumble upon an unknown formatting code, copy the rest
2630 of the format string to the output string. (we cannot just 2707 of the format string to the output string. (we cannot just
2631 skip the code, since there's no way to know what's in the 2708 skip the code, since there's no way to know what's in the
2632 argument list) */ 2709 argument list) */
2633 len = strlen(p); 2710 len = strlen(p);
2634 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) 2711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2635 return NULL; 2712 return NULL;
2636 f = p+len; 2713 f = p+len;
2637 return f; 2714 return f;
2638 } 2715 }
2639 2716
2640 f++; 2717 f++;
2641 return f; 2718 return f;
2642 } 2719 }
2643 2720
2644 PyObject * 2721 PyObject *
2645 PyUnicode_FromFormatV(const char *format, va_list vargs) 2722 PyUnicode_FromFormatV(const char *format, va_list vargs)
2646 { 2723 {
2647 va_list vargs2; 2724 va_list vargs2;
2648 const char *f; 2725 const char *f;
2649 _PyUnicodeWriter writer; 2726 _PyUnicodeWriter writer;
2650 2727
2651 _PyUnicodeWriter_Init(&writer, strlen(format) + 100); 2728 _PyUnicodeWriter_Init(&writer);
2729 writer.min_length = strlen(format) + 100;
2730 writer.overallocate = 1;
2652 2731
2653 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2654 Copy it to be able to pass a reference to a subfunction. */ 2733 Copy it to be able to pass a reference to a subfunction. */
2655 Py_VA_COPY(vargs2, vargs); 2734 Py_VA_COPY(vargs2, vargs);
2656 2735
2657 for (f = format; *f; ) { 2736 for (f = format; *f; ) {
2658 if (*f == '%') { 2737 if (*f == '%') {
2659 f = unicode_fromformat_arg(&writer, f, &vargs2); 2738 f = unicode_fromformat_arg(&writer, f, &vargs2);
2660 if (f == NULL) 2739 if (f == NULL)
2661 goto fail; 2740 goto fail;
(...skipping 12 matching lines...) Expand all
2674 (unsigned char)*p); 2753 (unsigned char)*p);
2675 return NULL; 2754 return NULL;
2676 } 2755 }
2677 p++; 2756 p++;
2678 } 2757 }
2679 while (*p != '\0' && *p != '%'); 2758 while (*p != '\0' && *p != '%');
2680 len = p - f; 2759 len = p - f;
2681 2760
2682 if (*p == '\0') 2761 if (*p == '\0')
2683 writer.overallocate = 0; 2762 writer.overallocate = 0;
2684 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) 2763
2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2685 goto fail; 2765 goto fail;
2686 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2687 writer.pos += len;
2688 2766
2689 f = p; 2767 f = p;
2690 } 2768 }
2691 } 2769 }
2692 return _PyUnicodeWriter_Finish(&writer); 2770 return _PyUnicodeWriter_Finish(&writer);
2693 2771
2694 fail: 2772 fail:
2695 _PyUnicodeWriter_Dealloc(&writer); 2773 _PyUnicodeWriter_Dealloc(&writer);
2696 return NULL; 2774 return NULL;
2697 } 2775 }
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
2793 *size = buflen; 2871 *size = buflen;
2794 return buffer; 2872 return buffer;
2795 } 2873 }
2796 2874
2797 #endif /* HAVE_WCHAR_H */ 2875 #endif /* HAVE_WCHAR_H */
2798 2876
2799 PyObject * 2877 PyObject *
2800 PyUnicode_FromOrdinal(int ordinal) 2878 PyUnicode_FromOrdinal(int ordinal)
2801 { 2879 {
2802 PyObject *v; 2880 PyObject *v;
2881 void *data;
2882 int kind;
2883
2803 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2804 PyErr_SetString(PyExc_ValueError, 2885 PyErr_SetString(PyExc_ValueError,
2805 "chr() arg not in range(0x110000)"); 2886 "chr() arg not in range(0x110000)");
2806 return NULL; 2887 return NULL;
2807 } 2888 }
2808 2889
2809 if (ordinal < 256) 2890 if ((Py_UCS4)ordinal < 256)
2810 return get_latin1_char(ordinal); 2891 return get_latin1_char((unsigned char)ordinal);
2811 2892
2812 v = PyUnicode_New(1, ordinal); 2893 v = PyUnicode_New(1, ordinal);
2813 if (v == NULL) 2894 if (v == NULL)
2814 return NULL; 2895 return NULL;
2815 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
2816 assert(_PyUnicode_CheckConsistency(v, 1)); 2899 assert(_PyUnicode_CheckConsistency(v, 1));
2817 return v; 2900 return v;
2818 } 2901 }
2819 2902
2820 PyObject * 2903 PyObject *
2821 PyUnicode_FromObject(register PyObject *obj) 2904 PyUnicode_FromObject(PyObject *obj)
2822 { 2905 {
2823 /* XXX Perhaps we should make this API an alias of 2906 /* XXX Perhaps we should make this API an alias of
2824 PyObject_Str() instead ?! */ 2907 PyObject_Str() instead ?! */
2825 if (PyUnicode_CheckExact(obj)) { 2908 if (PyUnicode_CheckExact(obj)) {
2826 if (PyUnicode_READY(obj) == -1) 2909 if (PyUnicode_READY(obj) == -1)
2827 return NULL; 2910 return NULL;
2828 Py_INCREF(obj); 2911 Py_INCREF(obj);
2829 return obj; 2912 return obj;
2830 } 2913 }
2831 if (PyUnicode_Check(obj)) { 2914 if (PyUnicode_Check(obj)) {
2832 /* For a Unicode subtype that's not a Unicode object, 2915 /* For a Unicode subtype that's not a Unicode object,
2833 return a true Unicode object with the same data. */ 2916 return a true Unicode object with the same data. */
2834 return _PyUnicode_Copy(obj); 2917 return _PyUnicode_Copy(obj);
2835 } 2918 }
2836 PyErr_Format(PyExc_TypeError, 2919 PyErr_Format(PyExc_TypeError,
2837 "Can't convert '%.100s' object to str implicitly", 2920 "Can't convert '%.100s' object to str implicitly",
2838 Py_TYPE(obj)->tp_name); 2921 Py_TYPE(obj)->tp_name);
2839 return NULL; 2922 return NULL;
2840 } 2923 }
2841 2924
2842 PyObject * 2925 PyObject *
2843 PyUnicode_FromEncodedObject(register PyObject *obj, 2926 PyUnicode_FromEncodedObject(PyObject *obj,
2844 const char *encoding, 2927 const char *encoding,
2845 const char *errors) 2928 const char *errors)
2846 { 2929 {
2847 Py_buffer buffer; 2930 Py_buffer buffer;
2848 PyObject *v; 2931 PyObject *v;
2849 2932
2850 if (obj == NULL) { 2933 if (obj == NULL) {
2851 PyErr_BadInternalCall(); 2934 PyErr_BadInternalCall();
2852 return NULL; 2935 return NULL;
2853 } 2936 }
2854 2937
2855 /* Decoding bytes objects is the most common case and should be fast */ 2938 /* Decoding bytes objects is the most common case and should be fast */
2856 if (PyBytes_Check(obj)) { 2939 if (PyBytes_Check(obj)) {
2857 if (PyBytes_GET_SIZE(obj) == 0) { 2940 if (PyBytes_GET_SIZE(obj) == 0)
2858 Py_INCREF(unicode_empty); 2941 _Py_RETURN_UNICODE_EMPTY();
2859 v = unicode_empty; 2942 v = PyUnicode_Decode(
2860 } 2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2861 else { 2944 encoding, errors);
2862 v = PyUnicode_Decode(
2863 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2864 encoding, errors);
2865 }
2866 return v; 2945 return v;
2867 } 2946 }
2868 2947
2869 if (PyUnicode_Check(obj)) { 2948 if (PyUnicode_Check(obj)) {
2870 PyErr_SetString(PyExc_TypeError, 2949 PyErr_SetString(PyExc_TypeError,
2871 "decoding str is not supported"); 2950 "decoding str is not supported");
2872 return NULL; 2951 return NULL;
2873 } 2952 }
2874 2953
2875 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2876 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2877 PyErr_Format(PyExc_TypeError, 2956 PyErr_Format(PyExc_TypeError,
2878 "coercing to str: need bytes, bytearray " 2957 "coercing to str: need bytes, bytearray "
2879 "or buffer-like object, %.80s found", 2958 "or buffer-like object, %.80s found",
2880 Py_TYPE(obj)->tp_name); 2959 Py_TYPE(obj)->tp_name);
2881 return NULL; 2960 return NULL;
2882 } 2961 }
2883 2962
2884 if (buffer.len == 0) { 2963 if (buffer.len == 0) {
2885 Py_INCREF(unicode_empty); 2964 PyBuffer_Release(&buffer);
2886 v = unicode_empty; 2965 _Py_RETURN_UNICODE_EMPTY();
2887 } 2966 }
2888 else 2967
2889 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2890
2891 PyBuffer_Release(&buffer); 2969 PyBuffer_Release(&buffer);
2892 return v; 2970 return v;
2893 } 2971 }
2894 2972
2895 /* Convert encoding to lower case and replace '_' with '-' in order to 2973 /* Convert encoding to lower case and replace '_' with '-' in order to
2896 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2974 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
2897 1 on success. */ 2975 1 on success. */
2898 static int 2976 int
2899 normalize_encoding(const char *encoding, 2977 _Py_normalize_encoding(const char *encoding,
2900 char *lower, 2978 char *lower,
2901 size_t lower_len) 2979 size_t lower_len)
2902 { 2980 {
2903 const char *e; 2981 const char *e;
2904 char *l; 2982 char *l;
2905 char *l_end; 2983 char *l_end;
2906 2984
2907 if (encoding == NULL) { 2985 if (encoding == NULL) {
2986 /* 6 == strlen("utf-8") + 1 */
2987 if (lower_len < 6)
2988 return 0;
2908 strcpy(lower, "utf-8"); 2989 strcpy(lower, "utf-8");
2909 return 1; 2990 return 1;
2910 } 2991 }
2911 e = encoding; 2992 e = encoding;
2912 l = lower; 2993 l = lower;
2913 l_end = &lower[lower_len - 1]; 2994 l_end = &lower[lower_len - 1];
2914 while (*e) { 2995 while (*e) {
2915 if (l == l_end) 2996 if (l == l_end)
2916 return 0; 2997 return 0;
2917 if (Py_ISUPPER(*e)) { 2998 if (Py_ISUPPER(*e)) {
(...skipping 15 matching lines...) Expand all
2933 PyUnicode_Decode(const char *s, 3014 PyUnicode_Decode(const char *s,
2934 Py_ssize_t size, 3015 Py_ssize_t size,
2935 const char *encoding, 3016 const char *encoding,
2936 const char *errors) 3017 const char *errors)
2937 { 3018 {
2938 PyObject *buffer = NULL, *unicode; 3019 PyObject *buffer = NULL, *unicode;
2939 Py_buffer info; 3020 Py_buffer info;
2940 char lower[11]; /* Enough for any encoding shortcut */ 3021 char lower[11]; /* Enough for any encoding shortcut */
2941 3022
2942 /* Shortcuts for common default encodings */ 3023 /* Shortcuts for common default encodings */
2943 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3024 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
2944 if ((strcmp(lower, "utf-8") == 0) || 3025 if ((strcmp(lower, "utf-8") == 0) ||
2945 (strcmp(lower, "utf8") == 0)) 3026 (strcmp(lower, "utf8") == 0))
2946 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3027 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2947 else if ((strcmp(lower, "latin-1") == 0) || 3028 else if ((strcmp(lower, "latin-1") == 0) ||
2948 (strcmp(lower, "latin1") == 0) || 3029 (strcmp(lower, "latin1") == 0) ||
2949 (strcmp(lower, "iso-8859-1") == 0)) 3030 (strcmp(lower, "iso-8859-1") == 0) ||
3031 (strcmp(lower, "iso8859-1") == 0))
2950 return PyUnicode_DecodeLatin1(s, size, errors); 3032 return PyUnicode_DecodeLatin1(s, size, errors);
2951 #ifdef HAVE_MBCS 3033 #ifdef HAVE_MBCS
2952 else if (strcmp(lower, "mbcs") == 0) 3034 else if (strcmp(lower, "mbcs") == 0)
2953 return PyUnicode_DecodeMBCS(s, size, errors); 3035 return PyUnicode_DecodeMBCS(s, size, errors);
2954 #endif 3036 #endif
2955 else if (strcmp(lower, "ascii") == 0) 3037 else if (strcmp(lower, "ascii") == 0)
2956 return PyUnicode_DecodeASCII(s, size, errors); 3038 return PyUnicode_DecodeASCII(s, size, errors);
2957 else if (strcmp(lower, "utf-16") == 0) 3039 else if (strcmp(lower, "utf-16") == 0)
2958 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3040 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2959 else if (strcmp(lower, "utf-32") == 0) 3041 else if (strcmp(lower, "utf-32") == 0)
2960 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3042 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2961 } 3043 }
2962 3044
2963 /* Decode via the codec registry */ 3045 /* Decode via the codec registry */
2964 buffer = NULL; 3046 buffer = NULL;
2965 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3047 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2966 goto onError; 3048 goto onError;
2967 buffer = PyMemoryView_FromBuffer(&info); 3049 buffer = PyMemoryView_FromBuffer(&info);
2968 if (buffer == NULL) 3050 if (buffer == NULL)
2969 goto onError; 3051 goto onError;
2970 unicode = PyCodec_Decode(buffer, encoding, errors); 3052 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
2971 if (unicode == NULL) 3053 if (unicode == NULL)
2972 goto onError; 3054 goto onError;
2973 if (!PyUnicode_Check(unicode)) { 3055 if (!PyUnicode_Check(unicode)) {
2974 PyErr_Format(PyExc_TypeError, 3056 PyErr_Format(PyExc_TypeError,
2975 "decoder did not return a str object (type=%.400s)", 3057 "'%.400s' decoder returned '%.400s' instead of 'str'; "
2976 Py_TYPE(unicode)->tp_name); 3058 "use codecs.decode() to decode to arbitrary types",
3059 encoding,
3060 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
2977 Py_DECREF(unicode); 3061 Py_DECREF(unicode);
2978 goto onError; 3062 goto onError;
2979 } 3063 }
2980 Py_DECREF(buffer); 3064 Py_DECREF(buffer);
2981 return unicode_result(unicode); 3065 return unicode_result(unicode);
2982 3066
2983 onError: 3067 onError:
2984 Py_XDECREF(buffer); 3068 Py_XDECREF(buffer);
2985 return NULL; 3069 return NULL;
2986 } 3070 }
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
3024 3108
3025 if (encoding == NULL) 3109 if (encoding == NULL)
3026 encoding = PyUnicode_GetDefaultEncoding(); 3110 encoding = PyUnicode_GetDefaultEncoding();
3027 3111
3028 /* Decode via the codec registry */ 3112 /* Decode via the codec registry */
3029 v = PyCodec_Decode(unicode, encoding, errors); 3113 v = PyCodec_Decode(unicode, encoding, errors);
3030 if (v == NULL) 3114 if (v == NULL)
3031 goto onError; 3115 goto onError;
3032 if (!PyUnicode_Check(v)) { 3116 if (!PyUnicode_Check(v)) {
3033 PyErr_Format(PyExc_TypeError, 3117 PyErr_Format(PyExc_TypeError,
3034 "decoder did not return a str object (type=%.400s)", 3118 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3035 Py_TYPE(v)->tp_name); 3119 "use codecs.decode() to decode to arbitrary types",
3120 encoding,
3121 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3036 Py_DECREF(v); 3122 Py_DECREF(v);
3037 goto onError; 3123 goto onError;
3038 } 3124 }
3039 return unicode_result(v); 3125 return unicode_result(v);
3040 3126
3041 onError: 3127 onError:
3042 return NULL; 3128 return NULL;
3043 } 3129 }
3044 3130
3045 PyObject * 3131 PyObject *
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after
3153 return -1; 3239 return -1;
3154 } 3240 }
3155 3241
3156 PyObject * 3242 PyObject *
3157 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3243 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3158 { 3244 {
3159 Py_ssize_t wlen, wlen2; 3245 Py_ssize_t wlen, wlen2;
3160 wchar_t *wstr; 3246 wchar_t *wstr;
3161 PyObject *bytes = NULL; 3247 PyObject *bytes = NULL;
3162 char *errmsg; 3248 char *errmsg;
3163 PyObject *reason; 3249 PyObject *reason = NULL;
3164 PyObject *exc; 3250 PyObject *exc;
3165 size_t error_pos; 3251 size_t error_pos;
3166 int surrogateescape; 3252 int surrogateescape;
3167 3253
3168 if (locale_error_handler(errors, &surrogateescape) < 0) 3254 if (locale_error_handler(errors, &surrogateescape) < 0)
3169 return NULL; 3255 return NULL;
3170 3256
3171 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3257 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3172 if (wstr == NULL) 3258 if (wstr == NULL)
3173 return NULL; 3259 return NULL;
3174 3260
3175 wlen2 = wcslen(wstr); 3261 wlen2 = wcslen(wstr);
3176 if (wlen2 != wlen) { 3262 if (wlen2 != wlen) {
3177 PyMem_Free(wstr); 3263 PyMem_Free(wstr);
3178 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3264 PyErr_SetString(PyExc_TypeError, "embedded null character");
3179 return NULL; 3265 return NULL;
3180 } 3266 }
3181 3267
3182 if (surrogateescape) { 3268 if (surrogateescape) {
3183 /* locale encoding with surrogateescape */ 3269 /* "surrogateescape" error handler */
3184 char *str; 3270 char *str;
3185 3271
3186 str = _Py_wchar2char(wstr, &error_pos); 3272 str = _Py_wchar2char(wstr, &error_pos);
3187 if (str == NULL) { 3273 if (str == NULL) {
3188 if (error_pos == (size_t)-1) { 3274 if (error_pos == (size_t)-1) {
3189 PyErr_NoMemory(); 3275 PyErr_NoMemory();
3190 PyMem_Free(wstr); 3276 PyMem_Free(wstr);
3191 return NULL; 3277 return NULL;
3192 } 3278 }
3193 else { 3279 else {
3194 goto encode_error; 3280 goto encode_error;
3195 } 3281 }
3196 } 3282 }
3197 PyMem_Free(wstr); 3283 PyMem_Free(wstr);
3198 3284
3199 bytes = PyBytes_FromString(str); 3285 bytes = PyBytes_FromString(str);
3200 PyMem_Free(str); 3286 PyMem_Free(str);
3201 } 3287 }
3202 else { 3288 else {
3289 /* strict mode */
3203 size_t len, len2; 3290 size_t len, len2;
3204 3291
3205 len = wcstombs(NULL, wstr, 0); 3292 len = wcstombs(NULL, wstr, 0);
3206 if (len == (size_t)-1) { 3293 if (len == (size_t)-1) {
3207 error_pos = (size_t)-1; 3294 error_pos = (size_t)-1;
3208 goto encode_error; 3295 goto encode_error;
3209 } 3296 }
3210 3297
3211 bytes = PyBytes_FromStringAndSize(NULL, len); 3298 bytes = PyBytes_FromStringAndSize(NULL, len);
3212 if (bytes == NULL) { 3299 if (bytes == NULL) {
(...skipping 18 matching lines...) Expand all
3231 error_pos = wcstombs_errorpos(wstr); 3318 error_pos = wcstombs_errorpos(wstr);
3232 3319
3233 PyMem_Free(wstr); 3320 PyMem_Free(wstr);
3234 Py_XDECREF(bytes); 3321 Py_XDECREF(bytes);
3235 3322
3236 if (errmsg != NULL) { 3323 if (errmsg != NULL) {
3237 size_t errlen; 3324 size_t errlen;
3238 wstr = _Py_char2wchar(errmsg, &errlen); 3325 wstr = _Py_char2wchar(errmsg, &errlen);
3239 if (wstr != NULL) { 3326 if (wstr != NULL) {
3240 reason = PyUnicode_FromWideChar(wstr, errlen); 3327 reason = PyUnicode_FromWideChar(wstr, errlen);
3241 PyMem_Free(wstr); 3328 PyMem_RawFree(wstr);
3242 } else 3329 } else
3243 errmsg = NULL; 3330 errmsg = NULL;
3244 } 3331 }
3245 if (errmsg == NULL) 3332 if (errmsg == NULL)
3246 reason = PyUnicode_FromString( 3333 reason = PyUnicode_FromString(
3247 "wcstombs() encountered an unencodable " 3334 "wcstombs() encountered an unencodable "
3248 "wide character"); 3335 "wide character");
3249 if (reason == NULL) 3336 if (reason == NULL)
3250 return NULL; 3337 return NULL;
3251 3338
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
3298 { 3385 {
3299 PyObject *v; 3386 PyObject *v;
3300 char lower[11]; /* Enough for any encoding shortcut */ 3387 char lower[11]; /* Enough for any encoding shortcut */
3301 3388
3302 if (!PyUnicode_Check(unicode)) { 3389 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument(); 3390 PyErr_BadArgument();
3304 return NULL; 3391 return NULL;
3305 } 3392 }
3306 3393
3307 /* Shortcuts for common default encodings */ 3394 /* Shortcuts for common default encodings */
3308 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3395 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3309 if ((strcmp(lower, "utf-8") == 0) || 3396 if ((strcmp(lower, "utf-8") == 0) ||
3310 (strcmp(lower, "utf8") == 0)) 3397 (strcmp(lower, "utf8") == 0))
3311 { 3398 {
3312 if (errors == NULL || strcmp(errors, "strict") == 0) 3399 if (errors == NULL || strcmp(errors, "strict") == 0)
3313 return _PyUnicode_AsUTF8String(unicode, NULL); 3400 return _PyUnicode_AsUTF8String(unicode, NULL);
3314 else 3401 else
3315 return _PyUnicode_AsUTF8String(unicode, errors); 3402 return _PyUnicode_AsUTF8String(unicode, errors);
3316 } 3403 }
3317 else if ((strcmp(lower, "latin-1") == 0) || 3404 else if ((strcmp(lower, "latin-1") == 0) ||
3318 (strcmp(lower, "latin1") == 0) || 3405 (strcmp(lower, "latin1") == 0) ||
3319 (strcmp(lower, "iso-8859-1") == 0)) 3406 (strcmp(lower, "iso-8859-1") == 0) ||
3407 (strcmp(lower, "iso8859-1") == 0))
3320 return _PyUnicode_AsLatin1String(unicode, errors); 3408 return _PyUnicode_AsLatin1String(unicode, errors);
3321 #ifdef HAVE_MBCS 3409 #ifdef HAVE_MBCS
3322 else if (strcmp(lower, "mbcs") == 0) 3410 else if (strcmp(lower, "mbcs") == 0)
3323 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3411 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3324 #endif 3412 #endif
3325 else if (strcmp(lower, "ascii") == 0) 3413 else if (strcmp(lower, "ascii") == 0)
3326 return _PyUnicode_AsASCIIString(unicode, errors); 3414 return _PyUnicode_AsASCIIString(unicode, errors);
3327 } 3415 }
3328 3416
3329 /* Encode via the codec registry */ 3417 /* Encode via the codec registry */
3330 v = PyCodec_Encode(unicode, encoding, errors); 3418 v = _PyCodec_EncodeText(unicode, encoding, errors);
3331 if (v == NULL) 3419 if (v == NULL)
3332 return NULL; 3420 return NULL;
3333 3421
3334 /* The normal path */ 3422 /* The normal path */
3335 if (PyBytes_Check(v)) 3423 if (PyBytes_Check(v))
3336 return v; 3424 return v;
3337 3425
3338 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3426 /* If the codec returns a buffer, raise a warning and convert to bytes */
3339 if (PyByteArray_Check(v)) { 3427 if (PyByteArray_Check(v)) {
3340 int error; 3428 int error;
3341 PyObject *b; 3429 PyObject *b;
3342 3430
3343 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3431 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3344 "encoder %s returned bytearray instead of bytes", 3432 "encoder %s returned bytearray instead of bytes; "
3433 "use codecs.encode() to encode to arbitrary types",
3345 encoding); 3434 encoding);
3346 if (error) { 3435 if (error) {
3347 Py_DECREF(v); 3436 Py_DECREF(v);
3348 return NULL; 3437 return NULL;
3349 } 3438 }
3350 3439
3351 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3352 Py_DECREF(v); 3441 Py_DECREF(v);
3353 return b; 3442 return b;
3354 } 3443 }
3355 3444
3356 PyErr_Format(PyExc_TypeError, 3445 PyErr_Format(PyExc_TypeError,
3357 "encoder did not return a bytes object (type=%.400s)", 3446 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3358 Py_TYPE(v)->tp_name); 3447 "use codecs.encode() to encode to arbitrary types",
3448 encoding,
3449 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3359 Py_DECREF(v); 3450 Py_DECREF(v);
3360 return NULL; 3451 return NULL;
3361 } 3452 }
3362 3453
3363 PyObject * 3454 PyObject *
3364 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3455 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3365 const char *encoding, 3456 const char *encoding,
3366 const char *errors) 3457 const char *errors)
3367 { 3458 {
3368 PyObject *v; 3459 PyObject *v;
3369 3460
3370 if (!PyUnicode_Check(unicode)) { 3461 if (!PyUnicode_Check(unicode)) {
3371 PyErr_BadArgument(); 3462 PyErr_BadArgument();
3372 goto onError; 3463 goto onError;
3373 } 3464 }
3374 3465
3375 if (encoding == NULL) 3466 if (encoding == NULL)
3376 encoding = PyUnicode_GetDefaultEncoding(); 3467 encoding = PyUnicode_GetDefaultEncoding();
3377 3468
3378 /* Encode via the codec registry */ 3469 /* Encode via the codec registry */
3379 v = PyCodec_Encode(unicode, encoding, errors); 3470 v = PyCodec_Encode(unicode, encoding, errors);
3380 if (v == NULL) 3471 if (v == NULL)
3381 goto onError; 3472 goto onError;
3382 if (!PyUnicode_Check(v)) { 3473 if (!PyUnicode_Check(v)) {
3383 PyErr_Format(PyExc_TypeError, 3474 PyErr_Format(PyExc_TypeError,
3384 "encoder did not return an str object (type=%.400s)", 3475 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3385 Py_TYPE(v)->tp_name); 3476 "use codecs.encode() to encode to arbitrary types",
3477 encoding,
3478 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3386 Py_DECREF(v); 3479 Py_DECREF(v);
3387 goto onError; 3480 goto onError;
3388 } 3481 }
3389 return v; 3482 return v;
3390 3483
3391 onError: 3484 onError:
3392 return NULL; 3485 return NULL;
3393 } 3486 }
3394 3487
3395 static size_t 3488 static size_t
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
3438 PyObject *reason, *exc; 3531 PyObject *reason, *exc;
3439 3532
3440 if (locale_error_handler(errors, &surrogateescape) < 0) 3533 if (locale_error_handler(errors, &surrogateescape) < 0)
3441 return NULL; 3534 return NULL;
3442 3535
3443 if (str[len] != '\0' || len != strlen(str)) { 3536 if (str[len] != '\0' || len != strlen(str)) {
3444 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3537 PyErr_SetString(PyExc_TypeError, "embedded null character");
3445 return NULL; 3538 return NULL;
3446 } 3539 }
3447 3540
3448 if (surrogateescape) 3541 if (surrogateescape) {
3449 { 3542 /* "surrogateescape" error handler */
3450 wstr = _Py_char2wchar(str, &wlen); 3543 wstr = _Py_char2wchar(str, &wlen);
3451 if (wstr == NULL) { 3544 if (wstr == NULL) {
3452 if (wlen == (size_t)-1) 3545 if (wlen == (size_t)-1)
3453 PyErr_NoMemory(); 3546 PyErr_NoMemory();
3454 else 3547 else
3455 PyErr_SetFromErrno(PyExc_OSError); 3548 PyErr_SetFromErrno(PyExc_OSError);
3456 return NULL; 3549 return NULL;
3457 } 3550 }
3458 3551
3459 unicode = PyUnicode_FromWideChar(wstr, wlen); 3552 unicode = PyUnicode_FromWideChar(wstr, wlen);
3460 PyMem_Free(wstr); 3553 PyMem_RawFree(wstr);
3461 } 3554 }
3462 else { 3555 else {
3556 /* strict mode */
3463 #ifndef HAVE_BROKEN_MBSTOWCS 3557 #ifndef HAVE_BROKEN_MBSTOWCS
3464 wlen = mbstowcs(NULL, str, 0); 3558 wlen = mbstowcs(NULL, str, 0);
3465 #else 3559 #else
3466 wlen = len; 3560 wlen = len;
3467 #endif 3561 #endif
3468 if (wlen == (size_t)-1) 3562 if (wlen == (size_t)-1)
3469 goto decode_error; 3563 goto decode_error;
3470 if (wlen+1 <= smallbuf_len) { 3564 if (wlen+1 <= smallbuf_len) {
3471 wstr = smallbuf; 3565 wstr = smallbuf;
3472 } 3566 }
3473 else { 3567 else {
3474 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3568 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3475 return PyErr_NoMemory(); 3569 return PyErr_NoMemory();
3476 3570
3477 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3571 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3478 if (!wstr) 3572 if (!wstr)
3479 return PyErr_NoMemory(); 3573 return PyErr_NoMemory();
3480 } 3574 }
3481 3575
3482 /* This shouldn't fail now */
3483 wlen2 = mbstowcs(wstr, str, wlen+1); 3576 wlen2 = mbstowcs(wstr, str, wlen+1);
3484 if (wlen2 == (size_t)-1) { 3577 if (wlen2 == (size_t)-1) {
3485 if (wstr != smallbuf) 3578 if (wstr != smallbuf)
3486 PyMem_Free(wstr); 3579 PyMem_Free(wstr);
3487 goto decode_error; 3580 goto decode_error;
3488 } 3581 }
3489 #ifdef HAVE_BROKEN_MBSTOWCS 3582 #ifdef HAVE_BROKEN_MBSTOWCS
3490 assert(wlen2 == wlen); 3583 assert(wlen2 == wlen);
3491 #endif 3584 #endif
3492 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3585 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3493 if (wstr != smallbuf) 3586 if (wstr != smallbuf)
3494 PyMem_Free(wstr); 3587 PyMem_Free(wstr);
3495 } 3588 }
3496 return unicode; 3589 return unicode;
3497 3590
3498 decode_error: 3591 decode_error:
3499 errmsg = strerror(errno); 3592 errmsg = strerror(errno);
3500 assert(errmsg != NULL); 3593 assert(errmsg != NULL);
3501 3594
3502 error_pos = mbstowcs_errorpos(str, len); 3595 error_pos = mbstowcs_errorpos(str, len);
3503 if (errmsg != NULL) { 3596 if (errmsg != NULL) {
3504 size_t errlen; 3597 size_t errlen;
3505 wstr = _Py_char2wchar(errmsg, &errlen); 3598 wstr = _Py_char2wchar(errmsg, &errlen);
3506 if (wstr != NULL) { 3599 if (wstr != NULL) {
3507 reason = PyUnicode_FromWideChar(wstr, errlen); 3600 reason = PyUnicode_FromWideChar(wstr, errlen);
3508 PyMem_Free(wstr); 3601 PyMem_RawFree(wstr);
3509 } else 3602 } else
3510 errmsg = NULL; 3603 errmsg = NULL;
3511 } 3604 }
3512 if (errmsg == NULL) 3605 if (errmsg == NULL)
3513 reason = PyUnicode_FromString( 3606 reason = PyUnicode_FromString(
3514 "mbstowcs() encountered an invalid multibyte sequence"); 3607 "mbstowcs() encountered an invalid multibyte sequence");
3515 if (reason == NULL) 3608 if (reason == NULL)
3516 return NULL; 3609 return NULL;
3517 3610
3518 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3611 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
3566 "surrogateescape"); 3659 "surrogateescape");
3567 } 3660 }
3568 else { 3661 else {
3569 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3662 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3570 } 3663 }
3571 #endif 3664 #endif
3572 } 3665 }
3573 3666
3574 3667
3575 int 3668 int
3576 _PyUnicode_HasNULChars(PyObject* s) 3669 _PyUnicode_HasNULChars(PyObject* str)
3577 { 3670 {
3578 static PyObject *nul = NULL; 3671 Py_ssize_t pos;
3579 3672
3580 if (nul == NULL) 3673 if (PyUnicode_READY(str) == -1)
3581 nul = PyUnicode_FromStringAndSize("\0", 1);
3582 if (nul == NULL)
3583 return -1; 3674 return -1;
3584 return PyUnicode_Contains(s, nul); 3675 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3585 } 3676 PyUnicode_GET_LENGTH(str), '\0', 1);
3586 3677 if (pos == -1)
3678 return 0;
3679 else
3680 return 1;
3681 }
3587 3682
3588 int 3683 int
3589 PyUnicode_FSConverter(PyObject* arg, void* addr) 3684 PyUnicode_FSConverter(PyObject* arg, void* addr)
3590 { 3685 {
3591 PyObject *output = NULL; 3686 PyObject *output = NULL;
3592 Py_ssize_t size; 3687 Py_ssize_t size;
3593 void *data; 3688 void *data;
3594 if (arg == NULL) { 3689 if (arg == NULL) {
3595 Py_DECREF(*(PyObject**)addr); 3690 Py_DECREF(*(PyObject**)addr);
3596 return 1; 3691 return 1;
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
3681 if (PyUnicode_READY(unicode) == -1) 3776 if (PyUnicode_READY(unicode) == -1)
3682 return NULL; 3777 return NULL;
3683 3778
3684 if (PyUnicode_UTF8(unicode) == NULL) { 3779 if (PyUnicode_UTF8(unicode) == NULL) {
3685 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3780 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3686 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3781 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3687 if (bytes == NULL) 3782 if (bytes == NULL)
3688 return NULL; 3783 return NULL;
3689 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3784 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3690 if (_PyUnicode_UTF8(unicode) == NULL) { 3785 if (_PyUnicode_UTF8(unicode) == NULL) {
3786 PyErr_NoMemory();
3691 Py_DECREF(bytes); 3787 Py_DECREF(bytes);
3692 return NULL; 3788 return NULL;
3693 } 3789 }
3694 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3790 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3695 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3791 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3696 PyBytes_AS_STRING(bytes), 3792 PyBytes_AS_STRING(bytes),
3697 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3793 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3698 Py_DECREF(bytes); 3794 Py_DECREF(bytes);
3699 } 3795 }
3700 3796
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after
3849 return -1; 3945 return -1;
3850 } 3946 }
3851 if (PyUnicode_READY(unicode) == -1) 3947 if (PyUnicode_READY(unicode) == -1)
3852 return -1; 3948 return -1;
3853 return PyUnicode_GET_LENGTH(unicode); 3949 return PyUnicode_GET_LENGTH(unicode);
3854 } 3950 }
3855 3951
3856 Py_UCS4 3952 Py_UCS4
3857 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3953 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3858 { 3954 {
3955 void *data;
3956 int kind;
3957
3859 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3860 PyErr_BadArgument(); 3959 PyErr_BadArgument();
3861 return (Py_UCS4)-1; 3960 return (Py_UCS4)-1;
3862 } 3961 }
3863 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3962 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3864 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3963 PyErr_SetString(PyExc_IndexError, "string index out of range");
3865 return (Py_UCS4)-1; 3964 return (Py_UCS4)-1;
3866 } 3965 }
3867 return PyUnicode_READ_CHAR(unicode, index); 3966 data = PyUnicode_DATA(unicode);
3967 kind = PyUnicode_KIND(unicode);
3968 return PyUnicode_READ(kind, data, index);
3868 } 3969 }
3869 3970
3870 int 3971 int
3871 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3972 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3872 { 3973 {
3873 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3974 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3874 PyErr_BadArgument(); 3975 PyErr_BadArgument();
3875 return -1; 3976 return -1;
3876 } 3977 }
3877 assert(PyUnicode_IS_READY(unicode)); 3978 assert(PyUnicode_IS_READY(unicode));
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
3916 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4017 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3917 goto onError; 4018 goto onError;
3918 } 4019 }
3919 return; 4020 return;
3920 4021
3921 onError: 4022 onError:
3922 Py_DECREF(*exceptionObject); 4023 Py_DECREF(*exceptionObject);
3923 *exceptionObject = NULL; 4024 *exceptionObject = NULL;
3924 } 4025 }
3925 4026
4027 #ifdef HAVE_MBCS
3926 /* error handling callback helper: 4028 /* error handling callback helper:
3927 build arguments, call the callback and check the arguments, 4029 build arguments, call the callback and check the arguments,
3928 if no exception occurred, copy the replacement to the output 4030 if no exception occurred, copy the replacement to the output
3929 and adjust various state variables. 4031 and adjust various state variables.
3930 return 0 on success, -1 on error 4032 return 0 on success, -1 on error
3931 */ 4033 */
3932 4034
3933 static int 4035 static int
3934 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4036 unicode_decode_call_errorhandler_wchar(
3935 const char *encoding, const char *reason, 4037 const char *errors, PyObject **errorHandler,
3936 const char **input, const char **inend, Py_ssize_t *startinpos, 4038 const char *encoding, const char *reason,
3937 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4039 const char **input, const char **inend, Py_ssize_t *startinpos,
3938 PyObject **output, Py_ssize_t *outpos) 4040 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4041 PyObject **output, Py_ssize_t *outpos)
3939 { 4042 {
3940 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4043 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3941 4044
3942 PyObject *restuple = NULL; 4045 PyObject *restuple = NULL;
3943 PyObject *repunicode = NULL; 4046 PyObject *repunicode = NULL;
3944 Py_ssize_t outsize; 4047 Py_ssize_t outsize;
3945 Py_ssize_t insize; 4048 Py_ssize_t insize;
3946 Py_ssize_t requiredsize; 4049 Py_ssize_t requiredsize;
3947 Py_ssize_t newpos; 4050 Py_ssize_t newpos;
3948 PyObject *inputobj = NULL; 4051 PyObject *inputobj = NULL;
3949 int res = -1; 4052 wchar_t *repwstr;
3950 4053 Py_ssize_t repwlen;
3951 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4054
3952 outsize = PyUnicode_GET_LENGTH(*output); 4055 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
3953 else 4056 outsize = _PyUnicode_WSTR_LENGTH(*output);
3954 outsize = _PyUnicode_WSTR_LENGTH(*output);
3955 4057
3956 if (*errorHandler == NULL) { 4058 if (*errorHandler == NULL) {
3957 *errorHandler = PyCodec_LookupError(errors); 4059 *errorHandler = PyCodec_LookupError(errors);
3958 if (*errorHandler == NULL) 4060 if (*errorHandler == NULL)
3959 goto onError; 4061 goto onError;
3960 } 4062 }
3961 4063
3962 make_decode_exception(exceptionObject, 4064 make_decode_exception(exceptionObject,
3963 encoding, 4065 encoding,
3964 *input, *inend - *input, 4066 *input, *inend - *input,
3965 *startinpos, *endinpos, 4067 *startinpos, *endinpos,
3966 reason); 4068 reason);
3967 if (*exceptionObject == NULL) 4069 if (*exceptionObject == NULL)
3968 goto onError; 4070 goto onError;
3969 4071
3970 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4072 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3971 if (restuple == NULL) 4073 if (restuple == NULL)
3972 goto onError; 4074 goto onError;
3973 if (!PyTuple_Check(restuple)) { 4075 if (!PyTuple_Check(restuple)) {
3974 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4076 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3975 goto onError; 4077 goto onError;
3976 } 4078 }
3977 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4079 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3978 goto onError; 4080 goto onError;
3979 if (PyUnicode_READY(repunicode) == -1)
3980 goto onError;
3981 4081
3982 /* Copy back the bytes variables, which might have been modified by the 4082 /* Copy back the bytes variables, which might have been modified by the
3983 callback */ 4083 callback */
3984 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4084 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3985 if (!inputobj) 4085 if (!inputobj)
3986 goto onError; 4086 goto onError;
3987 if (!PyBytes_Check(inputobj)) { 4087 if (!PyBytes_Check(inputobj)) {
3988 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4088 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3989 } 4089 }
3990 *input = PyBytes_AS_STRING(inputobj); 4090 *input = PyBytes_AS_STRING(inputobj);
3991 insize = PyBytes_GET_SIZE(inputobj); 4091 insize = PyBytes_GET_SIZE(inputobj);
3992 *inend = *input + insize; 4092 *inend = *input + insize;
3993 /* we can DECREF safely, as the exception has another reference, 4093 /* we can DECREF safely, as the exception has another reference,
3994 so the object won't go away. */ 4094 so the object won't go away. */
3995 Py_DECREF(inputobj); 4095 Py_DECREF(inputobj);
3996 4096
3997 if (newpos<0) 4097 if (newpos<0)
3998 newpos = insize+newpos; 4098 newpos = insize+newpos;
3999 if (newpos<0 || newpos>insize) { 4099 if (newpos<0 || newpos>insize) {
4000 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4100 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4001 goto onError; 4101 goto onError;
4002 } 4102 }
4003 4103
4004 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4104 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4005 /* need more space? (at least enough for what we 4105 if (repwstr == NULL)
4006 have+the replacement+the rest of the string (starting 4106 goto onError;
4007 at the new input position), so we won't have to check space 4107 /* need more space? (at least enough for what we
4008 when there are no errors in the rest of the string) */ 4108 have+the replacement+the rest of the string (starting
4009 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4109 at the new input position), so we won't have to check space
4010 requiredsize = *outpos + replen + insize-newpos; 4110 when there are no errors in the rest of the string) */
4011 if (requiredsize > outsize) { 4111 requiredsize = *outpos + repwlen + insize-newpos;
4012 if (requiredsize<2*outsize) 4112 if (requiredsize > outsize) {
4013 requiredsize = 2*outsize; 4113 if (requiredsize < 2*outsize)
4014 if (unicode_resize(output, requiredsize) < 0) 4114 requiredsize = 2*outsize;
4015 goto onError; 4115 if (unicode_resize(output, requiredsize) < 0)
4016 }
4017 if (unicode_widen(output, *outpos,
4018 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4019 goto onError; 4116 goto onError;
4020 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4117 }
4021 *outpos += replen; 4118 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4022 } 4119 *outpos += repwlen;
4023 else { 4120
4024 wchar_t *repwstr;
4025 Py_ssize_t repwlen;
4026 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4027 if (repwstr == NULL)
4028 goto onError;
4029 /* need more space? (at least enough for what we
4030 have+the replacement+the rest of the string (starting
4031 at the new input position), so we won't have to check space
4032 when there are no errors in the rest of the string) */
4033 requiredsize = *outpos + repwlen + insize-newpos;
4034 if (requiredsize > outsize) {
4035 if (requiredsize < 2*outsize)
4036 requiredsize = 2*outsize;
4037 if (unicode_resize(output, requiredsize) < 0)
4038 goto onError;
4039 }
4040 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4041 *outpos += repwlen;
4042 }
4043 *endinpos = newpos; 4121 *endinpos = newpos;
4044 *inptr = *input + newpos; 4122 *inptr = *input + newpos;
4045 4123
4046 /* we made it! */ 4124 /* we made it! */
4047 res = 0; 4125 Py_XDECREF(restuple);
4126 return 0;
4048 4127
4049 onError: 4128 onError:
4050 Py_XDECREF(restuple); 4129 Py_XDECREF(restuple);
4051 return res; 4130 return -1;
4131 }
4132 #endif /* HAVE_MBCS */
4133
4134 static int
4135 unicode_decode_call_errorhandler_writer(
4136 const char *errors, PyObject **errorHandler,
4137 const char *encoding, const char *reason,
4138 const char **input, const char **inend, Py_ssize_t *startinpos,
4139 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4140 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4141 {
4142 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4143
4144 PyObject *restuple = NULL;
4145 PyObject *repunicode = NULL;
4146 Py_ssize_t insize;
4147 Py_ssize_t newpos;
4148 Py_ssize_t replen;
4149 PyObject *inputobj = NULL;
4150
4151 if (*errorHandler == NULL) {
4152 *errorHandler = PyCodec_LookupError(errors);
4153 if (*errorHandler == NULL)
4154 goto onError;
4155 }
4156
4157 make_decode_exception(exceptionObject,
4158 encoding,
4159 *input, *inend - *input,
4160 *startinpos, *endinpos,
4161 reason);
4162 if (*exceptionObject == NULL)
4163 goto onError;
4164
4165 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4166 if (restuple == NULL)
4167 goto onError;
4168 if (!PyTuple_Check(restuple)) {
4169 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4170 goto onError;
4171 }
4172 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4173 goto onError;
4174
4175 /* Copy back the bytes variables, which might have been modified by the
4176 callback */
4177 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4178 if (!inputobj)
4179 goto onError;
4180 if (!PyBytes_Check(inputobj)) {
4181 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4182 }
4183 *input = PyBytes_AS_STRING(inputobj);
4184 insize = PyBytes_GET_SIZE(inputobj);
4185 *inend = *input + insize;
4186 /* we can DECREF safely, as the exception has another reference,
4187 so the object won't go away. */
4188 Py_DECREF(inputobj);
4189
4190 if (newpos<0)
4191 newpos = insize+newpos;
4192 if (newpos<0 || newpos>insize) {
4193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4194 goto onError;
4195 }
4196
4197 if (PyUnicode_READY(repunicode) < 0)
4198 goto onError;
4199 replen = PyUnicode_GET_LENGTH(repunicode);
4200 writer->min_length += replen;
4201 if (replen > 1)
4202 writer->overallocate = 1;
4203 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4204 goto onError;
4205
4206 *endinpos = newpos;
4207 *inptr = *input + newpos;
4208
4209 /* we made it! */
4210 Py_XDECREF(restuple);
4211 return 0;
4212
4213 onError:
4214 Py_XDECREF(restuple);
4215 return -1;
4052 } 4216 }
4053 4217
4054 /* --- UTF-7 Codec -------------------------------------------------------- */ 4218 /* --- UTF-7 Codec -------------------------------------------------------- */
4055 4219
4056 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4220 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4057 4221
4058 /* Three simple macros defining base-64. */ 4222 /* Three simple macros defining base-64. */
4059 4223
4060 /* Is c a base-64 character? */ 4224 /* Is c a base-64 character? */
4061 4225
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
4149 4313
4150 PyObject * 4314 PyObject *
4151 PyUnicode_DecodeUTF7Stateful(const char *s, 4315 PyUnicode_DecodeUTF7Stateful(const char *s,
4152 Py_ssize_t size, 4316 Py_ssize_t size,
4153 const char *errors, 4317 const char *errors,
4154 Py_ssize_t *consumed) 4318 Py_ssize_t *consumed)
4155 { 4319 {
4156 const char *starts = s; 4320 const char *starts = s;
4157 Py_ssize_t startinpos; 4321 Py_ssize_t startinpos;
4158 Py_ssize_t endinpos; 4322 Py_ssize_t endinpos;
4159 Py_ssize_t outpos;
4160 const char *e; 4323 const char *e;
4161 PyObject *unicode; 4324 _PyUnicodeWriter writer;
4162 const char *errmsg = ""; 4325 const char *errmsg = "";
4163 int inShift = 0; 4326 int inShift = 0;
4164 Py_ssize_t shiftOutStart; 4327 Py_ssize_t shiftOutStart;
4165 unsigned int base64bits = 0; 4328 unsigned int base64bits = 0;
4166 unsigned long base64buffer = 0; 4329 unsigned long base64buffer = 0;
4167 Py_UCS4 surrogate = 0; 4330 Py_UCS4 surrogate = 0;
4168 PyObject *errorHandler = NULL; 4331 PyObject *errorHandler = NULL;
4169 PyObject *exc = NULL; 4332 PyObject *exc = NULL;
4170 4333
4171 /* Start off assuming it's all ASCII. Widen later as necessary. */
4172 unicode = PyUnicode_New(size, 127);
4173 if (!unicode)
4174 return NULL;
4175 if (size == 0) { 4334 if (size == 0) {
4176 if (consumed) 4335 if (consumed)
4177 *consumed = 0; 4336 *consumed = 0;
4178 return unicode; 4337 _Py_RETURN_UNICODE_EMPTY();
4179 } 4338 }
4180 4339
4181 shiftOutStart = outpos = 0; 4340 /* Start off assuming it's all ASCII. Widen later as necessary. */
4341 _PyUnicodeWriter_Init(&writer);
4342 writer.min_length = size;
4343
4344 shiftOutStart = 0;
4182 e = s + size; 4345 e = s + size;
4183 4346
4184 while (s < e) { 4347 while (s < e) {
4185 Py_UCS4 ch; 4348 Py_UCS4 ch;
4186 restart: 4349 restart:
4187 ch = (unsigned char) *s; 4350 ch = (unsigned char) *s;
4188 4351
4189 if (inShift) { /* in a base-64 section */ 4352 if (inShift) { /* in a base-64 section */
4190 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4191 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4192 base64bits += 6; 4355 base64bits += 6;
4193 s++; 4356 s++;
4194 if (base64bits >= 16) { 4357 if (base64bits >= 16) {
4195 /* we have enough bits for a UTF-16 value */ 4358 /* we have enough bits for a UTF-16 value */
4196 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4197 base64bits -= 16; 4360 base64bits -= 16;
4198 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4362 assert(outCh <= 0xffff);
4199 if (surrogate) { 4363 if (surrogate) {
4200 /* expecting a second surrogate */ 4364 /* expecting a second surrogate */
4201 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4365 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4202 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4366 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4203 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4367 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4204 goto onError; 4368 goto onError;
4205 surrogate = 0; 4369 surrogate = 0;
4206 continue; 4370 continue;
4207 } 4371 }
4208 else { 4372 else {
4209 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4373 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4210 goto onError; 4374 goto onError;
4211 surrogate = 0; 4375 surrogate = 0;
4212 } 4376 }
4213 } 4377 }
4214 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4378 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4215 /* first surrogate */ 4379 /* first surrogate */
4216 surrogate = outCh; 4380 surrogate = outCh;
4217 } 4381 }
4218 else { 4382 else {
4219 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4383 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4220 goto onError; 4384 goto onError;
4221 } 4385 }
4222 } 4386 }
4223 } 4387 }
4224 else { /* now leaving a base-64 section */ 4388 else { /* now leaving a base-64 section */
4225 inShift = 0; 4389 inShift = 0;
4226 s++; 4390 s++;
4227 if (surrogate) { 4391 if (surrogate) {
4228 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4392 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4229 goto onError; 4393 goto onError;
4230 surrogate = 0; 4394 surrogate = 0;
4231 } 4395 }
4232 if (base64bits > 0) { /* left-over bits */ 4396 if (base64bits > 0) { /* left-over bits */
4233 if (base64bits >= 6) { 4397 if (base64bits >= 6) {
4234 /* We've seen at least one base-64 character */ 4398 /* We've seen at least one base-64 character */
4235 errmsg = "partial character in shift sequence"; 4399 errmsg = "partial character in shift sequence";
4236 goto utf7Error; 4400 goto utf7Error;
4237 } 4401 }
4238 else { 4402 else {
4239 /* Some bits remain; they should be zero */ 4403 /* Some bits remain; they should be zero */
4240 if (base64buffer != 0) { 4404 if (base64buffer != 0) {
4241 errmsg = "non-zero padding bits in shift sequence"; 4405 errmsg = "non-zero padding bits in shift sequence";
4242 goto utf7Error; 4406 goto utf7Error;
4243 } 4407 }
4244 } 4408 }
4245 } 4409 }
4246 if (ch != '-') { 4410 if (ch != '-') {
4247 /* '-' is absorbed; other terminating 4411 /* '-' is absorbed; other terminating
4248 characters are preserved */ 4412 characters are preserved */
4249 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4413 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4250 goto onError; 4414 goto onError;
4251 } 4415 }
4252 } 4416 }
4253 } 4417 }
4254 else if ( ch == '+' ) { 4418 else if ( ch == '+' ) {
4255 startinpos = s-starts; 4419 startinpos = s-starts;
4256 s++; /* consume '+' */ 4420 s++; /* consume '+' */
4257 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4421 if (s < e && *s == '-') { /* '+-' encodes '+' */
4258 s++; 4422 s++;
4259 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4423 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4260 goto onError; 4424 goto onError;
4261 } 4425 }
4262 else { /* begin base64-encoded section */ 4426 else { /* begin base64-encoded section */
4263 inShift = 1; 4427 inShift = 1;
4264 shiftOutStart = outpos; 4428 shiftOutStart = writer.pos;
4265 base64bits = 0; 4429 base64bits = 0;
4430 base64buffer = 0;
4266 } 4431 }
4267 } 4432 }
4268 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4269 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4434 s++;
4435 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4270 goto onError; 4436 goto onError;
4271 s++;
4272 } 4437 }
4273 else { 4438 else {
4274 startinpos = s-starts; 4439 startinpos = s-starts;
4275 s++; 4440 s++;
4276 errmsg = "unexpected special character"; 4441 errmsg = "unexpected special character";
4277 goto utf7Error; 4442 goto utf7Error;
4278 } 4443 }
4279 continue; 4444 continue;
4280 utf7Error: 4445 utf7Error:
4281 endinpos = s-starts; 4446 endinpos = s-starts;
4282 if (unicode_decode_call_errorhandler( 4447 if (unicode_decode_call_errorhandler_writer(
4283 errors, &errorHandler, 4448 errors, &errorHandler,
4284 "utf7", errmsg, 4449 "utf7", errmsg,
4285 &starts, &e, &startinpos, &endinpos, &exc, &s, 4450 &starts, &e, &startinpos, &endinpos, &exc, &s,
4286 &unicode, &outpos)) 4451 &writer))
4287 goto onError; 4452 goto onError;
4288 } 4453 }
4289 4454
4290 /* end of string */ 4455 /* end of string */
4291 4456
4292 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4293 /* if we're in an inconsistent state, that's an error */ 4458 /* if we're in an inconsistent state, that's an error */
4294 if (surrogate || 4459 if (surrogate ||
4295 (base64bits >= 6) || 4460 (base64bits >= 6) ||
4296 (base64bits > 0 && base64buffer != 0)) { 4461 (base64bits > 0 && base64buffer != 0)) {
4297 endinpos = size; 4462 endinpos = size;
4298 if (unicode_decode_call_errorhandler( 4463 if (unicode_decode_call_errorhandler_writer(
4299 errors, &errorHandler, 4464 errors, &errorHandler,
4300 "utf7", "unterminated shift sequence", 4465 "utf7", "unterminated shift sequence",
4301 &starts, &e, &startinpos, &endinpos, &exc, &s, 4466 &starts, &e, &startinpos, &endinpos, &exc, &s,
4302 &unicode, &outpos)) 4467 &writer))
4303 goto onError; 4468 goto onError;
4304 if (s < e) 4469 if (s < e)
4305 goto restart; 4470 goto restart;
4306 } 4471 }
4307 } 4472 }
4308 4473
4309 /* return state */ 4474 /* return state */
4310 if (consumed) { 4475 if (consumed) {
4311 if (inShift) { 4476 if (inShift) {
4312 outpos = shiftOutStart; /* back off output */ 4477 writer.pos = shiftOutStart; /* back off output */
4313 *consumed = startinpos; 4478 *consumed = startinpos;
4314 } 4479 }
4315 else { 4480 else {
4316 *consumed = s-starts; 4481 *consumed = s-starts;
4317 } 4482 }
4318 } 4483 }
4319 4484
4320 if (unicode_resize(&unicode, outpos) < 0)
4321 goto onError;
4322
4323 Py_XDECREF(errorHandler); 4485 Py_XDECREF(errorHandler);
4324 Py_XDECREF(exc); 4486 Py_XDECREF(exc);
4325 return unicode_result(unicode); 4487 return _PyUnicodeWriter_Finish(&writer);
4326 4488
4327 onError: 4489 onError:
4328 Py_XDECREF(errorHandler); 4490 Py_XDECREF(errorHandler);
4329 Py_XDECREF(exc); 4491 Py_XDECREF(exc);
4330 Py_DECREF(unicode); 4492 _PyUnicodeWriter_Dealloc(&writer);
4331 return NULL; 4493 return NULL;
4332 } 4494 }
4333 4495
4334 4496
4335 PyObject * 4497 PyObject *
4336 _PyUnicode_EncodeUTF7(PyObject *str, 4498 _PyUnicode_EncodeUTF7(PyObject *str,
4337 int base64SetO, 4499 int base64SetO,
4338 int base64WhiteSpace, 4500 int base64WhiteSpace,
4339 const char *errors) 4501 const char *errors)
4340 { 4502 {
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
4403 goto encode_char; 4565 goto encode_char;
4404 } 4566 }
4405 } 4567 }
4406 continue; 4568 continue;
4407 encode_char: 4569 encode_char:
4408 if (ch >= 0x10000) { 4570 if (ch >= 0x10000) {
4409 assert(ch <= MAX_UNICODE); 4571 assert(ch <= MAX_UNICODE);
4410 4572
4411 /* code first surrogate */ 4573 /* code first surrogate */
4412 base64bits += 16; 4574 base64bits += 16;
4413 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4575 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4414 while (base64bits >= 6) { 4576 while (base64bits >= 6) {
4415 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4416 base64bits -= 6; 4578 base64bits -= 6;
4417 } 4579 }
4418 /* prepare second surrogate */ 4580 /* prepare second surrogate */
4419 ch = Py_UNICODE_LOW_SURROGATE(ch); 4581 ch = Py_UNICODE_LOW_SURROGATE(ch);
4420 } 4582 }
4421 base64bits += 16; 4583 base64bits += 16;
4422 base64buffer = (base64buffer << 16) | ch; 4584 base64buffer = (base64buffer << 16) | ch;
4423 while (base64bits >= 6) { 4585 while (base64bits >= 6) {
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
4491 #else 4653 #else
4492 # error C 'long' size should be either 4 or 8! 4654 # error C 'long' size should be either 4 or 8!
4493 #endif 4655 #endif
4494 4656
4495 static Py_ssize_t 4657 static Py_ssize_t
4496 ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4658 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4497 { 4659 {
4498 const char *p = start; 4660 const char *p = start;
4499 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4661 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4500 4662
4663 /*
4664 * Issue #17237: m68k is a bit different from most architectures in
4665 * that objects do not use "natural alignment" - for example, int and
4666 * long are only aligned at 2-byte boundaries. Therefore the assert()
4667 * won't work; also, tests have shown that skipping the "optimised
4668 * version" will even speed up m68k.
4669 */
4670 #if !defined(__m68k__)
4501 #if SIZEOF_LONG <= SIZEOF_VOID_P 4671 #if SIZEOF_LONG <= SIZEOF_VOID_P
4502 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4672 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4503 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4673 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4504 /* Fast path, see in STRINGLIB(utf8_decode) for 4674 /* Fast path, see in STRINGLIB(utf8_decode) for
4505 an explanation. */ 4675 an explanation. */
4506 /* Help register allocation */ 4676 /* Help allocation */
4507 register const char *_p = p; 4677 const char *_p = p;
4508 register Py_UCS1 * q = dest; 4678 Py_UCS1 * q = dest;
4509 while (_p < aligned_end) { 4679 while (_p < aligned_end) {
4510 unsigned long value = *(const unsigned long *) _p; 4680 unsigned long value = *(const unsigned long *) _p;
4511 if (value & ASCII_CHAR_MASK) 4681 if (value & ASCII_CHAR_MASK)
4512 break; 4682 break;
4513 *((unsigned long *)q) = value; 4683 *((unsigned long *)q) = value;
4514 _p += SIZEOF_LONG; 4684 _p += SIZEOF_LONG;
4515 q += SIZEOF_LONG; 4685 q += SIZEOF_LONG;
4516 } 4686 }
4517 p = _p; 4687 p = _p;
4518 while (p < end) { 4688 while (p < end) {
4519 if ((unsigned char)*p & 0x80) 4689 if ((unsigned char)*p & 0x80)
4520 break; 4690 break;
4521 *q++ = *p++; 4691 *q++ = *p++;
4522 } 4692 }
4523 return p - start; 4693 return p - start;
4524 } 4694 }
4525 #endif 4695 #endif
4696 #endif
4526 while (p < end) { 4697 while (p < end) {
4527 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4698 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4528 for an explanation. */ 4699 for an explanation. */
4529 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4700 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4530 /* Help register allocation */ 4701 /* Help allocation */
4531 register const char *_p = p; 4702 const char *_p = p;
4532 while (_p < aligned_end) { 4703 while (_p < aligned_end) {
4533 unsigned long value = *(unsigned long *) _p; 4704 unsigned long value = *(unsigned long *) _p;
4534 if (value & ASCII_CHAR_MASK) 4705 if (value & ASCII_CHAR_MASK)
4535 break; 4706 break;
4536 _p += SIZEOF_LONG; 4707 _p += SIZEOF_LONG;
4537 } 4708 }
4538 p = _p; 4709 p = _p;
4539 if (_p == end) 4710 if (_p == end)
4540 break; 4711 break;
4541 } 4712 }
4542 if ((unsigned char)*p & 0x80) 4713 if ((unsigned char)*p & 0x80)
4543 break; 4714 break;
4544 ++p; 4715 ++p;
4545 } 4716 }
4546 memcpy(dest, start, p - start); 4717 memcpy(dest, start, p - start);
4547 return p - start; 4718 return p - start;
4548 } 4719 }
4549 4720
4550 PyObject * 4721 PyObject *
4551 PyUnicode_DecodeUTF8Stateful(const char *s, 4722 PyUnicode_DecodeUTF8Stateful(const char *s,
4552 Py_ssize_t size, 4723 Py_ssize_t size,
4553 const char *errors, 4724 const char *errors,
4554 Py_ssize_t *consumed) 4725 Py_ssize_t *consumed)
4555 { 4726 {
4556 PyObject *unicode; 4727 _PyUnicodeWriter writer;
4557 const char *starts = s; 4728 const char *starts = s;
4558 const char *end = s + size; 4729 const char *end = s + size;
4559 Py_ssize_t outpos;
4560 4730
4561 Py_ssize_t startinpos; 4731 Py_ssize_t startinpos;
4562 Py_ssize_t endinpos; 4732 Py_ssize_t endinpos;
4563 const char *errmsg = ""; 4733 const char *errmsg = "";
4564 PyObject *errorHandler = NULL; 4734 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL; 4735 PyObject *exc = NULL;
4566 4736
4567 if (size == 0) { 4737 if (size == 0) {
4568 if (consumed) 4738 if (consumed)
4569 *consumed = 0; 4739 *consumed = 0;
4570 Py_INCREF(unicode_empty); 4740 _Py_RETURN_UNICODE_EMPTY();
4571 return unicode_empty;
4572 } 4741 }
4573 4742
4574 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4575 if (size == 1 && (unsigned char)s[0] < 128) { 4744 if (size == 1 && (unsigned char)s[0] < 128) {
4576 if (consumed) 4745 if (consumed)
4577 *consumed = 1; 4746 *consumed = 1;
4578 return get_latin1_char((unsigned char)s[0]); 4747 return get_latin1_char((unsigned char)s[0]);
4579 } 4748 }
4580 4749
4581 unicode = PyUnicode_New(size, 127); 4750 _PyUnicodeWriter_Init(&writer);
4582 if (!unicode) 4751 writer.min_length = size;
4583 return NULL; 4752 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4584 4753 goto onError;
4585 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4754
4586 s += outpos; 4755 writer.pos = ascii_decode(s, end, writer.data);
4756 s += writer.pos;
4587 while (s < end) { 4757 while (s < end) {
4588 Py_UCS4 ch; 4758 Py_UCS4 ch;
4589 int kind = PyUnicode_KIND(unicode); 4759 int kind = writer.kind;
4590 if (kind == PyUnicode_1BYTE_KIND) { 4760 if (kind == PyUnicode_1BYTE_KIND) {
4591 if (PyUnicode_IS_ASCII(unicode)) 4761 if (PyUnicode_IS_ASCII(writer.buffer))
4592 ch = asciilib_utf8_decode(&s, end, 4762 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4593 PyUnicode_1BYTE_DATA(unicode), &outpos);
4594 else 4763 else
4595 ch = ucs1lib_utf8_decode(&s, end, 4764 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4596 PyUnicode_1BYTE_DATA(unicode), &outpos);
4597 } else if (kind == PyUnicode_2BYTE_KIND) { 4765 } else if (kind == PyUnicode_2BYTE_KIND) {
4598 ch = ucs2lib_utf8_decode(&s, end, 4766 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4599 PyUnicode_2BYTE_DATA(unicode), &outpos);
4600 } else { 4767 } else {
4601 assert(kind == PyUnicode_4BYTE_KIND); 4768 assert(kind == PyUnicode_4BYTE_KIND);
4602 ch = ucs4lib_utf8_decode(&s, end, 4769 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4603 PyUnicode_4BYTE_DATA(unicode), &outpos);
4604 } 4770 }
4605 4771
4606 switch (ch) { 4772 switch (ch) {
4607 case 0: 4773 case 0:
4608 if (s == end || consumed) 4774 if (s == end || consumed)
4609 goto End; 4775 goto End;
4610 errmsg = "unexpected end of data"; 4776 errmsg = "unexpected end of data";
4611 startinpos = s - starts; 4777 startinpos = s - starts;
4612 endinpos = startinpos + 1; 4778 endinpos = end - starts;
4613 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4614 endinpos++;
4615 break; 4779 break;
4616 case 1: 4780 case 1:
4617 errmsg = "invalid start byte"; 4781 errmsg = "invalid start byte";
4618 startinpos = s - starts; 4782 startinpos = s - starts;
4619 endinpos = startinpos + 1; 4783 endinpos = startinpos + 1;
4620 break; 4784 break;
4621 case 2: 4785 case 2:
4786 case 3:
4787 case 4:
4622 errmsg = "invalid continuation byte"; 4788 errmsg = "invalid continuation byte";
4623 startinpos = s - starts; 4789 startinpos = s - starts;
4624 endinpos = startinpos + 1; 4790 endinpos = startinpos + ch - 1;
4625 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4626 endinpos++;
4627 break; 4791 break;
4628 default: 4792 default:
4629 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4793 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4630 goto onError; 4794 goto onError;
4631 continue; 4795 continue;
4632 } 4796 }
4633 4797
4634 if (unicode_decode_call_errorhandler( 4798 if (unicode_decode_call_errorhandler_writer(
4635 errors, &errorHandler, 4799 errors, &errorHandler,
4636 "utf-8", errmsg, 4800 "utf-8", errmsg,
4637 &starts, &end, &startinpos, &endinpos, &exc, &s, 4801 &starts, &end, &startinpos, &endinpos, &exc, &s,
4638 &unicode, &outpos)) 4802 &writer))
4639 goto onError; 4803 goto onError;
4640 } 4804 }
4641 4805
4642 End: 4806 End:
4643 if (unicode_resize(&unicode, outpos) < 0)
4644 goto onError;
4645
4646 if (consumed) 4807 if (consumed)
4647 *consumed = s - starts; 4808 *consumed = s - starts;
4648 4809
4649 Py_XDECREF(errorHandler); 4810 Py_XDECREF(errorHandler);
4650 Py_XDECREF(exc); 4811 Py_XDECREF(exc);
4651 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4812 return _PyUnicodeWriter_Finish(&writer);
4652 return unicode;
4653 4813
4654 onError: 4814 onError:
4655 Py_XDECREF(errorHandler); 4815 Py_XDECREF(errorHandler);
4656 Py_XDECREF(exc); 4816 Py_XDECREF(exc);
4657 Py_XDECREF(unicode); 4817 _PyUnicodeWriter_Dealloc(&writer);
4658 return NULL; 4818 return NULL;
4659 } 4819 }
4660 4820
4661 #ifdef __APPLE__ 4821 #ifdef __APPLE__
4662 4822
4663 /* Simplified UTF-8 decoder using surrogateescape error handler, 4823 /* Simplified UTF-8 decoder using surrogateescape error handler,
4664 used to decode the command line arguments on Mac OS X. */ 4824 used to decode the command line arguments on Mac OS X.
4825
4826 Return a pointer to a newly allocated wide character string (use
4827 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4665 4828
4666 wchar_t* 4829 wchar_t*
4667 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4830 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4668 { 4831 {
4669 const char *e; 4832 const char *e;
4670 wchar_t *unicode; 4833 wchar_t *unicode;
4671 Py_ssize_t outpos; 4834 Py_ssize_t outpos;
4672 4835
4673 /* Note: size will always be longer than the resulting Unicode 4836 /* Note: size will always be longer than the resulting Unicode
4674 character count */ 4837 character count */
4675 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4838 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4676 PyErr_NoMemory(); 4839 return NULL;
4677 return NULL; 4840 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4678 }
4679 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4680 if (!unicode) 4841 if (!unicode)
4681 return NULL; 4842 return NULL;
4682 4843
4683 /* Unpack UTF-8 encoded data */ 4844 /* Unpack UTF-8 encoded data */
4684 e = s + size; 4845 e = s + size;
4685 outpos = 0; 4846 outpos = 0;
4686 while (s < e) { 4847 while (s < e) {
4687 Py_UCS4 ch; 4848 Py_UCS4 ch;
4688 #if SIZEOF_WCHAR_T == 4 4849 #if SIZEOF_WCHAR_T == 4
4689 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4850 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
4792 PyObject * 4953 PyObject *
4793 PyUnicode_DecodeUTF32Stateful(const char *s, 4954 PyUnicode_DecodeUTF32Stateful(const char *s,
4794 Py_ssize_t size, 4955 Py_ssize_t size,
4795 const char *errors, 4956 const char *errors,
4796 int *byteorder, 4957 int *byteorder,
4797 Py_ssize_t *consumed) 4958 Py_ssize_t *consumed)
4798 { 4959 {
4799 const char *starts = s; 4960 const char *starts = s;
4800 Py_ssize_t startinpos; 4961 Py_ssize_t startinpos;
4801 Py_ssize_t endinpos; 4962 Py_ssize_t endinpos;
4802 Py_ssize_t outpos; 4963 _PyUnicodeWriter writer;
4803 PyObject *unicode;
4804 const unsigned char *q, *e; 4964 const unsigned char *q, *e;
4805 int bo = 0; /* assume native ordering by default */ 4965 int le, bo = 0; /* assume native ordering by default */
4966 const char *encoding;
4806 const char *errmsg = ""; 4967 const char *errmsg = "";
4807 /* Offsets from q for retrieving bytes in the right order. */
4808 #if PY_LITTLE_ENDIAN
4809 int iorder[] = {0, 1, 2, 3};
4810 #else
4811 int iorder[] = {3, 2, 1, 0};
4812 #endif
4813 PyObject *errorHandler = NULL; 4968 PyObject *errorHandler = NULL;
4814 PyObject *exc = NULL; 4969 PyObject *exc = NULL;
4815 4970
4816 q = (unsigned char *)s; 4971 q = (unsigned char *)s;
4817 e = q + size; 4972 e = q + size;
4818 4973
4819 if (byteorder) 4974 if (byteorder)
4820 bo = *byteorder; 4975 bo = *byteorder;
4821 4976
4822 /* Check for BOM marks (U+FEFF) in the input and adjust current 4977 /* Check for BOM marks (U+FEFF) in the input and adjust current
4823 byte order setting accordingly. In native mode, the leading BOM 4978 byte order setting accordingly. In native mode, the leading BOM
4824 mark is skipped, in all other modes, it is copied to the output 4979 mark is skipped, in all other modes, it is copied to the output
4825 stream as-is (giving a ZWNBSP character). */ 4980 stream as-is (giving a ZWNBSP character). */
4826 if (bo == 0) { 4981 if (bo == 0 && size >= 4) {
4827 if (size >= 4) { 4982 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4828 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4983 if (bom == 0x0000FEFF) {
4829 (q[iorder[1]] << 8) | q[iorder[0]]; 4984 bo = -1;
4830 #if PY_LITTLE_ENDIAN 4985 q += 4;
4831 if (bom == 0x0000FEFF) { 4986 }
4987 else if (bom == 0xFFFE0000) {
4988 bo = 1;
4989 q += 4;
4990 }
4991 if (byteorder)
4992 *byteorder = bo;
4993 }
4994
4995 if (q == e) {
4996 if (consumed)
4997 *consumed = size;
4998 _Py_RETURN_UNICODE_EMPTY();
4999 }
5000
5001 #ifdef WORDS_BIGENDIAN
5002 le = bo < 0;
5003 #else
5004 le = bo <= 0;
5005 #endif
5006 encoding = le ? "utf-32-le" : "utf-32-be";
5007
5008 _PyUnicodeWriter_Init(&writer);
5009 writer.min_length = (e - q + 3) / 4;
5010 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5011 goto onError;
5012
5013 while (1) {
5014 Py_UCS4 ch = 0;
5015 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5016
5017 if (e - q >= 4) {
5018 enum PyUnicode_Kind kind = writer.kind;
5019 void *data = writer.data;
5020 const unsigned char *last = e - 4;
5021 Py_ssize_t pos = writer.pos;
5022 if (le) {
5023 do {
5024 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5025 if (ch > maxch)
5026 break;
5027 if (kind != PyUnicode_1BYTE_KIND &&
5028 Py_UNICODE_IS_SURROGATE(ch))
5029 break;
5030 PyUnicode_WRITE(kind, data, pos++, ch);
5031 q += 4;
5032 } while (q <= last);
5033 }
5034 else {
5035 do {
5036 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5037 if (ch > maxch)
5038 break;
5039 if (kind != PyUnicode_1BYTE_KIND &&
5040 Py_UNICODE_IS_SURROGATE(ch))
5041 break;
5042 PyUnicode_WRITE(kind, data, pos++, ch);
5043 q += 4;
5044 } while (q <= last);
5045 }
5046 writer.pos = pos;
5047 }
5048
5049 if (Py_UNICODE_IS_SURROGATE(ch)) {
5050 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5051 startinpos = ((const char *)q) - starts;
5052 endinpos = startinpos + 4;
5053 }
5054 else if (ch <= maxch) {
5055 if (q == e || consumed)
5056 break;
5057 /* remaining bytes at the end? (size should be divisible by 4) */
5058 errmsg = "truncated data";
5059 startinpos = ((const char *)q) - starts;
5060 endinpos = ((const char *)e) - starts;
5061 }
5062 else {
5063 if (ch < 0x110000) {
5064 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5065 goto onError;
4832 q += 4; 5066 q += 4;
4833 bo = -1; 5067 continue;
4834 } 5068 }
4835 else if (bom == 0xFFFE0000) {
4836 q += 4;
4837 bo = 1;
4838 }
4839 #else
4840 if (bom == 0x0000FEFF) {
4841 q += 4;
4842 bo = 1;
4843 }
4844 else if (bom == 0xFFFE0000) {
4845 q += 4;
4846 bo = -1;
4847 }
4848 #endif
4849 }
4850 }
4851
4852 if (bo == -1) {
4853 /* force LE */
4854 iorder[0] = 0;
4855 iorder[1] = 1;
4856 iorder[2] = 2;
4857 iorder[3] = 3;
4858 }
4859 else if (bo == 1) {
4860 /* force BE */
4861 iorder[0] = 3;
4862 iorder[1] = 2;
4863 iorder[2] = 1;
4864 iorder[3] = 0;
4865 }
4866
4867 /* This might be one to much, because of a BOM */
4868 unicode = PyUnicode_New((size+3)/4, 127);
4869 if (!unicode)
4870 return NULL;
4871 if (size == 0)
4872 return unicode;
4873 outpos = 0;
4874
4875 while (q < e) {
4876 Py_UCS4 ch;
4877 /* remaining bytes at the end? (size should be divisible by 4) */
4878 if (e-q<4) {
4879 if (consumed)
4880 break;
4881 errmsg = "truncated data";
4882 startinpos = ((const char *)q)-starts;
4883 endinpos = ((const char *)e)-starts;
4884 goto utf32Error;
4885 /* The remaining input chars are ignored if the callback
4886 chooses to skip the input */
4887 }
4888 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4889 (q[iorder[1]] << 8) | q[iorder[0]];
4890
4891 if (ch >= 0x110000)
4892 {
4893 errmsg = "codepoint not in range(0x110000)"; 5069 errmsg = "codepoint not in range(0x110000)";
4894 startinpos = ((const char *)q)-starts; 5070 startinpos = ((const char *)q) - starts;
4895 endinpos = startinpos+4; 5071 endinpos = startinpos + 4;
4896 goto utf32Error; 5072 }
4897 } 5073
4898 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5074 /* The remaining input chars are ignored if the callback
5075 chooses to skip the input */
5076 if (unicode_decode_call_errorhandler_writer(
5077 errors, &errorHandler,
5078 encoding, errmsg,
5079 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5080 &writer))
4899 goto onError; 5081 goto onError;
4900 q += 4; 5082 }
4901 continue;
4902 utf32Error:
4903 if (unicode_decode_call_errorhandler(
4904 errors, &errorHandler,
4905 "utf32", errmsg,
4906 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4907 &unicode, &outpos))
4908 goto onError;
4909 }
4910
4911 if (byteorder)
4912 *byteorder = bo;
4913 5083
4914 if (consumed) 5084 if (consumed)
4915 *consumed = (const char *)q-starts; 5085 *consumed = (const char *)q-starts;
4916 5086
4917 /* Adjust length */
4918 if (unicode_resize(&unicode, outpos) < 0)
4919 goto onError;
4920
4921 Py_XDECREF(errorHandler); 5087 Py_XDECREF(errorHandler);
4922 Py_XDECREF(exc); 5088 Py_XDECREF(exc);
4923 return unicode_result(unicode); 5089 return _PyUnicodeWriter_Finish(&writer);
4924 5090
4925 onError: 5091 onError:
4926 Py_DECREF(unicode); 5092 _PyUnicodeWriter_Dealloc(&writer);
4927 Py_XDECREF(errorHandler); 5093 Py_XDECREF(errorHandler);
4928 Py_XDECREF(exc); 5094 Py_XDECREF(exc);
4929 return NULL; 5095 return NULL;
4930 } 5096 }
4931 5097
4932 PyObject * 5098 PyObject *
4933 _PyUnicode_EncodeUTF32(PyObject *str, 5099 _PyUnicode_EncodeUTF32(PyObject *str,
4934 const char *errors, 5100 const char *errors,
4935 int byteorder) 5101 int byteorder)
4936 { 5102 {
4937 enum PyUnicode_Kind kind; 5103 enum PyUnicode_Kind kind;
4938 const void *data; 5104 const void *data;
4939 Py_ssize_t len; 5105 Py_ssize_t len;
4940 PyObject *v; 5106 PyObject *v;
4941 PY_UINT32_T *out; 5107 PY_UINT32_T *out;
4942 Py_ssize_t bytesize; 5108 #if PY_LITTLE_ENDIAN
4943 #ifdef WORDS_BIGENDIAN 5109 int native_ordering = byteorder <= 0;
5110 #else
4944 int native_ordering = byteorder >= 0; 5111 int native_ordering = byteorder >= 0;
4945 #else
4946 int native_ordering = byteorder <= 0;
4947 #endif 5112 #endif
5113 const char *encoding;
5114 Py_ssize_t nsize, pos;
5115 PyObject *errorHandler = NULL;
5116 PyObject *exc = NULL;
5117 PyObject *rep = NULL;
4948 5118
4949 if (!PyUnicode_Check(str)) { 5119 if (!PyUnicode_Check(str)) {
4950 PyErr_BadArgument(); 5120 PyErr_BadArgument();
4951 return NULL; 5121 return NULL;
4952 } 5122 }
4953 if (PyUnicode_READY(str) == -1) 5123 if (PyUnicode_READY(str) == -1)
4954 return NULL; 5124 return NULL;
4955 kind = PyUnicode_KIND(str); 5125 kind = PyUnicode_KIND(str);
4956 data = PyUnicode_DATA(str); 5126 data = PyUnicode_DATA(str);
4957 len = PyUnicode_GET_LENGTH(str); 5127 len = PyUnicode_GET_LENGTH(str);
4958 5128
4959 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5129 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
4960 return PyErr_NoMemory(); 5130 return PyErr_NoMemory();
4961 bytesize = (len + (byteorder == 0)) * 4; 5131 nsize = len + (byteorder == 0);
4962 v = PyBytes_FromStringAndSize(NULL, bytesize); 5132 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
4963 if (v == NULL) 5133 if (v == NULL)
4964 return NULL; 5134 return NULL;
4965 5135
4966 /* output buffer is 4-bytes aligned */ 5136 /* output buffer is 4-bytes aligned */
4967 assert((Py_uintptr_t)PyBytes_AS_STRING(v) & 3 == 0); 5137 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
4968 out = (PY_UINT32_T *)PyBytes_AS_STRING(v); 5138 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
4969 if (byteorder == 0) 5139 if (byteorder == 0)
4970 *out++ = 0xFEFF; 5140 *out++ = 0xFEFF;
4971 if (len == 0) 5141 if (len == 0)
4972 goto done; 5142 goto done;
4973 5143
4974 switch (kind) { 5144 if (byteorder == -1)
4975 case PyUnicode_1BYTE_KIND: 5145 encoding = "utf-32-le";
4976 ucs1lib_utf32_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5146 else if (byteorder == 1)
4977 break; 5147 encoding = "utf-32-be";
4978 case PyUnicode_2BYTE_KIND: 5148 else
4979 ucs2lib_utf32_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5149 encoding = "utf-32";
4980 break; 5150
4981 case PyUnicode_4BYTE_KIND: 5151 if (kind == PyUnicode_1BYTE_KIND) {
4982 ucs4lib_utf32_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5152 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
4983 break; 5153 goto done;
4984 default: 5154 }
4985 assert(0); 5155
4986 } 5156 pos = 0;
4987 5157 while (pos < len) {
4988 done: 5158 Py_ssize_t repsize, moreunits;
5159
5160 if (kind == PyUnicode_2BYTE_KIND) {
5161 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5162 &out, native_ordering);
5163 }
5164 else {
5165 assert(kind == PyUnicode_4BYTE_KIND);
5166 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5167 &out, native_ordering);
5168 }
5169 if (pos == len)
5170 break;
5171
5172 rep = unicode_encode_call_errorhandler(
5173 errors, &errorHandler,
5174 encoding, "surrogates not allowed",
5175 str, &exc, pos, pos + 1, &pos);
5176 if (!rep)
5177 goto error;
5178
5179 if (PyBytes_Check(rep)) {
5180 repsize = PyBytes_GET_SIZE(rep);
5181 if (repsize & 3) {
5182 raise_encode_exception(&exc, encoding,
5183 str, pos - 1, pos,
5184 "surrogates not allowed");
5185 goto error;
5186 }
5187 moreunits = repsize / 4;
5188 }
5189 else {
5190 assert(PyUnicode_Check(rep));
5191 if (PyUnicode_READY(rep) < 0)
5192 goto error;
5193 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5194 if (!PyUnicode_IS_ASCII(rep)) {
5195 raise_encode_exception(&exc, encoding,
5196 str, pos - 1, pos,
5197 "surrogates not allowed");
5198 goto error;
5199 }
5200 }
5201
5202 /* four bytes are reserved for each surrogate */
5203 if (moreunits > 1) {
5204 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
5205 Py_ssize_t morebytes = 4 * (moreunits - 1);
5206 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5207 /* integer overflow */
5208 PyErr_NoMemory();
5209 goto error;
5210 }
5211 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5212 goto error;
5213 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
5214 }
5215
5216 if (PyBytes_Check(rep)) {
5217 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5218 out += moreunits;
5219 } else /* rep is unicode */ {
5220 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5221 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5222 &out, native_ordering);
5223 }
5224
5225 Py_CLEAR(rep);
5226 }
5227
5228 /* Cut back to size actually needed. This is necessary, for example,
5229 when encoding a string containing isolated surrogates with the 'ignore'
5230 error handler. */
5231 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5232 if (nsize != PyBytes_GET_SIZE(v))
5233 _PyBytes_Resize(&v, nsize);
5234 Py_XDECREF(errorHandler);
5235 Py_XDECREF(exc);
5236 done:
4989 return v; 5237 return v;
5238 error:
5239 Py_XDECREF(rep);
5240 Py_XDECREF(errorHandler);
5241 Py_XDECREF(exc);
5242 Py_XDECREF(v);
5243 return NULL;
4990 } 5244 }
4991 5245
4992 PyObject * 5246 PyObject *
4993 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5247 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4994 Py_ssize_t size, 5248 Py_ssize_t size,
4995 const char *errors, 5249 const char *errors,
4996 int byteorder) 5250 int byteorder)
4997 { 5251 {
4998 PyObject *result; 5252 PyObject *result;
4999 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5253 PyObject *tmp = PyUnicode_FromUnicode(s, size);
(...skipping 24 matching lines...) Expand all
5024 PyObject * 5278 PyObject *
5025 PyUnicode_DecodeUTF16Stateful(const char *s, 5279 PyUnicode_DecodeUTF16Stateful(const char *s,
5026 Py_ssize_t size, 5280 Py_ssize_t size,
5027 const char *errors, 5281 const char *errors,
5028 int *byteorder, 5282 int *byteorder,
5029 Py_ssize_t *consumed) 5283 Py_ssize_t *consumed)
5030 { 5284 {
5031 const char *starts = s; 5285 const char *starts = s;
5032 Py_ssize_t startinpos; 5286 Py_ssize_t startinpos;
5033 Py_ssize_t endinpos; 5287 Py_ssize_t endinpos;
5034 Py_ssize_t outpos; 5288 _PyUnicodeWriter writer;
5035 PyObject *unicode;
5036 const unsigned char *q, *e; 5289 const unsigned char *q, *e;
5037 int bo = 0; /* assume native ordering by default */ 5290 int bo = 0; /* assume native ordering by default */
5038 int native_ordering; 5291 int native_ordering;
5039 const char *errmsg = ""; 5292 const char *errmsg = "";
5040 PyObject *errorHandler = NULL; 5293 PyObject *errorHandler = NULL;
5041 PyObject *exc = NULL; 5294 PyObject *exc = NULL;
5295 const char *encoding;
5042 5296
5043 q = (unsigned char *)s; 5297 q = (unsigned char *)s;
5044 e = q + size; 5298 e = q + size;
5045 5299
5046 if (byteorder) 5300 if (byteorder)
5047 bo = *byteorder; 5301 bo = *byteorder;
5048 5302
5049 /* Check for BOM marks (U+FEFF) in the input and adjust current 5303 /* Check for BOM marks (U+FEFF) in the input and adjust current
5050 byte order setting accordingly. In native mode, the leading BOM 5304 byte order setting accordingly. In native mode, the leading BOM
5051 mark is skipped, in all other modes, it is copied to the output 5305 mark is skipped, in all other modes, it is copied to the output
5052 stream as-is (giving a ZWNBSP character). */ 5306 stream as-is (giving a ZWNBSP character). */
5053 if (bo == 0 && size >= 2) { 5307 if (bo == 0 && size >= 2) {
5054 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5308 const Py_UCS4 bom = (q[1] << 8) | q[0];
5055 if (bom == 0xFEFF) { 5309 if (bom == 0xFEFF) {
5056 q += 2; 5310 q += 2;
5057 bo = -1; 5311 bo = -1;
5058 } 5312 }
5059 else if (bom == 0xFFFE) { 5313 else if (bom == 0xFFFE) {
5060 q += 2; 5314 q += 2;
5061 bo = 1; 5315 bo = 1;
5062 } 5316 }
5063 if (byteorder) 5317 if (byteorder)
5064 *byteorder = bo; 5318 *byteorder = bo;
5065 } 5319 }
5066 5320
5067 if (q == e) { 5321 if (q == e) {
5068 if (consumed) 5322 if (consumed)
5069 *consumed = size; 5323 *consumed = size;
5070 Py_INCREF(unicode_empty); 5324 _Py_RETURN_UNICODE_EMPTY();
5071 return unicode_empty;
5072 } 5325 }
5073 5326
5074 #if PY_LITTLE_ENDIAN 5327 #if PY_LITTLE_ENDIAN
5075 native_ordering = bo <= 0; 5328 native_ordering = bo <= 0;
5329 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5076 #else 5330 #else
5077 native_ordering = bo >= 0; 5331 native_ordering = bo >= 0;
5332 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5078 #endif 5333 #endif
5079 5334
5080 /* Note: size will always be longer than the resulting Unicode 5335 /* Note: size will always be longer than the resulting Unicode
5081 character count */ 5336 character count */
5082 unicode = PyUnicode_New((e - q + 1) / 2, 127); 5337 _PyUnicodeWriter_Init(&writer);
5083 if (!unicode) 5338 writer.min_length = (e - q + 1) / 2;
5084 return NULL; 5339 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5085 5340 goto onError;
5086 outpos = 0; 5341
5087 while (1) { 5342 while (1) {
5088 Py_UCS4 ch = 0; 5343 Py_UCS4 ch = 0;
5089 if (e - q >= 2) { 5344 if (e - q >= 2) {
5090 int kind = PyUnicode_KIND(unicode); 5345 int kind = writer.kind;
5091 if (kind == PyUnicode_1BYTE_KIND) { 5346 if (kind == PyUnicode_1BYTE_KIND) {
5092 if (PyUnicode_IS_ASCII(unicode)) 5347 if (PyUnicode_IS_ASCII(writer.buffer))
5093 ch = asciilib_utf16_decode(&q, e, 5348 ch = asciilib_utf16_decode(&q, e,
5094 PyUnicode_1BYTE_DATA(unicode), &outpos, 5349 (Py_UCS1*)writer.data, &writer.pos,
5095 native_ordering); 5350 native_ordering);
5096 else 5351 else
5097 ch = ucs1lib_utf16_decode(&q, e, 5352 ch = ucs1lib_utf16_decode(&q, e,
5098 PyUnicode_1BYTE_DATA(unicode), &outpos, 5353 (Py_UCS1*)writer.data, &writer.pos,
5099 native_ordering); 5354 native_ordering);
5100 } else if (kind == PyUnicode_2BYTE_KIND) { 5355 } else if (kind == PyUnicode_2BYTE_KIND) {
5101 ch = ucs2lib_utf16_decode(&q, e, 5356 ch = ucs2lib_utf16_decode(&q, e,
5102 PyUnicode_2BYTE_DATA(unicode), &outpos, 5357 (Py_UCS2*)writer.data, &writer.pos,
5103 native_ordering); 5358 native_ordering);
5104 } else { 5359 } else {
5105 assert(kind == PyUnicode_4BYTE_KIND); 5360 assert(kind == PyUnicode_4BYTE_KIND);
5106 ch = ucs4lib_utf16_decode(&q, e, 5361 ch = ucs4lib_utf16_decode(&q, e,
5107 PyUnicode_4BYTE_DATA(unicode), &outpos, 5362 (Py_UCS4*)writer.data, &writer.pos,
5108 native_ordering); 5363 native_ordering);
5109 } 5364 }
5110 } 5365 }
5111 5366
5112 switch (ch) 5367 switch (ch)
5113 { 5368 {
5114 case 0: 5369 case 0:
5115 /* remaining byte at the end? (size should be even) */ 5370 /* remaining byte at the end? (size should be even) */
5116 if (q == e || consumed) 5371 if (q == e || consumed)
5117 goto End; 5372 goto End;
5118 errmsg = "truncated data"; 5373 errmsg = "truncated data";
5119 startinpos = ((const char *)q) - starts; 5374 startinpos = ((const char *)q) - starts;
5120 endinpos = ((const char *)e) - starts; 5375 endinpos = ((const char *)e) - starts;
5121 break; 5376 break;
5122 /* The remaining input chars are ignored if the callback 5377 /* The remaining input chars are ignored if the callback
5123 chooses to skip the input */ 5378 chooses to skip the input */
5124 case 1: 5379 case 1:
5380 q -= 2;
5381 if (consumed)
5382 goto End;
5125 errmsg = "unexpected end of data"; 5383 errmsg = "unexpected end of data";
5126 startinpos = ((const char *)q) - 2 - starts; 5384 startinpos = ((const char *)q) - starts;
5127 endinpos = ((const char *)e) - starts; 5385 endinpos = ((const char *)e) - starts;
5128 break; 5386 break;
5129 case 2: 5387 case 2:
5130 errmsg = "illegal encoding"; 5388 errmsg = "illegal encoding";
5131 startinpos = ((const char *)q) - 2 - starts; 5389 startinpos = ((const char *)q) - 2 - starts;
5132 endinpos = startinpos + 2; 5390 endinpos = startinpos + 2;
5133 break; 5391 break;
5134 case 3: 5392 case 3:
5135 errmsg = "illegal UTF-16 surrogate"; 5393 errmsg = "illegal UTF-16 surrogate";
5136 startinpos = ((const char *)q) - 4 - starts; 5394 startinpos = ((const char *)q) - 4 - starts;
5137 endinpos = startinpos + 2; 5395 endinpos = startinpos + 2;
5138 break; 5396 break;
5139 default: 5397 default:
5140 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5398 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5141 goto onError; 5399 goto onError;
5142 continue; 5400 continue;
5143 } 5401 }
5144 5402
5145 if (unicode_decode_call_errorhandler( 5403 if (unicode_decode_call_errorhandler_writer(
5146 errors, 5404 errors,
5147 &errorHandler, 5405 &errorHandler,
5148 "utf16", errmsg, 5406 encoding, errmsg,
5149 &starts, 5407 &starts,
5150 (const char **)&e, 5408 (const char **)&e,
5151 &startinpos, 5409 &startinpos,
5152 &endinpos, 5410 &endinpos,
5153 &exc, 5411 &exc,
5154 (const char **)&q, 5412 (const char **)&q,
5155 &unicode, 5413 &writer))
5156 &outpos))
5157 goto onError; 5414 goto onError;
5158 } 5415 }
5159 5416
5160 End: 5417 End:
5161 if (consumed) 5418 if (consumed)
5162 *consumed = (const char *)q-starts; 5419 *consumed = (const char *)q-starts;
5163 5420
5164 /* Adjust length */
5165 if (unicode_resize(&unicode, outpos) < 0)
5166 goto onError;
5167
5168 Py_XDECREF(errorHandler); 5421 Py_XDECREF(errorHandler);
5169 Py_XDECREF(exc); 5422 Py_XDECREF(exc);
5170 return unicode_result(unicode); 5423 return _PyUnicodeWriter_Finish(&writer);
5171 5424
5172 onError: 5425 onError:
5173 Py_DECREF(unicode); 5426 _PyUnicodeWriter_Dealloc(&writer);
5174 Py_XDECREF(errorHandler); 5427 Py_XDECREF(errorHandler);
5175 Py_XDECREF(exc); 5428 Py_XDECREF(exc);
5176 return NULL; 5429 return NULL;
5177 } 5430 }
5178 5431
5179 PyObject * 5432 PyObject *
5180 _PyUnicode_EncodeUTF16(PyObject *str, 5433 _PyUnicode_EncodeUTF16(PyObject *str,
5181 const char *errors, 5434 const char *errors,
5182 int byteorder) 5435 int byteorder)
5183 { 5436 {
5184 enum PyUnicode_Kind kind; 5437 enum PyUnicode_Kind kind;
5185 const void *data; 5438 const void *data;
5186 Py_ssize_t len; 5439 Py_ssize_t len;
5187 PyObject *v; 5440 PyObject *v;
5188 unsigned short *out; 5441 unsigned short *out;
5189 Py_ssize_t bytesize;
5190 Py_ssize_t pairs; 5442 Py_ssize_t pairs;
5191 #if PY_BIG_ENDIAN 5443 #if PY_BIG_ENDIAN
5192 int native_ordering = byteorder >= 0; 5444 int native_ordering = byteorder >= 0;
5193 #else 5445 #else
5194 int native_ordering = byteorder <= 0; 5446 int native_ordering = byteorder <= 0;
5195 #endif 5447 #endif
5448 const char *encoding;
5449 Py_ssize_t nsize, pos;
5450 PyObject *errorHandler = NULL;
5451 PyObject *exc = NULL;
5452 PyObject *rep = NULL;
5196 5453
5197 if (!PyUnicode_Check(str)) { 5454 if (!PyUnicode_Check(str)) {
5198 PyErr_BadArgument(); 5455 PyErr_BadArgument();
5199 return NULL; 5456 return NULL;
5200 } 5457 }
5201 if (PyUnicode_READY(str) == -1) 5458 if (PyUnicode_READY(str) == -1)
5202 return NULL; 5459 return NULL;
5203 kind = PyUnicode_KIND(str); 5460 kind = PyUnicode_KIND(str);
5204 data = PyUnicode_DATA(str); 5461 data = PyUnicode_DATA(str);
5205 len = PyUnicode_GET_LENGTH(str); 5462 len = PyUnicode_GET_LENGTH(str);
5206 5463
5207 pairs = 0; 5464 pairs = 0;
5208 if (kind == PyUnicode_4BYTE_KIND) { 5465 if (kind == PyUnicode_4BYTE_KIND) {
5209 const Py_UCS4 *in = (const Py_UCS4 *)data; 5466 const Py_UCS4 *in = (const Py_UCS4 *)data;
5210 const Py_UCS4 *end = in + len; 5467 const Py_UCS4 *end = in + len;
5211 while (in < end) 5468 while (in < end)
5212 if (*in++ >= 0x10000) 5469 if (*in++ >= 0x10000)
5213 pairs++; 5470 pairs++;
5214 } 5471 }
5215 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5472 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5216 return PyErr_NoMemory(); 5473 return PyErr_NoMemory();
5217 bytesize = (len + pairs + (byteorder == 0)) * 2; 5474 nsize = len + pairs + (byteorder == 0);
5218 v = PyBytes_FromStringAndSize(NULL, bytesize); 5475 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5219 if (v == NULL) 5476 if (v == NULL)
5220 return NULL; 5477 return NULL;
5221 5478
5222 /* output buffer is 2-bytes aligned */ 5479 /* output buffer is 2-bytes aligned */
5223 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5480 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5224 out = (unsigned short *)PyBytes_AS_STRING(v); 5481 out = (unsigned short *)PyBytes_AS_STRING(v);
5225 if (byteorder == 0) 5482 if (byteorder == 0)
5226 *out++ = 0xFEFF; 5483 *out++ = 0xFEFF;
5227 if (len == 0) 5484 if (len == 0)
5228 goto done; 5485 goto done;
5229 5486
5230 switch (kind) { 5487 if (kind == PyUnicode_1BYTE_KIND) {
5231 case PyUnicode_1BYTE_KIND: { 5488 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5232 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5489 goto done;
5233 break; 5490 }
5234 } 5491
5235 case PyUnicode_2BYTE_KIND: { 5492 if (byteorder < 0)
5236 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5493 encoding = "utf-16-le";
5237 break; 5494 else if (byteorder > 0)
5238 } 5495 encoding = "utf-16-be";
5239 case PyUnicode_4BYTE_KIND: { 5496 else
5240 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5497 encoding = "utf-16";
5241 break; 5498
5242 } 5499 pos = 0;
5243 default: 5500 while (pos < len) {
5244 assert(0); 5501 Py_ssize_t repsize, moreunits;
5245 } 5502
5246 5503 if (kind == PyUnicode_2BYTE_KIND) {
5504 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5505 &out, native_ordering);
5506 }
5507 else {
5508 assert(kind == PyUnicode_4BYTE_KIND);
5509 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5510 &out, native_ordering);
5511 }
5512 if (pos == len)
5513 break;
5514
5515 rep = unicode_encode_call_errorhandler(
5516 errors, &errorHandler,
5517 encoding, "surrogates not allowed",
5518 str, &exc, pos, pos + 1, &pos);
5519 if (!rep)
5520 goto error;
5521
5522 if (PyBytes_Check(rep)) {
5523 repsize = PyBytes_GET_SIZE(rep);
5524 if (repsize & 1) {
5525 raise_encode_exception(&exc, encoding,
5526 str, pos - 1, pos,
5527 "surrogates not allowed");
5528 goto error;
5529 }
5530 moreunits = repsize / 2;
5531 }
5532 else {
5533 assert(PyUnicode_Check(rep));
5534 if (PyUnicode_READY(rep) < 0)
5535 goto error;
5536 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5537 if (!PyUnicode_IS_ASCII(rep)) {
5538 raise_encode_exception(&exc, encoding,
5539 str, pos - 1, pos,
5540 "surrogates not allowed");
5541 goto error;
5542 }
5543 }
5544
5545 /* two bytes are reserved for each surrogate */
5546 if (moreunits > 1) {
5547 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5548 Py_ssize_t morebytes = 2 * (moreunits - 1);
5549 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5550 /* integer overflow */
5551 PyErr_NoMemory();
5552 goto error;
5553 }
5554 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5555 goto error;
5556 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5557 }
5558
5559 if (PyBytes_Check(rep)) {
5560 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5561 out += moreunits;
5562 } else /* rep is unicode */ {
5563 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5564 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5565 &out, native_ordering);
5566 }
5567
5568 Py_CLEAR(rep);
5569 }
5570
5571 /* Cut back to size actually needed. This is necessary, for example,
5572 when encoding a string containing isolated surrogates with the 'ignore'
5573 error handler. */
5574 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5575 if (nsize != PyBytes_GET_SIZE(v))
5576 _PyBytes_Resize(&v, nsize);
5577 Py_XDECREF(errorHandler);
5578 Py_XDECREF(exc);
5247 done: 5579 done:
5248 return v; 5580 return v;
5581 error:
5582 Py_XDECREF(rep);
5583 Py_XDECREF(errorHandler);
5584 Py_XDECREF(exc);
5585 Py_XDECREF(v);
5586 return NULL;
5587 #undef STORECHAR
5249 } 5588 }
5250 5589
5251 PyObject * 5590 PyObject *
5252 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5591 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5253 Py_ssize_t size, 5592 Py_ssize_t size,
5254 const char *errors, 5593 const char *errors,
5255 int byteorder) 5594 int byteorder)
5256 { 5595 {
5257 PyObject *result; 5596 PyObject *result;
5258 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after
5329 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5668 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5330 5669
5331 PyObject * 5670 PyObject *
5332 PyUnicode_DecodeUnicodeEscape(const char *s, 5671 PyUnicode_DecodeUnicodeEscape(const char *s,
5333 Py_ssize_t size, 5672 Py_ssize_t size,
5334 const char *errors) 5673 const char *errors)
5335 { 5674 {
5336 const char *starts = s; 5675 const char *starts = s;
5337 Py_ssize_t startinpos; 5676 Py_ssize_t startinpos;
5338 Py_ssize_t endinpos; 5677 Py_ssize_t endinpos;
5339 int j; 5678 _PyUnicodeWriter writer;
5340 PyObject *v;
5341 const char *end; 5679 const char *end;
5342 char* message; 5680 char* message;
5343 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5681 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5344 PyObject *errorHandler = NULL; 5682 PyObject *errorHandler = NULL;
5345 PyObject *exc = NULL; 5683 PyObject *exc = NULL;
5346 Py_ssize_t len; 5684 Py_ssize_t len;
5347 Py_ssize_t i;
5348 5685
5349 len = length_of_escaped_ascii_string(s, size); 5686 len = length_of_escaped_ascii_string(s, size);
5687 if (len == 0)
5688 _Py_RETURN_UNICODE_EMPTY();
5350 5689
5351 /* After length_of_escaped_ascii_string() there are two alternatives, 5690 /* After length_of_escaped_ascii_string() there are two alternatives,
5352 either the string is pure ASCII with named escapes like \n, etc. 5691 either the string is pure ASCII with named escapes like \n, etc.
5353 and we determined its exact size (common case) 5692 and we determined its exact size (common case)
5354 or it contains \x, \u, ... escape sequences. then we create a 5693 or it contains \x, \u, ... escape sequences. then we create a
5355 legacy wchar string and resize it at the end of this function. */ 5694 legacy wchar string and resize it at the end of this function. */
5356 if (len >= 0) { 5695 _PyUnicodeWriter_Init(&writer);
5357 v = PyUnicode_New(len, 127); 5696 if (len > 0) {
5358 if (!v) 5697 writer.min_length = len;
5359 goto onError;
5360 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5361 } 5698 }
5362 else { 5699 else {
5363 /* Escaped strings will always be longer than the resulting 5700 /* Escaped strings will always be longer than the resulting
5364 Unicode string, so we start with size here and then reduce the 5701 Unicode string, so we start with size here and then reduce the
5365 length after conversion to the true value. 5702 length after conversion to the true value.
5366 (but if the error callback returns a long replacement string 5703 (but if the error callback returns a long replacement string
5367 we'll have to allocate more space) */ 5704 we'll have to allocate more space) */
5368 v = PyUnicode_New(size, 127); 5705 writer.min_length = size;
5369 if (!v)
5370 goto onError;
5371 len = size;
5372 } 5706 }
5373 5707
5374 if (size == 0) 5708 if (size == 0)
5375 return v; 5709 return _PyUnicodeWriter_Finish(&writer);
5376 i = 0;
5377 end = s + size; 5710 end = s + size;
5378 5711
5379 while (s < end) { 5712 while (s < end) {
5380 unsigned char c; 5713 unsigned char c;
5381 Py_UCS4 x; 5714 Py_UCS4 x;
5382 int digits; 5715 int digits;
5383 5716
5384 /* The only case in which i == ascii_length is a backslash
5385 followed by a newline. */
5386 assert(i <= len);
5387
5388 /* Non-escape characters are interpreted as Unicode ordinals */ 5717 /* Non-escape characters are interpreted as Unicode ordinals */
5389 if (*s != '\\') { 5718 if (*s != '\\') {
5390 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5719 x = (unsigned char)*s;
5720 s++;
5721 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5391 goto onError; 5722 goto onError;
5392 continue; 5723 continue;
5393 } 5724 }
5394 5725
5395 startinpos = s-starts; 5726 startinpos = s-starts;
5396 /* \ - Escapes */ 5727 /* \ - Escapes */
5397 s++; 5728 s++;
5398 c = *s++; 5729 c = *s++;
5399 if (s > end) 5730 if (s > end)
5400 c = '\0'; /* Invalid after \ */ 5731 c = '\0'; /* Invalid after \ */
5401 5732
5402 /* The only case in which i == ascii_length is a backslash
5403 followed by a newline. */
5404 assert(i < len || (i == len && c == '\n'));
5405
5406 switch (c) { 5733 switch (c) {
5407 5734
5408 /* \x escapes */ 5735 /* \x escapes */
5409 #define WRITECHAR(ch) \ 5736 #define WRITECHAR(ch) \
5410 do { \ 5737 do { \
5411 if (unicode_putchar(&v, &i, ch) < 0) \ 5738 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
5412 goto onError; \ 5739 goto onError; \
5413 }while(0) 5740 } while(0)
5414 5741
5415 case '\n': break; 5742 case '\n': break;
5416 case '\\': WRITECHAR('\\'); break; 5743 case '\\': WRITECHAR('\\'); break;
5417 case '\'': WRITECHAR('\''); break; 5744 case '\'': WRITECHAR('\''); break;
5418 case '\"': WRITECHAR('\"'); break; 5745 case '\"': WRITECHAR('\"'); break;
5419 case 'b': WRITECHAR('\b'); break; 5746 case 'b': WRITECHAR('\b'); break;
5420 /* FF */ 5747 /* FF */
5421 case 'f': WRITECHAR('\014'); break; 5748 case 'f': WRITECHAR('\014'); break;
5422 case 't': WRITECHAR('\t'); break; 5749 case 't': WRITECHAR('\t'); break;
5423 case 'n': WRITECHAR('\n'); break; 5750 case 'n': WRITECHAR('\n'); break;
(...skipping 27 matching lines...) Expand all
5451 digits = 4; 5778 digits = 4;
5452 message = "truncated \\uXXXX escape"; 5779 message = "truncated \\uXXXX escape";
5453 goto hexescape; 5780 goto hexescape;
5454 5781
5455 /* \UXXXXXXXX */ 5782 /* \UXXXXXXXX */
5456 case 'U': 5783 case 'U':
5457 digits = 8; 5784 digits = 8;
5458 message = "truncated \\UXXXXXXXX escape"; 5785 message = "truncated \\UXXXXXXXX escape";
5459 hexescape: 5786 hexescape:
5460 chr = 0; 5787 chr = 0;
5461 if (s+digits>end) { 5788 if (end - s < digits) {
5462 endinpos = size; 5789 /* count only hex digits */
5463 if (unicode_decode_call_errorhandler( 5790 for (; s < end; ++s) {
5464 errors, &errorHandler, 5791 c = (unsigned char)*s;
5465 "unicodeescape", "end of string in escape sequence", 5792 if (!Py_ISXDIGIT(c))
5466 &starts, &end, &startinpos, &endinpos, &exc, &s, 5793 goto error;
5467 &v, &i)) 5794 }
5468 goto onError; 5795 goto error;
5469 goto nextByte;
5470 } 5796 }
5471 for (j = 0; j < digits; ++j) { 5797 for (; digits--; ++s) {
5472 c = (unsigned char) s[j]; 5798 c = (unsigned char)*s;
5473 if (!Py_ISXDIGIT(c)) { 5799 if (!Py_ISXDIGIT(c))
5474 endinpos = (s+j+1)-starts; 5800 goto error;
5475 if (unicode_decode_call_errorhandler(
5476 errors, &errorHandler,
5477 "unicodeescape", message,
5478 &starts, &end, &startinpos, &endinpos, &exc, &s,
5479 &v, &i))
5480 goto onError;
5481 len = PyUnicode_GET_LENGTH(v);
5482 goto nextByte;
5483 }
5484 chr = (chr<<4) & ~0xF; 5801 chr = (chr<<4) & ~0xF;
5485 if (c >= '0' && c <= '9') 5802 if (c >= '0' && c <= '9')
5486 chr += c - '0'; 5803 chr += c - '0';
5487 else if (c >= 'a' && c <= 'f') 5804 else if (c >= 'a' && c <= 'f')
5488 chr += 10 + c - 'a'; 5805 chr += 10 + c - 'a';
5489 else 5806 else
5490 chr += 10 + c - 'A'; 5807 chr += 10 + c - 'A';
5491 } 5808 }
5492 s += j;
5493 if (chr == 0xffffffff && PyErr_Occurred()) 5809 if (chr == 0xffffffff && PyErr_Occurred())
5494 /* _decoding_error will have already written into the 5810 /* _decoding_error will have already written into the
5495 target buffer. */ 5811 target buffer. */
5496 break; 5812 break;
5497 store: 5813 store:
5498 /* when we get here, chr is a 32-bit unicode character */ 5814 /* when we get here, chr is a 32-bit unicode character */
5499 if (chr <= MAX_UNICODE) { 5815 message = "illegal Unicode character";
5500 WRITECHAR(chr); 5816 if (chr > MAX_UNICODE)
5501 } else { 5817 goto error;
5502 endinpos = s-starts; 5818 WRITECHAR(chr);
5503 if (unicode_decode_call_errorhandler(
5504 errors, &errorHandler,
5505 "unicodeescape", "illegal Unicode character",
5506 &starts, &end, &startinpos, &endinpos, &exc, &s,
5507 &v, &i))
5508 goto onError;
5509 }
5510 break; 5819 break;
5511 5820
5512 /* \N{name} */ 5821 /* \N{name} */
5513 case 'N': 5822 case 'N':
5514 message = "malformed \\N character escape"; 5823 message = "malformed \\N character escape";
5515 if (ucnhash_CAPI == NULL) { 5824 if (ucnhash_CAPI == NULL) {
5516 /* load the unicode data module */ 5825 /* load the unicode data module */
5517 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5826 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5518 PyUnicodeData_CAPSULE_NAME, 1); 5827 PyUnicodeData_CAPSULE_NAME, 1);
5519 if (ucnhash_CAPI == NULL) 5828 if (ucnhash_CAPI == NULL)
5520 goto ucnhashError; 5829 goto ucnhashError;
5521 } 5830 }
5522 if (*s == '{') { 5831 if (*s == '{') {
5523 const char *start = s+1; 5832 const char *start = s+1;
5524 /* look for the closing brace */ 5833 /* look for the closing brace */
5525 while (*s != '}' && s < end) 5834 while (*s != '}' && s < end)
5526 s++; 5835 s++;
5527 if (s > start && s < end && *s == '}') { 5836 if (s > start && s < end && *s == '}') {
5528 /* found a name. look it up in the unicode database */ 5837 /* found a name. look it up in the unicode database */
5529 message = "unknown Unicode character name"; 5838 message = "unknown Unicode character name";
5530 s++; 5839 s++;
5531 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5840 if (s - start - 1 <= INT_MAX &&
5841 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5532 &chr, 0)) 5842 &chr, 0))
5533 goto store; 5843 goto store;
5534 } 5844 }
5535 } 5845 }
5536 endinpos = s-starts; 5846 goto error;
5537 if (unicode_decode_call_errorhandler(
5538 errors, &errorHandler,
5539 "unicodeescape", message,
5540 &starts, &end, &startinpos, &endinpos, &exc, &s,
5541 &v, &i))
5542 goto onError;
5543 break;
5544 5847
5545 default: 5848 default:
5546 if (s > end) { 5849 if (s > end) {
5547 message = "\\ at end of string"; 5850 message = "\\ at end of string";
5548 s--; 5851 s--;
5549 endinpos = s-starts; 5852 goto error;
5550 if (unicode_decode_call_errorhandler(
5551 errors, &errorHandler,
5552 "unicodeescape", message,
5553 &starts, &end, &startinpos, &endinpos, &exc, &s,
5554 &v, &i))
5555 goto onError;
5556 } 5853 }
5557 else { 5854 else {
5558 WRITECHAR('\\'); 5855 WRITECHAR('\\');
5559 WRITECHAR(s[-1]); 5856 WRITECHAR((unsigned char)s[-1]);
5560 } 5857 }
5561 break; 5858 break;
5562 } 5859 }
5563 nextByte: 5860 continue;
5564 ; 5861
5862 error:
5863 endinpos = s-starts;
5864 if (unicode_decode_call_errorhandler_writer(
5865 errors, &errorHandler,
5866 "unicodeescape", message,
5867 &starts, &end, &startinpos, &endinpos, &exc, &s,
5868 &writer))
5869 goto onError;
5870 continue;
5565 } 5871 }
5566 #undef WRITECHAR 5872 #undef WRITECHAR
5567 5873
5568 if (unicode_resize(&v, i) < 0)
5569 goto onError;
5570 Py_XDECREF(errorHandler); 5874 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc); 5875 Py_XDECREF(exc);
5572 return unicode_result(v); 5876 return _PyUnicodeWriter_Finish(&writer);
5573 5877
5574 ucnhashError: 5878 ucnhashError:
5575 PyErr_SetString( 5879 PyErr_SetString(
5576 PyExc_UnicodeError, 5880 PyExc_UnicodeError,
5577 "\\N escapes not supported (can't load unicodedata module)" 5881 "\\N escapes not supported (can't load unicodedata module)"
5578 ); 5882 );
5579 Py_XDECREF(v); 5883 _PyUnicodeWriter_Dealloc(&writer);
5580 Py_XDECREF(errorHandler); 5884 Py_XDECREF(errorHandler);
5581 Py_XDECREF(exc); 5885 Py_XDECREF(exc);
5582 return NULL; 5886 return NULL;
5583 5887
5584 onError: 5888 onError:
5585 Py_XDECREF(v); 5889 _PyUnicodeWriter_Dealloc(&writer);
5586 Py_XDECREF(errorHandler); 5890 Py_XDECREF(errorHandler);
5587 Py_XDECREF(exc); 5891 Py_XDECREF(exc);
5588 return NULL; 5892 return NULL;
5589 } 5893 }
5590 5894
5591 /* Return a Unicode-Escape string version of the Unicode object. 5895 /* Return a Unicode-Escape string version of the Unicode object.
5592 5896
5593 If quotes is true, the string is enclosed in u"" or u'' quotes as 5897 If quotes is true, the string is enclosed in u"" or u'' quotes as
5594 appropriate. 5898 appropriate.
5595 5899
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after
5728 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 6032 /* --- Raw Unicode Escape Codec ------------------------------------------- */
5729 6033
5730 PyObject * 6034 PyObject *
5731 PyUnicode_DecodeRawUnicodeEscape(const char *s, 6035 PyUnicode_DecodeRawUnicodeEscape(const char *s,
5732 Py_ssize_t size, 6036 Py_ssize_t size,
5733 const char *errors) 6037 const char *errors)
5734 { 6038 {
5735 const char *starts = s; 6039 const char *starts = s;
5736 Py_ssize_t startinpos; 6040 Py_ssize_t startinpos;
5737 Py_ssize_t endinpos; 6041 Py_ssize_t endinpos;
5738 Py_ssize_t outpos; 6042 _PyUnicodeWriter writer;
5739 PyObject *v;
5740 const char *end; 6043 const char *end;
5741 const char *bs; 6044 const char *bs;
5742 PyObject *errorHandler = NULL; 6045 PyObject *errorHandler = NULL;
5743 PyObject *exc = NULL; 6046 PyObject *exc = NULL;
5744 6047
6048 if (size == 0)
6049 _Py_RETURN_UNICODE_EMPTY();
6050
5745 /* Escaped strings will always be longer than the resulting 6051 /* Escaped strings will always be longer than the resulting
5746 Unicode string, so we start with size here and then reduce the 6052 Unicode string, so we start with size here and then reduce the
5747 length after conversion to the true value. (But decoding error 6053 length after conversion to the true value. (But decoding error
5748 handler might have to resize the string) */ 6054 handler might have to resize the string) */
5749 v = PyUnicode_New(size, 127); 6055 _PyUnicodeWriter_Init(&writer);
5750 if (v == NULL) 6056 writer.min_length = size;
5751 goto onError; 6057
5752 if (size == 0)
5753 return v;
5754 outpos = 0;
5755 end = s + size; 6058 end = s + size;
5756 while (s < end) { 6059 while (s < end) {
5757 unsigned char c; 6060 unsigned char c;
5758 Py_UCS4 x; 6061 Py_UCS4 x;
5759 int i; 6062 int i;
5760 int count; 6063 int count;
5761 6064
5762 /* Non-escape characters are interpreted as Unicode ordinals */ 6065 /* Non-escape characters are interpreted as Unicode ordinals */
5763 if (*s != '\\') { 6066 if (*s != '\\') {
5764 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6067 x = (unsigned char)*s++;
6068 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5765 goto onError; 6069 goto onError;
5766 continue; 6070 continue;
5767 } 6071 }
5768 startinpos = s-starts; 6072 startinpos = s-starts;
5769 6073
5770 /* \u-escapes are only interpreted iff the number of leading 6074 /* \u-escapes are only interpreted iff the number of leading
5771 backslashes if odd */ 6075 backslashes if odd */
5772 bs = s; 6076 bs = s;
5773 for (;s < end;) { 6077 for (;s < end;) {
5774 if (*s != '\\') 6078 if (*s != '\\')
5775 break; 6079 break;
5776 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6080 x = (unsigned char)*s++;
6081 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5777 goto onError; 6082 goto onError;
5778 } 6083 }
5779 if (((s - bs) & 1) == 0 || 6084 if (((s - bs) & 1) == 0 ||
5780 s >= end || 6085 s >= end ||
5781 (*s != 'u' && *s != 'U')) { 6086 (*s != 'u' && *s != 'U')) {
5782 continue; 6087 continue;
5783 } 6088 }
5784 outpos--; 6089 writer.pos--;
5785 count = *s=='u' ? 4 : 8; 6090 count = *s=='u' ? 4 : 8;
5786 s++; 6091 s++;
5787 6092
5788 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6093 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5789 for (x = 0, i = 0; i < count; ++i, ++s) { 6094 for (x = 0, i = 0; i < count; ++i, ++s) {
5790 c = (unsigned char)*s; 6095 c = (unsigned char)*s;
5791 if (!Py_ISXDIGIT(c)) { 6096 if (!Py_ISXDIGIT(c)) {
5792 endinpos = s-starts; 6097 endinpos = s-starts;
5793 if (unicode_decode_call_errorhandler( 6098 if (unicode_decode_call_errorhandler_writer(
5794 errors, &errorHandler, 6099 errors, &errorHandler,
5795 "rawunicodeescape", "truncated \\uXXXX", 6100 "rawunicodeescape", "truncated \\uXXXX",
5796 &starts, &end, &startinpos, &endinpos, &exc, &s, 6101 &starts, &end, &startinpos, &endinpos, &exc, &s,
5797 &v, &outpos)) 6102 &writer))
5798 goto onError; 6103 goto onError;
5799 goto nextByte; 6104 goto nextByte;
5800 } 6105 }
5801 x = (x<<4) & ~0xF; 6106 x = (x<<4) & ~0xF;
5802 if (c >= '0' && c <= '9') 6107 if (c >= '0' && c <= '9')
5803 x += c - '0'; 6108 x += c - '0';
5804 else if (c >= 'a' && c <= 'f') 6109 else if (c >= 'a' && c <= 'f')
5805 x += 10 + c - 'a'; 6110 x += 10 + c - 'a';
5806 else 6111 else
5807 x += 10 + c - 'A'; 6112 x += 10 + c - 'A';
5808 } 6113 }
5809 if (x <= MAX_UNICODE) { 6114 if (x <= MAX_UNICODE) {
5810 if (unicode_putchar(&v, &outpos, x) < 0) 6115 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5811 goto onError; 6116 goto onError;
5812 } else { 6117 }
6118 else {
5813 endinpos = s-starts; 6119 endinpos = s-starts;
5814 if (unicode_decode_call_errorhandler( 6120 if (unicode_decode_call_errorhandler_writer(
5815 errors, &errorHandler, 6121 errors, &errorHandler,
5816 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6122 "rawunicodeescape", "\\Uxxxxxxxx out of range",
5817 &starts, &end, &startinpos, &endinpos, &exc, &s, 6123 &starts, &end, &startinpos, &endinpos, &exc, &s,
5818 &v, &outpos)) 6124 &writer))
5819 goto onError; 6125 goto onError;
5820 } 6126 }
5821 nextByte: 6127 nextByte:
5822 ; 6128 ;
5823 } 6129 }
5824 if (unicode_resize(&v, outpos) < 0)
5825 goto onError;
5826 Py_XDECREF(errorHandler); 6130 Py_XDECREF(errorHandler);
5827 Py_XDECREF(exc); 6131 Py_XDECREF(exc);
5828 return unicode_result(v); 6132 return _PyUnicodeWriter_Finish(&writer);
5829 6133
5830 onError: 6134 onError:
5831 Py_XDECREF(v); 6135 _PyUnicodeWriter_Dealloc(&writer);
5832 Py_XDECREF(errorHandler); 6136 Py_XDECREF(errorHandler);
5833 Py_XDECREF(exc); 6137 Py_XDECREF(exc);
5834 return NULL; 6138 return NULL;
5835 } 6139 }
5836 6140
5837 6141
5838 PyObject * 6142 PyObject *
5839 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6143 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5840 { 6144 {
5841 PyObject *repr; 6145 PyObject *repr;
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
5921 /* --- Unicode Internal Codec ------------------------------------------- */ 6225 /* --- Unicode Internal Codec ------------------------------------------- */
5922 6226
5923 PyObject * 6227 PyObject *
5924 _PyUnicode_DecodeUnicodeInternal(const char *s, 6228 _PyUnicode_DecodeUnicodeInternal(const char *s,
5925 Py_ssize_t size, 6229 Py_ssize_t size,
5926 const char *errors) 6230 const char *errors)
5927 { 6231 {
5928 const char *starts = s; 6232 const char *starts = s;
5929 Py_ssize_t startinpos; 6233 Py_ssize_t startinpos;
5930 Py_ssize_t endinpos; 6234 Py_ssize_t endinpos;
5931 Py_ssize_t outpos; 6235 _PyUnicodeWriter writer;
5932 PyObject *v;
5933 const char *end; 6236 const char *end;
5934 const char *reason; 6237 const char *reason;
5935 PyObject *errorHandler = NULL; 6238 PyObject *errorHandler = NULL;
5936 PyObject *exc = NULL; 6239 PyObject *exc = NULL;
5937 6240
5938 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6241 if (PyErr_WarnEx(PyExc_DeprecationWarning,
5939 "unicode_internal codec has been deprecated", 6242 "unicode_internal codec has been deprecated",
5940 1)) 6243 1))
5941 return NULL; 6244 return NULL;
5942 6245
5943 /* XXX overflow detection missing */ 6246 if (size == 0)
5944 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6247 _Py_RETURN_UNICODE_EMPTY();
5945 if (v == NULL) 6248
6249 _PyUnicodeWriter_Init(&writer);
6250 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6251 PyErr_NoMemory();
5946 goto onError; 6252 goto onError;
5947 if (PyUnicode_GET_LENGTH(v) == 0) 6253 }
5948 return v; 6254 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
5949 outpos = 0; 6255
5950 end = s + size; 6256 end = s + size;