Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(41380)

Delta Between Two Patch Sets: Objects/unicodeobject.c

Issue 15027: Faster UTF-32 encoding
Left Patch Set: Created 6 years, 11 months ago
Right Patch Set: Created 5 years, 5 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« Objects/stringlib/codecs.h ('K') | « Objects/stringlib/codecs.h ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT RIGHT
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>. 4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 5
6 Major speed upgrades to the method implementations at the Reykjavik 6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 8
9 Copyright (c) Corporation for National Research Initiatives. 9 Copyright (c) Corporation for National Research Initiatives.
10 10
(...skipping 29 matching lines...) Expand all
40 40
41 #define PY_SSIZE_T_CLEAN 41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h" 42 #include "Python.h"
43 #include "ucnhash.h" 43 #include "ucnhash.h"
44 #include "bytes_methods.h" 44 #include "bytes_methods.h"
45 45
46 #ifdef MS_WINDOWS 46 #ifdef MS_WINDOWS
47 #include <windows.h> 47 #include <windows.h>
48 #endif 48 #endif
49 49
50 /* Endianness switches; defaults to little endian */ 50 /*[clinic]
51 51 class str
52 #ifdef WORDS_BIGENDIAN 52 [clinic]*/
53 # define BYTEORDER_IS_BIG_ENDIAN 53 /*[clinic checksum: da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
54 #else
55 # define BYTEORDER_IS_LITTLE_ENDIAN
56 #endif
57 54
58 /* --- Globals ------------------------------------------------------------ 55 /* --- Globals ------------------------------------------------------------
59 56
60 The globals are initialized by the _PyUnicode_Init() API and should 57 NOTE: In the interpreter's initialization phase, some globals are currently
61 not be used before calling that API. 58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
62 60
63 */ 61 */
64 62
65 63
66 #ifdef __cplusplus 64 #ifdef __cplusplus
67 extern "C" { 65 extern "C" {
68 #endif 66 #endif
69 67
70 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 68 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71 #define MAX_UNICODE 0x10ffff 69 #define MAX_UNICODE 0x10ffff
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 (((PyASCIIObject *)(op))->hash) 102 (((PyASCIIObject *)(op))->hash)
105 #define _PyUnicode_KIND(op) \ 103 #define _PyUnicode_KIND(op) \
106 (assert(_PyUnicode_CHECK(op)), \ 104 (assert(_PyUnicode_CHECK(op)), \
107 ((PyASCIIObject *)(op))->state.kind) 105 ((PyASCIIObject *)(op))->state.kind)
108 #define _PyUnicode_GET_LENGTH(op) \ 106 #define _PyUnicode_GET_LENGTH(op) \
109 (assert(_PyUnicode_CHECK(op)), \ 107 (assert(_PyUnicode_CHECK(op)), \
110 ((PyASCIIObject *)(op))->length) 108 ((PyASCIIObject *)(op))->length)
111 #define _PyUnicode_DATA_ANY(op) \ 109 #define _PyUnicode_DATA_ANY(op) \
112 (((PyUnicodeObject*)(op))->data.any) 110 (((PyUnicodeObject*)(op))->data.any)
113 111
114 /* Optimized version of Py_MAX() to compute the maximum character:
115 use it when you are computing the second argument of PyUnicode_New() */
116 #define MAX_MAXCHAR(maxchar1, maxchar2) \
117 ((maxchar1) | (maxchar2))
118
119 #undef PyUnicode_READY 112 #undef PyUnicode_READY
120 #define PyUnicode_READY(op) \ 113 #define PyUnicode_READY(op) \
121 (assert(_PyUnicode_CHECK(op)), \ 114 (assert(_PyUnicode_CHECK(op)), \
122 (PyUnicode_IS_READY(op) ? \ 115 (PyUnicode_IS_READY(op) ? \
123 0 : \ 116 0 : \
124 _PyUnicode_Ready(op))) 117 _PyUnicode_Ready(op)))
125 118
126 #define _PyUnicode_SHARE_UTF8(op) \ 119 #define _PyUnicode_SHARE_UTF8(op) \
127 (assert(_PyUnicode_CHECK(op)), \ 120 (assert(_PyUnicode_CHECK(op)), \
128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
130 #define _PyUnicode_SHARE_WSTR(op) \ 123 #define _PyUnicode_SHARE_WSTR(op) \
131 (assert(_PyUnicode_CHECK(op)), \ 124 (assert(_PyUnicode_CHECK(op)), \
132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
133 126
134 /* true if the Unicode object has an allocated UTF-8 memory block 127 /* true if the Unicode object has an allocated UTF-8 memory block
135 (not shared with other data) */ 128 (not shared with other data) */
136 #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 129 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \
138 (!PyUnicode_IS_COMPACT_ASCII(op) \
139 && _PyUnicode_UTF8(op) \ 131 && _PyUnicode_UTF8(op) \
140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
141 133
142 /* true if the Unicode object has an allocated wstr memory block 134 /* true if the Unicode object has an allocated wstr memory block
143 (not shared with other data) */ 135 (not shared with other data) */
144 #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 136 #define _PyUnicode_HAS_WSTR_MEMORY(op) \
145 (assert(_PyUnicode_CHECK(op)), \ 137 ((_PyUnicode_WSTR(op) && \
146 (_PyUnicode_WSTR(op) && \
147 (!PyUnicode_IS_READY(op) || \ 138 (!PyUnicode_IS_READY(op) || \
148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149 140
150 /* Generic helper macro to convert characters of different types. 141 /* Generic helper macro to convert characters of different types.
151 from_type and to_type have to be valid type names, begin and end 142 from_type and to_type have to be valid type names, begin and end
152 are pointers to the source characters which should be of type 143 are pointers to the source characters which should be of type
153 "from_type *". to is a pointer of type "to_type *" and points to the 144 "from_type *". to is a pointer of type "to_type *" and points to the
154 buffer where the result characters are written to. */ 145 buffer where the result characters are written to. */
155 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 146 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
156 do { \ 147 do { \
157 to_type *_to = (to_type *) to; \ 148 to_type *_to = (to_type *)(to); \
158 const from_type *_iter = (begin); \ 149 const from_type *_iter = (from_type *)(begin); \
159 const from_type *_end = (end); \ 150 const from_type *_end = (from_type *)(end); \
160 Py_ssize_t n = (_end) - (_iter); \ 151 Py_ssize_t n = (_end) - (_iter); \
161 const from_type *_unrolled_end = \ 152 const from_type *_unrolled_end = \
162 _iter + (n & ~ (Py_ssize_t) 3); \ 153 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
163 while (_iter < (_unrolled_end)) { \ 154 while (_iter < (_unrolled_end)) { \
164 _to[0] = (to_type) _iter[0]; \ 155 _to[0] = (to_type) _iter[0]; \
165 _to[1] = (to_type) _iter[1]; \ 156 _to[1] = (to_type) _iter[1]; \
166 _to[2] = (to_type) _iter[2]; \ 157 _to[2] = (to_type) _iter[2]; \
167 _to[3] = (to_type) _iter[3]; \ 158 _to[3] = (to_type) _iter[3]; \
168 _iter += 4; _to += 4; \ 159 _iter += 4; _to += 4; \
169 } \ 160 } \
170 while (_iter < (_end)) \ 161 while (_iter < (_end)) \
171 *_to++ = (to_type) *_iter++; \ 162 *_to++ = (to_type) *_iter++; \
172 } while (0) 163 } while (0)
173 164
174 /* This dictionary holds all interned unicode strings. Note that references 165 /* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt. 166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation 167 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary. 168 function will delete the reference from this dictionary.
178 169
179 Another way to look at this is to say that the actual reference 170 Another way to look at this is to say that the actual reference
180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
181 */ 172 */
182 static PyObject *interned; 173 static PyObject *interned = NULL;
183 174
184 /* The empty Unicode object is shared to improve performance. */ 175 /* The empty Unicode object is shared to improve performance. */
185 static PyObject *unicode_empty; 176 static PyObject *unicode_empty = NULL;
177
178 #define _Py_INCREF_UNICODE_EMPTY() \
179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
185 Py_INCREF(unicode_empty); \
186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
188 } \
189 } while (0)
190
191 #define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
196
197 /* Forward declaration */
198 Py_LOCAL_INLINE(int)
199 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
186 200
187 /* List of static strings. */ 201 /* List of static strings. */
188 static _Py_Identifier *static_strings; 202 static _Py_Identifier *static_strings = NULL;
189 203
190 /* Single character Unicode strings in the Latin-1 range are being 204 /* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */ 205 shared as well. */
192 static PyObject *unicode_latin1[256]; 206 static PyObject *unicode_latin1[256] = {NULL};
193 207
194 /* Fast detection of the most frequent whitespace characters */ 208 /* Fast detection of the most frequent whitespace characters */
195 const unsigned char _Py_ascii_whitespace[] = { 209 const unsigned char _Py_ascii_whitespace[] = {
196 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0,
197 /* case 0x0009: * CHARACTER TABULATION */ 211 /* case 0x0009: * CHARACTER TABULATION */
198 /* case 0x000A: * LINE FEED */ 212 /* case 0x000A: * LINE FEED */
199 /* case 0x000B: * LINE TABULATION */ 213 /* case 0x000B: * LINE TABULATION */
200 /* case 0x000C: * FORM FEED */ 214 /* case 0x000C: * FORM FEED */
201 /* case 0x000D: * CARRIAGE RETURN */ 215 /* case 0x000D: * CARRIAGE RETURN */
202 0, 1, 1, 1, 1, 1, 0, 0, 216 0, 1, 1, 1, 1, 1, 0, 0,
(...skipping 19 matching lines...) Expand all
222 0, 0, 0, 0, 0, 0, 0, 0 236 0, 0, 0, 0, 0, 0, 0, 0
223 }; 237 };
224 238
225 /* forward */ 239 /* forward */
226 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 240 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
227 static PyObject* get_latin1_char(unsigned char ch); 241 static PyObject* get_latin1_char(unsigned char ch);
228 static int unicode_modifiable(PyObject *unicode); 242 static int unicode_modifiable(PyObject *unicode);
229 243
230 244
231 static PyObject * 245 static PyObject *
232 _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 246 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
233 static PyObject * 247 static PyObject *
234 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 248 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235 static PyObject * 249 static PyObject *
236 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 250 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237 251
238 static PyObject * 252 static PyObject *
239 unicode_encode_call_errorhandler(const char *errors, 253 unicode_encode_call_errorhandler(const char *errors,
240 PyObject **errorHandler,const char *encoding, const char *reason, 254 PyObject **errorHandler,const char *encoding, const char *reason,
241 PyObject *unicode, PyObject **exceptionObject, 255 PyObject *unicode, PyObject **exceptionObject,
242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after
405 return 1; 419 return 1;
406 } 420 }
407 #endif 421 #endif
408 422
409 static PyObject* 423 static PyObject*
410 unicode_result_wchar(PyObject *unicode) 424 unicode_result_wchar(PyObject *unicode)
411 { 425 {
412 #ifndef Py_DEBUG 426 #ifndef Py_DEBUG
413 Py_ssize_t len; 427 Py_ssize_t len;
414 428
415 assert(Py_REFCNT(unicode) == 1);
416
417 len = _PyUnicode_WSTR_LENGTH(unicode); 429 len = _PyUnicode_WSTR_LENGTH(unicode);
418 if (len == 0) { 430 if (len == 0) {
419 Py_INCREF(unicode_empty);
420 Py_DECREF(unicode); 431 Py_DECREF(unicode);
421 return unicode_empty; 432 _Py_RETURN_UNICODE_EMPTY();
422 } 433 }
423 434
424 if (len == 1) { 435 if (len == 1) {
425 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426 if (ch < 256) { 437 if ((Py_UCS4)ch < 256) {
427 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428 Py_DECREF(unicode); 439 Py_DECREF(unicode);
429 return latin1_char; 440 return latin1_char;
430 } 441 }
431 } 442 }
432 443
433 if (_PyUnicode_Ready(unicode) < 0) { 444 if (_PyUnicode_Ready(unicode) < 0) {
434 Py_XDECREF(unicode); 445 Py_DECREF(unicode);
435 return NULL; 446 return NULL;
436 } 447 }
437 #else 448 #else
449 assert(Py_REFCNT(unicode) == 1);
450
438 /* don't make the result ready in debug mode to ensure that the caller 451 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */ 452 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1)); 453 assert(_PyUnicode_CheckConsistency(unicode, 1));
441 #endif 454 #endif
442 return unicode; 455 return unicode;
443 } 456 }
444 457
445 static PyObject* 458 static PyObject*
446 unicode_result_ready(PyObject *unicode) 459 unicode_result_ready(PyObject *unicode)
447 { 460 {
448 Py_ssize_t length; 461 Py_ssize_t length;
449 462
450 length = PyUnicode_GET_LENGTH(unicode); 463 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) { 464 if (length == 0) {
452 if (unicode != unicode_empty) { 465 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode); 466 Py_DECREF(unicode);
467 _Py_RETURN_UNICODE_EMPTY();
455 } 468 }
456 return unicode_empty; 469 return unicode_empty;
457 } 470 }
458 471
459 if (length == 1) { 472 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
461 if (ch < 256) { 476 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch]; 477 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) { 478 if (latin1_char != NULL) {
464 if (unicode != latin1_char) { 479 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char); 480 Py_INCREF(latin1_char);
466 Py_DECREF(unicode); 481 Py_DECREF(unicode);
467 } 482 }
468 return latin1_char; 483 return latin1_char;
469 } 484 }
470 else { 485 else {
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
521 #elif LONG_BIT >= 64 536 #elif LONG_BIT >= 64
522 #define BLOOM_WIDTH 64 537 #define BLOOM_WIDTH 64
523 #elif LONG_BIT >= 32 538 #elif LONG_BIT >= 32
524 #define BLOOM_WIDTH 32 539 #define BLOOM_WIDTH 32
525 #else 540 #else
526 #error "LONG_BIT is smaller than 32" 541 #error "LONG_BIT is smaller than 32"
527 #endif 542 #endif
528 543
529 #define BLOOM_MASK unsigned long 544 #define BLOOM_MASK unsigned long
530 545
531 static BLOOM_MASK bloom_linebreak; 546 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
532 547
533 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
534 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 548 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
535 549
536 #define BLOOM_LINEBREAK(ch) \ 550 #define BLOOM_LINEBREAK(ch) \
537 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
538 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
539 553
540 Py_LOCAL_INLINE(BLOOM_MASK) 554 Py_LOCAL_INLINE(BLOOM_MASK)
541 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 555 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
542 { 556 {
557 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
543 /* calculate simple bloom-style bitmask for a given unicode string */ 569 /* calculate simple bloom-style bitmask for a given unicode string */
544 570
545 BLOOM_MASK mask; 571 BLOOM_MASK mask;
546 Py_ssize_t i;
547 572
548 mask = 0; 573 mask = 0;
549 for (i = 0; i < len; i++) 574 switch (kind) {
550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 575 case PyUnicode_1BYTE_KIND:
551 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
552 return mask; 587 return mask;
553 } 588
554 589 #undef BLOOM_UPDATE
555 #define BLOOM_MEMBER(mask, chr, str) \ 590 }
556 (BLOOM(mask, chr) \
557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
558 591
559 /* Compilation of templated routines */ 592 /* Compilation of templated routines */
560 593
561 #include "stringlib/asciilib.h" 594 #include "stringlib/asciilib.h"
562 #include "stringlib/fastsearch.h" 595 #include "stringlib/fastsearch.h"
563 #include "stringlib/partition.h" 596 #include "stringlib/partition.h"
564 #include "stringlib/split.h" 597 #include "stringlib/split.h"
565 #include "stringlib/count.h" 598 #include "stringlib/count.h"
566 #include "stringlib/find.h" 599 #include "stringlib/find.h"
567 #include "stringlib/find_max_char.h" 600 #include "stringlib/find_max_char.h"
568 #include "stringlib/localeutil.h" 601 #include "stringlib/localeutil.h"
569 #include "stringlib/undef.h" 602 #include "stringlib/undef.h"
570 603
571 #include "stringlib/ucs1lib.h" 604 #include "stringlib/ucs1lib.h"
572 #include "stringlib/fastsearch.h" 605 #include "stringlib/fastsearch.h"
573 #include "stringlib/partition.h" 606 #include "stringlib/partition.h"
574 #include "stringlib/split.h" 607 #include "stringlib/split.h"
575 #include "stringlib/count.h" 608 #include "stringlib/count.h"
576 #include "stringlib/find.h" 609 #include "stringlib/find.h"
610 #include "stringlib/replace.h"
577 #include "stringlib/find_max_char.h" 611 #include "stringlib/find_max_char.h"
578 #include "stringlib/localeutil.h" 612 #include "stringlib/localeutil.h"
579 #include "stringlib/undef.h" 613 #include "stringlib/undef.h"
580 614
581 #include "stringlib/ucs2lib.h" 615 #include "stringlib/ucs2lib.h"
582 #include "stringlib/fastsearch.h" 616 #include "stringlib/fastsearch.h"
583 #include "stringlib/partition.h" 617 #include "stringlib/partition.h"
584 #include "stringlib/split.h" 618 #include "stringlib/split.h"
585 #include "stringlib/count.h" 619 #include "stringlib/count.h"
586 #include "stringlib/find.h" 620 #include "stringlib/find.h"
621 #include "stringlib/replace.h"
587 #include "stringlib/find_max_char.h" 622 #include "stringlib/find_max_char.h"
588 #include "stringlib/localeutil.h" 623 #include "stringlib/localeutil.h"
589 #include "stringlib/undef.h" 624 #include "stringlib/undef.h"
590 625
591 #include "stringlib/ucs4lib.h" 626 #include "stringlib/ucs4lib.h"
592 #include "stringlib/fastsearch.h" 627 #include "stringlib/fastsearch.h"
593 #include "stringlib/partition.h" 628 #include "stringlib/partition.h"
594 #include "stringlib/split.h" 629 #include "stringlib/split.h"
595 #include "stringlib/count.h" 630 #include "stringlib/count.h"
596 #include "stringlib/find.h" 631 #include "stringlib/find.h"
632 #include "stringlib/replace.h"
597 #include "stringlib/find_max_char.h" 633 #include "stringlib/find_max_char.h"
598 #include "stringlib/localeutil.h" 634 #include "stringlib/localeutil.h"
599 #include "stringlib/undef.h" 635 #include "stringlib/undef.h"
600 636
601 #include "stringlib/unicodedefs.h" 637 #include "stringlib/unicodedefs.h"
602 #include "stringlib/fastsearch.h" 638 #include "stringlib/fastsearch.h"
603 #include "stringlib/count.h" 639 #include "stringlib/count.h"
604 #include "stringlib/find.h" 640 #include "stringlib/find.h"
605 #include "stringlib/undef.h" 641 #include "stringlib/undef.h"
606 642
(...skipping 26 matching lines...) Expand all
633 return -1; 669 return -1;
634 } 670 }
635 case PyUnicode_4BYTE_KIND: 671 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default: 673 default:
638 assert(0); 674 assert(0);
639 return -1; 675 return -1;
640 } 676 }
641 } 677 }
642 678
679 #ifdef Py_DEBUG
680 /* Fill the data of a Unicode string with invalid characters to detect bugs
681 earlier.
682
683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
685 invalid character in Unicode 6.0. */
686 static void
687 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
688 {
689 int kind = PyUnicode_KIND(unicode);
690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
691 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
692 if (length <= old_length)
693 return;
694 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
695 }
696 #endif
697
643 static PyObject* 698 static PyObject*
644 resize_compact(PyObject *unicode, Py_ssize_t length) 699 resize_compact(PyObject *unicode, Py_ssize_t length)
645 { 700 {
646 Py_ssize_t char_size; 701 Py_ssize_t char_size;
647 Py_ssize_t struct_size; 702 Py_ssize_t struct_size;
648 Py_ssize_t new_size; 703 Py_ssize_t new_size;
649 int share_wstr; 704 int share_wstr;
650 PyObject *new_unicode; 705 PyObject *new_unicode;
706 #ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708 #endif
709
651 assert(unicode_modifiable(unicode)); 710 assert(unicode_modifiable(unicode));
652 assert(PyUnicode_IS_READY(unicode)); 711 assert(PyUnicode_IS_READY(unicode));
653 assert(PyUnicode_IS_COMPACT(unicode)); 712 assert(PyUnicode_IS_COMPACT(unicode));
654 713
655 char_size = PyUnicode_KIND(unicode); 714 char_size = PyUnicode_KIND(unicode);
656 if (PyUnicode_IS_ASCII(unicode)) 715 if (PyUnicode_IS_ASCII(unicode))
657 struct_size = sizeof(PyASCIIObject); 716 struct_size = sizeof(PyASCIIObject);
658 else 717 else
659 struct_size = sizeof(PyCompactUnicodeObject); 718 struct_size = sizeof(PyCompactUnicodeObject);
660 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
(...skipping 15 matching lines...) Expand all
676 } 735 }
677 unicode = new_unicode; 736 unicode = new_unicode;
678 _Py_NewReference(unicode); 737 _Py_NewReference(unicode);
679 738
680 _PyUnicode_LENGTH(unicode) = length; 739 _PyUnicode_LENGTH(unicode) = length;
681 if (share_wstr) { 740 if (share_wstr) {
682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
683 if (!PyUnicode_IS_ASCII(unicode)) 742 if (!PyUnicode_IS_ASCII(unicode))
684 _PyUnicode_WSTR_LENGTH(unicode) = length; 743 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 } 744 }
745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
749 #ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751 #endif
686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0); 753 length, 0);
688 assert(_PyUnicode_CheckConsistency(unicode, 0)); 754 assert(_PyUnicode_CheckConsistency(unicode, 0));
689 return unicode; 755 return unicode;
690 } 756 }
691 757
692 static int 758 static int
693 resize_inplace(PyObject *unicode, Py_ssize_t length) 759 resize_inplace(PyObject *unicode, Py_ssize_t length)
694 { 760 {
695 wchar_t *wstr; 761 wchar_t *wstr;
696 Py_ssize_t new_size; 762 Py_ssize_t new_size;
697 assert(!PyUnicode_IS_COMPACT(unicode)); 763 assert(!PyUnicode_IS_COMPACT(unicode));
698 assert(Py_REFCNT(unicode) == 1); 764 assert(Py_REFCNT(unicode) == 1);
699 765
700 if (PyUnicode_IS_READY(unicode)) { 766 if (PyUnicode_IS_READY(unicode)) {
701 Py_ssize_t char_size; 767 Py_ssize_t char_size;
702 int share_wstr, share_utf8; 768 int share_wstr, share_utf8;
703 void *data; 769 void *data;
770 #ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772 #endif
704 773
705 data = _PyUnicode_DATA_ANY(unicode); 774 data = _PyUnicode_DATA_ANY(unicode);
706 char_size = PyUnicode_KIND(unicode); 775 char_size = PyUnicode_KIND(unicode);
707 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
709 778
710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711 PyErr_NoMemory(); 780 PyErr_NoMemory();
712 return -1; 781 return -1;
713 } 782 }
(...skipping 15 matching lines...) Expand all
729 if (share_wstr) { 798 if (share_wstr) {
730 _PyUnicode_WSTR(unicode) = data; 799 _PyUnicode_WSTR(unicode) = data;
731 _PyUnicode_WSTR_LENGTH(unicode) = length; 800 _PyUnicode_WSTR_LENGTH(unicode) = length;
732 } 801 }
733 if (share_utf8) { 802 if (share_utf8) {
734 _PyUnicode_UTF8(unicode) = data; 803 _PyUnicode_UTF8(unicode) = data;
735 _PyUnicode_UTF8_LENGTH(unicode) = length; 804 _PyUnicode_UTF8_LENGTH(unicode) = length;
736 } 805 }
737 _PyUnicode_LENGTH(unicode) = length; 806 _PyUnicode_LENGTH(unicode) = length;
738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
808 #ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810 #endif
739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
740 assert(_PyUnicode_CheckConsistency(unicode, 0)); 812 assert(_PyUnicode_CheckConsistency(unicode, 0));
741 return 0; 813 return 0;
742 } 814 }
743 } 815 }
744 assert(_PyUnicode_WSTR(unicode) != NULL); 816 assert(_PyUnicode_WSTR(unicode) != NULL);
745 817
746 /* check for integer overflow */ 818 /* check for integer overflow */
747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748 PyErr_NoMemory(); 820 PyErr_NoMemory();
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
781 return copy; 853 return copy;
782 } 854 }
783 else { 855 else {
784 PyObject *w; 856 PyObject *w;
785 857
786 w = (PyObject*)_PyUnicode_New(length); 858 w = (PyObject*)_PyUnicode_New(length);
787 if (w == NULL) 859 if (w == NULL)
788 return NULL; 860 return NULL;
789 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790 copy_length = Py_MIN(copy_length, length); 862 copy_length = Py_MIN(copy_length, length);
791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792 copy_length); 864 copy_length * sizeof(wchar_t));
793 return w; 865 return w;
794 } 866 }
795 } 867 }
796 868
797 /* We allocate one more character to make sure the string is 869 /* We allocate one more character to make sure the string is
798 U+0000 terminated; some code (e.g. new_identifier) 870 U+0000 terminated; some code (e.g. new_identifier)
799 relies on that. 871 relies on that.
800 872
801 XXX This allocator could further be enhanced by assuring that the 873 XXX This allocator could further be enhanced by assuring that the
802 free list never reduces its size below 1. 874 free list never reduces its size below 1.
803 875
804 */ 876 */
805 877
806 #ifdef Py_DEBUG
807 static int unicode_old_new_calls = 0;
808 #endif
809
810 static PyUnicodeObject * 878 static PyUnicodeObject *
811 _PyUnicode_New(Py_ssize_t length) 879 _PyUnicode_New(Py_ssize_t length)
812 { 880 {
813 register PyUnicodeObject *unicode; 881 PyUnicodeObject *unicode;
814 size_t new_size; 882 size_t new_size;
815 883
816 /* Optimization for empty strings */ 884 /* Optimization for empty strings */
817 if (length == 0 && unicode_empty != NULL) { 885 if (length == 0 && unicode_empty != NULL) {
818 Py_INCREF(unicode_empty); 886 Py_INCREF(unicode_empty);
819 return (PyUnicodeObject*)unicode_empty; 887 return (PyUnicodeObject*)unicode_empty;
820 } 888 }
821 889
822 /* Ensure we won't overflow the size. */ 890 /* Ensure we won't overflow the size. */
823 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
824 return (PyUnicodeObject *)PyErr_NoMemory(); 892 return (PyUnicodeObject *)PyErr_NoMemory();
825 } 893 }
826 if (length < 0) { 894 if (length < 0) {
827 PyErr_SetString(PyExc_SystemError, 895 PyErr_SetString(PyExc_SystemError,
828 "Negative size passed to _PyUnicode_New"); 896 "Negative size passed to _PyUnicode_New");
829 return NULL; 897 return NULL;
830 } 898 }
831 899
832 #ifdef Py_DEBUG
833 ++unicode_old_new_calls;
834 #endif
835
836 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
837 if (unicode == NULL) 901 if (unicode == NULL)
838 return NULL; 902 return NULL;
839 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
840 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 904
841 if (!_PyUnicode_WSTR(unicode)) {
842 Py_DECREF(unicode);
843 PyErr_NoMemory();
844 return NULL;
845 }
846
847 /* Initialize the first element to guard against cases where
848 * the caller fails before initializing str -- unicode_resize()
849 * reads str[0], and the Keep-Alive optimization can keep memory
850 * allocated for str alive across a call to unicode_dealloc(unicode).
851 * We don't want unicode_resize to read uninitialized memory in
852 * that case.
853 */
854 _PyUnicode_WSTR(unicode)[0] = 0;
855 _PyUnicode_WSTR(unicode)[length] = 0;
856 _PyUnicode_WSTR_LENGTH(unicode) = length; 905 _PyUnicode_WSTR_LENGTH(unicode) = length;
857 _PyUnicode_HASH(unicode) = -1; 906 _PyUnicode_HASH(unicode) = -1;
858 _PyUnicode_STATE(unicode).interned = 0; 907 _PyUnicode_STATE(unicode).interned = 0;
859 _PyUnicode_STATE(unicode).kind = 0; 908 _PyUnicode_STATE(unicode).kind = 0;
860 _PyUnicode_STATE(unicode).compact = 0; 909 _PyUnicode_STATE(unicode).compact = 0;
861 _PyUnicode_STATE(unicode).ready = 0; 910 _PyUnicode_STATE(unicode).ready = 0;
862 _PyUnicode_STATE(unicode).ascii = 0; 911 _PyUnicode_STATE(unicode).ascii = 0;
863 _PyUnicode_DATA_ANY(unicode) = NULL; 912 _PyUnicode_DATA_ANY(unicode) = NULL;
864 _PyUnicode_LENGTH(unicode) = 0; 913 _PyUnicode_LENGTH(unicode) = 0;
865 _PyUnicode_UTF8(unicode) = NULL; 914 _PyUnicode_UTF8(unicode) = NULL;
866 _PyUnicode_UTF8_LENGTH(unicode) = 0; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
919 Py_DECREF(unicode);
920 PyErr_NoMemory();
921 return NULL;
922 }
923
924 /* Initialize the first element to guard against cases where
925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
933
867 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
868 return unicode; 935 return unicode;
869 } 936 }
870 937
871 static const char* 938 static const char*
872 unicode_kind_name(PyObject *unicode) 939 unicode_kind_name(PyObject *unicode)
873 { 940 {
874 /* don't check consistency: unicode_kind_name() is called from 941 /* don't check consistency: unicode_kind_name() is called from
875 _PyUnicode_Dump() */ 942 _PyUnicode_Dump() */
876 if (!PyUnicode_IS_COMPACT(unicode)) 943 if (!PyUnicode_IS_COMPACT(unicode))
(...skipping 25 matching lines...) Expand all
902 case PyUnicode_2BYTE_KIND: 969 case PyUnicode_2BYTE_KIND:
903 return "UCS2"; 970 return "UCS2";
904 case PyUnicode_4BYTE_KIND: 971 case PyUnicode_4BYTE_KIND:
905 return "UCS4"; 972 return "UCS4";
906 default: 973 default:
907 return "<invalid compact kind>"; 974 return "<invalid compact kind>";
908 } 975 }
909 } 976 }
910 977
911 #ifdef Py_DEBUG 978 #ifdef Py_DEBUG
912 static int unicode_new_new_calls = 0;
913
914 /* Functions wrapping macros for use in debugger */ 979 /* Functions wrapping macros for use in debugger */
915 char *_PyUnicode_utf8(void *unicode){ 980 char *_PyUnicode_utf8(void *unicode){
916 return PyUnicode_UTF8(unicode); 981 return PyUnicode_UTF8(unicode);
917 } 982 }
918 983
919 void *_PyUnicode_compact_data(void *unicode) { 984 void *_PyUnicode_compact_data(void *unicode) {
920 return _PyUnicode_COMPACT_DATA(unicode); 985 return _PyUnicode_COMPACT_DATA(unicode);
921 } 986 }
922 void *_PyUnicode_data(void *unicode){ 987 void *_PyUnicode_data(void *unicode){
923 printf("obj %p\n", unicode); 988 printf("obj %p\n", unicode);
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
971 enum PyUnicode_Kind kind; 1036 enum PyUnicode_Kind kind;
972 int is_sharing, is_ascii; 1037 int is_sharing, is_ascii;
973 Py_ssize_t char_size; 1038 Py_ssize_t char_size;
974 Py_ssize_t struct_size; 1039 Py_ssize_t struct_size;
975 1040
976 /* Optimization for empty strings */ 1041 /* Optimization for empty strings */
977 if (size == 0 && unicode_empty != NULL) { 1042 if (size == 0 && unicode_empty != NULL) {
978 Py_INCREF(unicode_empty); 1043 Py_INCREF(unicode_empty);
979 return unicode_empty; 1044 return unicode_empty;
980 } 1045 }
981
982 #ifdef Py_DEBUG
983 ++unicode_new_new_calls;
984 #endif
985 1046
986 is_ascii = 0; 1047 is_ascii = 0;
987 is_sharing = 0; 1048 is_sharing = 0;
988 struct_size = sizeof(PyCompactUnicodeObject); 1049 struct_size = sizeof(PyCompactUnicodeObject);
989 if (maxchar < 128) { 1050 if (maxchar < 128) {
990 kind = PyUnicode_1BYTE_KIND; 1051 kind = PyUnicode_1BYTE_KIND;
991 char_size = 1; 1052 char_size = 1;
992 is_ascii = 1; 1053 is_ascii = 1;
993 struct_size = sizeof(PyASCIIObject); 1054 struct_size = sizeof(PyASCIIObject);
994 } 1055 }
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
1067 if (is_sharing) { 1128 if (is_sharing) {
1068 _PyUnicode_WSTR_LENGTH(unicode) = size; 1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1069 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1070 } 1131 }
1071 else { 1132 else {
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1073 _PyUnicode_WSTR(unicode) = NULL; 1134 _PyUnicode_WSTR(unicode) = NULL;
1074 } 1135 }
1075 } 1136 }
1076 #ifdef Py_DEBUG 1137 #ifdef Py_DEBUG
1077 /* Fill the data with invalid characters to detect bugs earlier. 1138 unicode_fill_invalid((PyObject*)unicode, 0);
1078 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1079 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1080 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1081 memset(data, 0xff, size * kind);
1082 #endif 1139 #endif
1083 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1084 return obj; 1141 return obj;
1085 } 1142 }
1086 1143
1087 #if SIZEOF_WCHAR_T == 2 1144 #if SIZEOF_WCHAR_T == 2
1088 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1145 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1089 will decode surrogate pairs, the other conversions are implemented as macros 1146 will decode surrogate pairs, the other conversions are implemented as macros
1090 for efficiency. 1147 for efficiency.
1091 1148
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
1135 return 0; 1192 return 0;
1136 } 1193 }
1137 1194
1138 static int 1195 static int
1139 _copy_characters(PyObject *to, Py_ssize_t to_start, 1196 _copy_characters(PyObject *to, Py_ssize_t to_start,
1140 PyObject *from, Py_ssize_t from_start, 1197 PyObject *from, Py_ssize_t from_start,
1141 Py_ssize_t how_many, int check_maxchar) 1198 Py_ssize_t how_many, int check_maxchar)
1142 { 1199 {
1143 unsigned int from_kind, to_kind; 1200 unsigned int from_kind, to_kind;
1144 void *from_data, *to_data; 1201 void *from_data, *to_data;
1145 int fast;
1146 1202
1147 assert(0 <= how_many); 1203 assert(0 <= how_many);
1148 assert(0 <= from_start); 1204 assert(0 <= from_start);
1149 assert(0 <= to_start); 1205 assert(0 <= to_start);
1150 assert(PyUnicode_Check(from)); 1206 assert(PyUnicode_Check(from));
1151 assert(PyUnicode_IS_READY(from)); 1207 assert(PyUnicode_IS_READY(from));
1152 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1153 1209
1154 if (how_many == 0)
1155 return 0;
1156
1157 assert(PyUnicode_Check(to)); 1210 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(to)); 1211 assert(PyUnicode_IS_READY(to));
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
1214 if (how_many == 0)
1215 return 0;
1160 1216
1161 from_kind = PyUnicode_KIND(from); 1217 from_kind = PyUnicode_KIND(from);
1162 from_data = PyUnicode_DATA(from); 1218 from_data = PyUnicode_DATA(from);
1163 to_kind = PyUnicode_KIND(to); 1219 to_kind = PyUnicode_KIND(to);
1164 to_data = PyUnicode_DATA(to); 1220 to_data = PyUnicode_DATA(to);
1165 1221
1166 #ifdef Py_DEBUG 1222 #ifdef Py_DEBUG
1167 if (!check_maxchar 1223 if (!check_maxchar
1168 && (from_kind > to_kind 1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1169 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1170 { 1225 {
1171 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1172 Py_UCS4 ch; 1227 Py_UCS4 ch;
1173 Py_ssize_t i; 1228 Py_ssize_t i;
1174 for (i=0; i < how_many; i++) { 1229 for (i=0; i < how_many; i++) {
1175 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1176 assert(ch <= to_maxchar); 1231 assert(ch <= to_maxchar);
1177 } 1232 }
1178 } 1233 }
1179 #endif 1234 #endif
1180 fast = (from_kind == to_kind); 1235
1181 if (check_maxchar 1236 if (from_kind == to_kind) {
1182 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1237 if (check_maxchar
1183 { 1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1184 /* deny latin1 => ascii */ 1239 {
1185 fast = 0; 1240 /* Writing Latin-1 characters into an ASCII string requires to
1186 } 1241 check that all written characters are pure ASCII */
1187 1242 Py_UCS4 max_char;
1188 if (fast) { 1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
1247 }
1189 Py_MEMCPY((char*)to_data + to_kind * to_start, 1248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1190 (char*)from_data + from_kind * from_start, 1249 (char*)from_data + from_kind * from_start,
1191 to_kind * how_many); 1250 to_kind * how_many);
1192 } 1251 }
1193 else if (from_kind == PyUnicode_1BYTE_KIND 1252 else if (from_kind == PyUnicode_1BYTE_KIND
1194 && to_kind == PyUnicode_2BYTE_KIND) 1253 && to_kind == PyUnicode_2BYTE_KIND)
1195 { 1254 {
1196 _PyUnicode_CONVERT_BYTES( 1255 _PyUnicode_CONVERT_BYTES(
1197 Py_UCS1, Py_UCS2, 1256 Py_UCS1, Py_UCS2,
1198 PyUnicode_1BYTE_DATA(from) + from_start, 1257 PyUnicode_1BYTE_DATA(from) + from_start,
(...skipping 15 matching lines...) Expand all
1214 && to_kind == PyUnicode_4BYTE_KIND) 1273 && to_kind == PyUnicode_4BYTE_KIND)
1215 { 1274 {
1216 _PyUnicode_CONVERT_BYTES( 1275 _PyUnicode_CONVERT_BYTES(
1217 Py_UCS2, Py_UCS4, 1276 Py_UCS2, Py_UCS4,
1218 PyUnicode_2BYTE_DATA(from) + from_start, 1277 PyUnicode_2BYTE_DATA(from) + from_start,
1219 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1220 PyUnicode_4BYTE_DATA(to) + to_start 1279 PyUnicode_4BYTE_DATA(to) + to_start
1221 ); 1280 );
1222 } 1281 }
1223 else { 1282 else {
1224 /* check if max_char(from substring) <= max_char(to) */ 1283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1225 if (from_kind > to_kind 1284
1226 /* latin1 => ascii */ 1285 if (!check_maxchar) {
1227 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1286 if (from_kind == PyUnicode_2BYTE_KIND
1228 { 1287 && to_kind == PyUnicode_1BYTE_KIND)
1229 /* slow path to check for character overflow */ 1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
1321 else {
1230 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1231 Py_UCS4 ch; 1323 Py_UCS4 ch;
1232 Py_ssize_t i; 1324 Py_ssize_t i;
1233 1325
1234 #ifdef Py_DEBUG
1235 for (i=0; i < how_many; i++) { 1326 for (i=0; i < how_many; i++) {
1236 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1237 assert(ch <= to_maxchar); 1328 if (ch > to_maxchar)
1329 return -1;
1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 } 1331 }
1240 #else
1241 if (!check_maxchar) {
1242 for (i=0; i < how_many; i++) {
1243 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1244 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1245 }
1246 }
1247 else {
1248 for (i=0; i < how_many; i++) {
1249 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1250 if (ch > to_maxchar)
1251 return 1;
1252 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1253 }
1254 }
1255 #endif
1256 }
1257 else {
1258 assert(0 && "inconsistent state");
1259 return 1;
1260 } 1332 }
1261 } 1333 }
1262 return 0; 1334 return 0;
1263 } 1335 }
1264 1336
1265 void 1337 void
1266 _PyUnicode_FastCopyCharacters( 1338 _PyUnicode_FastCopyCharacters(
1267 PyObject *to, Py_ssize_t to_start, 1339 PyObject *to, Py_ssize_t to_start,
1268 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1269 { 1341 {
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
1359 if (*maxchar > MAX_UNICODE) { 1431 if (*maxchar > MAX_UNICODE) {
1360 PyErr_Format(PyExc_ValueError, 1432 PyErr_Format(PyExc_ValueError,
1361 "character U+%x is not in range [U+0000; U+10ffff]", 1433 "character U+%x is not in range [U+0000; U+10ffff]",
1362 ch); 1434 ch);
1363 return -1; 1435 return -1;
1364 } 1436 }
1365 } 1437 }
1366 } 1438 }
1367 return 0; 1439 return 0;
1368 } 1440 }
1369
1370 #ifdef Py_DEBUG
1371 static int unicode_ready_calls = 0;
1372 #endif
1373 1441
1374 int 1442 int
1375 _PyUnicode_Ready(PyObject *unicode) 1443 _PyUnicode_Ready(PyObject *unicode)
1376 { 1444 {
1377 wchar_t *end; 1445 wchar_t *end;
1378 Py_UCS4 maxchar = 0; 1446 Py_UCS4 maxchar = 0;
1379 Py_ssize_t num_surrogates; 1447 Py_ssize_t num_surrogates;
1380 #if SIZEOF_WCHAR_T == 2 1448 #if SIZEOF_WCHAR_T == 2
1381 Py_ssize_t length_wo_surrogates; 1449 Py_ssize_t length_wo_surrogates;
1382 #endif 1450 #endif
1383 1451
1384 /* _PyUnicode_Ready() is only intended for old-style API usage where 1452 /* _PyUnicode_Ready() is only intended for old-style API usage where
1385 strings were created using _PyObject_New() and where no canonical 1453 strings were created using _PyObject_New() and where no canonical
1386 representation (the str field) has been set yet aka strings 1454 representation (the str field) has been set yet aka strings
1387 which are not yet ready. */ 1455 which are not yet ready. */
1388 assert(_PyUnicode_CHECK(unicode)); 1456 assert(_PyUnicode_CHECK(unicode));
1389 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1390 assert(_PyUnicode_WSTR(unicode) != NULL); 1458 assert(_PyUnicode_WSTR(unicode) != NULL);
1391 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1392 assert(_PyUnicode_UTF8(unicode) == NULL); 1460 assert(_PyUnicode_UTF8(unicode) == NULL);
1393 /* Actually, it should neither be interned nor be anything else: */ 1461 /* Actually, it should neither be interned nor be anything else: */
1394 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1395
1396 #ifdef Py_DEBUG
1397 ++unicode_ready_calls;
1398 #endif
1399 1463
1400 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1401 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1402 &maxchar, &num_surrogates) == -1) 1466 &maxchar, &num_surrogates) == -1)
1403 return -1; 1467 return -1;
1404 1468
1405 if (maxchar < 256) { 1469 if (maxchar < 256) {
1406 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1407 if (!_PyUnicode_DATA_ANY(unicode)) { 1471 if (!_PyUnicode_DATA_ANY(unicode)) {
1408 PyErr_NoMemory(); 1472 PyErr_NoMemory();
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
1494 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1495 #endif 1559 #endif
1496 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1497 } 1561 }
1498 _PyUnicode_STATE(unicode).ready = 1; 1562 _PyUnicode_STATE(unicode).ready = 1;
1499 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1563 assert(_PyUnicode_CheckConsistency(unicode, 1));
1500 return 0; 1564 return 0;
1501 } 1565 }
1502 1566
1503 static void 1567 static void
1504 unicode_dealloc(register PyObject *unicode) 1568 unicode_dealloc(PyObject *unicode)
1505 { 1569 {
1506 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1507 case SSTATE_NOT_INTERNED: 1571 case SSTATE_NOT_INTERNED:
1508 break; 1572 break;
1509 1573
1510 case SSTATE_INTERNED_MORTAL: 1574 case SSTATE_INTERNED_MORTAL:
1511 /* revive dead object temporarily for DelItem */ 1575 /* revive dead object temporarily for DelItem */
1512 Py_REFCNT(unicode) = 3; 1576 Py_REFCNT(unicode) = 3;
1513 if (PyDict_DelItem(interned, unicode) != 0) 1577 if (PyDict_DelItem(interned, unicode) != 0)
1514 Py_FatalError( 1578 Py_FatalError(
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1582 assert(0 <= length); 1646 assert(0 <= length);
1583 1647
1584 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1585 old_length = PyUnicode_WSTR_LENGTH(unicode); 1649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1586 else 1650 else
1587 old_length = PyUnicode_GET_LENGTH(unicode); 1651 old_length = PyUnicode_GET_LENGTH(unicode);
1588 if (old_length == length) 1652 if (old_length == length)
1589 return 0; 1653 return 0;
1590 1654
1591 if (length == 0) { 1655 if (length == 0) {
1656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
1658 return -1;
1592 Py_DECREF(*p_unicode); 1659 Py_DECREF(*p_unicode);
1593 *p_unicode = unicode_empty; 1660 *p_unicode = unicode_empty;
1594 Py_INCREF(*p_unicode);
1595 return 0; 1661 return 0;
1596 } 1662 }
1597 1663
1598 if (!unicode_modifiable(unicode)) { 1664 if (!unicode_modifiable(unicode)) {
1599 PyObject *copy = resize_copy(unicode, length); 1665 PyObject *copy = resize_copy(unicode, length);
1600 if (copy == NULL) 1666 if (copy == NULL)
1601 return -1; 1667 return -1;
1602 Py_DECREF(*p_unicode); 1668 Py_DECREF(*p_unicode);
1603 *p_unicode = copy; 1669 *p_unicode = copy;
1604 return 0; 1670 return 0;
(...skipping 19 matching lines...) Expand all
1624 } 1690 }
1625 unicode = *p_unicode; 1691 unicode = *p_unicode;
1626 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1627 { 1693 {
1628 PyErr_BadInternalCall(); 1694 PyErr_BadInternalCall();
1629 return -1; 1695 return -1;
1630 } 1696 }
1631 return unicode_resize(p_unicode, length); 1697 return unicode_resize(p_unicode, length);
1632 } 1698 }
1633 1699
1634 static int
1635 unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1636 unsigned int maxchar)
1637 {
1638 PyObject *result;
1639 assert(PyUnicode_IS_READY(*p_unicode));
1640 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1641 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1642 return 0;
1643 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1644 maxchar);
1645 if (result == NULL)
1646 return -1;
1647 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1648 Py_DECREF(*p_unicode);
1649 *p_unicode = result;
1650 return 0;
1651 }
1652
1653 static int
1654 unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1655 Py_UCS4 ch)
1656 {
1657 assert(ch <= MAX_UNICODE);
1658 if (unicode_widen(p_unicode, *pos, ch) < 0)
1659 return -1;
1660 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1661 PyUnicode_DATA(*p_unicode),
1662 (*pos)++, ch);
1663 return 0;
1664 }
1665
1666 /* Copy a ASCII or latin1 char* string into a Python Unicode string. 1700 /* Copy a ASCII or latin1 char* string into a Python Unicode string.
1667 Return the length of the input string.
1668 1701
1669 WARNING: The function doesn't copy the terminating null character and 1702 WARNING: The function doesn't copy the terminating null character and
1670 doesn't check the maximum character (may write a latin1 character in an 1703 doesn't check the maximum character (may write a latin1 character in an
1671 ASCII string). */ 1704 ASCII string). */
1672 static Py_ssize_t 1705 static void
1673 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str) 1706 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
1674 { 1708 {
1675 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1676 void *data = PyUnicode_DATA(unicode); 1710 void *data = PyUnicode_DATA(unicode);
1711 const char *end = str + len;
1677 1712
1678 switch (kind) { 1713 switch (kind) {
1679 case PyUnicode_1BYTE_KIND: { 1714 case PyUnicode_1BYTE_KIND: {
1680 Py_ssize_t len = strlen(str);
1681 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1716 #ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723 #endif
1682 memcpy((char *) data + index, str, len); 1724 memcpy((char *) data + index, str, len);
1683 return len; 1725 break;
1684 } 1726 }
1685 case PyUnicode_2BYTE_KIND: { 1727 case PyUnicode_2BYTE_KIND: {
1686 Py_UCS2 *start = (Py_UCS2 *)data + index; 1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1687 Py_UCS2 *ucs2 = start; 1729 Py_UCS2 *ucs2 = start;
1688 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1689 1731
1690 for (; *str; ++ucs2, ++str) 1732 for (; str < end; ++ucs2, ++str)
1691 *ucs2 = (Py_UCS2)*str; 1733 *ucs2 = (Py_UCS2)*str;
1692 1734
1693 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1694 return ucs2 - start; 1736 break;
1695 } 1737 }
1696 default: { 1738 default: {
1697 Py_UCS4 *start = (Py_UCS4 *)data + index; 1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1698 Py_UCS4 *ucs4 = start; 1740 Py_UCS4 *ucs4 = start;
1699 assert(kind == PyUnicode_4BYTE_KIND); 1741 assert(kind == PyUnicode_4BYTE_KIND);
1700 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1701 1743
1702 for (; *str; ++ucs4, ++str) 1744 for (; str < end; ++ucs4, ++str)
1703 *ucs4 = (Py_UCS4)*str; 1745 *ucs4 = (Py_UCS4)*str;
1704 1746
1705 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1706 return ucs4 - start;
1707 } 1748 }
1708 } 1749 }
1709 } 1750 }
1710 1751
1711 1752
1712 static PyObject* 1753 static PyObject*
1713 get_latin1_char(unsigned char ch) 1754 get_latin1_char(unsigned char ch)
1714 { 1755 {
1715 PyObject *unicode = unicode_latin1[ch]; 1756 PyObject *unicode = unicode_latin1[ch];
1716 if (!unicode) { 1757 if (!unicode) {
(...skipping 15 matching lines...) Expand all
1732 Py_UCS4 maxchar = 0; 1773 Py_UCS4 maxchar = 0;
1733 Py_ssize_t num_surrogates; 1774 Py_ssize_t num_surrogates;
1734 1775
1735 if (u == NULL) 1776 if (u == NULL)
1736 return (PyObject*)_PyUnicode_New(size); 1777 return (PyObject*)_PyUnicode_New(size);
1737 1778
1738 /* If the Unicode data is known at construction time, we can apply 1779 /* If the Unicode data is known at construction time, we can apply
1739 some optimizations which share commonly used objects. */ 1780 some optimizations which share commonly used objects. */
1740 1781
1741 /* Optimization for empty strings */ 1782 /* Optimization for empty strings */
1742 if (size == 0 && unicode_empty != NULL) { 1783 if (size == 0)
1743 Py_INCREF(unicode_empty); 1784 _Py_RETURN_UNICODE_EMPTY();
1744 return unicode_empty;
1745 }
1746 1785
1747 /* Single character Unicode objects in the Latin-1 range are 1786 /* Single character Unicode objects in the Latin-1 range are
1748 shared when using this constructor */ 1787 shared when using this constructor */
1749 if (size == 1 && *u < 256) 1788 if (size == 1 && (Py_UCS4)*u < 256)
1750 return get_latin1_char((unsigned char)*u); 1789 return get_latin1_char((unsigned char)*u);
1751 1790
1752 /* If not empty and not single character, copy the Unicode data 1791 /* If not empty and not single character, copy the Unicode data
1753 into the new object */ 1792 into the new object */
1754 if (find_maxchar_surrogates(u, u + size, 1793 if (find_maxchar_surrogates(u, u + size,
1755 &maxchar, &num_surrogates) == -1) 1794 &maxchar, &num_surrogates) == -1)
1756 return NULL; 1795 return NULL;
1757 1796
1758 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1797 unicode = PyUnicode_New(size - num_surrogates, maxchar);
1759 if (!unicode) 1798 if (!unicode)
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1827 assert(!id->next); 1866 assert(!id->next);
1828 id->next = static_strings; 1867 id->next = static_strings;
1829 static_strings = id; 1868 static_strings = id;
1830 } 1869 }
1831 return id->object; 1870 return id->object;
1832 } 1871 }
1833 1872
1834 void 1873 void
1835 _PyUnicode_ClearStaticStrings() 1874 _PyUnicode_ClearStaticStrings()
1836 { 1875 {
1837 _Py_Identifier *i; 1876 _Py_Identifier *tmp, *s = static_strings;
1838 for (i = static_strings; i; i = i->next) { 1877 while (s) {
1839 Py_DECREF(i->object); 1878 Py_DECREF(s->object);
1840 i->object = NULL; 1879 s->object = NULL;
1841 i->next = NULL; 1880 tmp = s->next;
1842 } 1881 s->next = NULL;
1882 s = tmp;
1883 }
1884 static_strings = NULL;
1843 } 1885 }
1844 1886
1845 /* Internal function, doesn't check maximum character */ 1887 /* Internal function, doesn't check maximum character */
1846 1888
1847 PyObject* 1889 PyObject*
1848 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1890 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1849 { 1891 {
1850 const unsigned char *s = (const unsigned char *)buffer; 1892 const unsigned char *s = (const unsigned char *)buffer;
1851 PyObject *unicode; 1893 PyObject *unicode;
1852 if (size == 1) { 1894 if (size == 1) {
1853 #ifdef Py_DEBUG 1895 #ifdef Py_DEBUG
1854 assert(s[0] < 128); 1896 assert((unsigned char)s[0] < 128);
1855 #endif 1897 #endif
1856 return get_latin1_char(s[0]); 1898 return get_latin1_char(s[0]);
1857 } 1899 }
1858 unicode = PyUnicode_New(size, 127); 1900 unicode = PyUnicode_New(size, 127);
1859 if (!unicode) 1901 if (!unicode)
1860 return NULL; 1902 return NULL;
1861 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1903 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1862 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1904 assert(_PyUnicode_CheckConsistency(unicode, 1));
1863 return unicode; 1905 return unicode;
1864 } 1906 }
(...skipping 21 matching lines...) Expand all
1886 return 127; 1928 return 127;
1887 else if (maxchar <= 255) 1929 else if (maxchar <= 255)
1888 return 255; 1930 return 255;
1889 else if (maxchar <= 65535) 1931 else if (maxchar <= 65535)
1890 return 65535; 1932 return 65535;
1891 else 1933 else
1892 return MAX_UNICODE; 1934 return MAX_UNICODE;
1893 } 1935 }
1894 1936
1895 static PyObject* 1937 static PyObject*
1896 _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1938 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1897 { 1939 {
1898 PyObject *res; 1940 PyObject *res;
1899 unsigned char max_char; 1941 unsigned char max_char;
1900 1942
1901 if (size == 0) { 1943 if (size == 0)
1902 Py_INCREF(unicode_empty); 1944 _Py_RETURN_UNICODE_EMPTY();
1903 return unicode_empty;
1904 }
1905 assert(size > 0); 1945 assert(size > 0);
1906 if (size == 1) 1946 if (size == 1)
1907 return get_latin1_char(u[0]); 1947 return get_latin1_char(u[0]);
1908 1948
1909 max_char = ucs1lib_find_max_char(u, u + size); 1949 max_char = ucs1lib_find_max_char(u, u + size);
1910 res = PyUnicode_New(size, max_char); 1950 res = PyUnicode_New(size, max_char);
1911 if (!res) 1951 if (!res)
1912 return NULL; 1952 return NULL;
1913 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1953 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1914 assert(_PyUnicode_CheckConsistency(res, 1)); 1954 assert(_PyUnicode_CheckConsistency(res, 1));
1915 return res; 1955 return res;
1916 } 1956 }
1917 1957
1918 static PyObject* 1958 static PyObject*
1919 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1959 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1920 { 1960 {
1921 PyObject *res; 1961 PyObject *res;
1922 Py_UCS2 max_char; 1962 Py_UCS2 max_char;
1923 1963
1924 if (size == 0) { 1964 if (size == 0)
1925 Py_INCREF(unicode_empty); 1965 _Py_RETURN_UNICODE_EMPTY();
1926 return unicode_empty;
1927 }
1928 assert(size > 0); 1966 assert(size > 0);
1929 if (size == 1) { 1967 if (size == 1) {
1930 Py_UCS4 ch = u[0]; 1968 Py_UCS4 ch = u[0];
1969 int kind;
1970 void *data;
1931 if (ch < 256) 1971 if (ch < 256)
1932 return get_latin1_char((unsigned char)ch); 1972 return get_latin1_char((unsigned char)ch);
1933 1973
1934 res = PyUnicode_New(1, ch); 1974 res = PyUnicode_New(1, ch);
1935 if (res == NULL) 1975 if (res == NULL)
1936 return NULL; 1976 return NULL;
1937 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1977 kind = PyUnicode_KIND(res);
1978 data = PyUnicode_DATA(res);
1979 PyUnicode_WRITE(kind, data, 0, ch);
1938 assert(_PyUnicode_CheckConsistency(res, 1)); 1980 assert(_PyUnicode_CheckConsistency(res, 1));
1939 return res; 1981 return res;
1940 } 1982 }
1941 1983
1942 max_char = ucs2lib_find_max_char(u, u + size); 1984 max_char = ucs2lib_find_max_char(u, u + size);
1943 res = PyUnicode_New(size, max_char); 1985 res = PyUnicode_New(size, max_char);
1944 if (!res) 1986 if (!res)
1945 return NULL; 1987 return NULL;
1946 if (max_char >= 256) 1988 if (max_char >= 256)
1947 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1989 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1948 else { 1990 else {
1949 _PyUnicode_CONVERT_BYTES( 1991 _PyUnicode_CONVERT_BYTES(
1950 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1992 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1951 } 1993 }
1952 assert(_PyUnicode_CheckConsistency(res, 1)); 1994 assert(_PyUnicode_CheckConsistency(res, 1));
1953 return res; 1995 return res;
1954 } 1996 }
1955 1997
1956 static PyObject* 1998 static PyObject*
1957 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1999 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1958 { 2000 {
1959 PyObject *res; 2001 PyObject *res;
1960 Py_UCS4 max_char; 2002 Py_UCS4 max_char;
1961 2003
1962 if (size == 0) { 2004 if (size == 0)
1963 Py_INCREF(unicode_empty); 2005 _Py_RETURN_UNICODE_EMPTY();
1964 return unicode_empty;
1965 }
1966 assert(size > 0); 2006 assert(size > 0);
1967 if (size == 1) { 2007 if (size == 1) {
1968 Py_UCS4 ch = u[0]; 2008 Py_UCS4 ch = u[0];
2009 int kind;
2010 void *data;
1969 if (ch < 256) 2011 if (ch < 256)
1970 return get_latin1_char((unsigned char)ch); 2012 return get_latin1_char((unsigned char)ch);
1971 2013
1972 res = PyUnicode_New(1, ch); 2014 res = PyUnicode_New(1, ch);
1973 if (res == NULL) 2015 if (res == NULL)
1974 return NULL; 2016 return NULL;
1975 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 2017 kind = PyUnicode_KIND(res);
2018 data = PyUnicode_DATA(res);
2019 PyUnicode_WRITE(kind, data, 0, ch);
1976 assert(_PyUnicode_CheckConsistency(res, 1)); 2020 assert(_PyUnicode_CheckConsistency(res, 1));
1977 return res; 2021 return res;
1978 } 2022 }
1979 2023
1980 max_char = ucs4lib_find_max_char(u, u + size); 2024 max_char = ucs4lib_find_max_char(u, u + size);
1981 res = PyUnicode_New(size, max_char); 2025 res = PyUnicode_New(size, max_char);
1982 if (!res) 2026 if (!res)
1983 return NULL; 2027 return NULL;
1984 if (max_char < 256) 2028 if (max_char < 256)
1985 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
2082 return; 2126 return;
2083 } 2127 }
2084 else { 2128 else {
2085 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2086 assert(kind == PyUnicode_4BYTE_KIND); 2130 assert(kind == PyUnicode_4BYTE_KIND);
2087 max_char = ucs4lib_find_max_char(u, u + len); 2131 max_char = ucs4lib_find_max_char(u, u + len);
2088 if (max_char >= 0x10000) 2132 if (max_char >= 0x10000)
2089 return; 2133 return;
2090 } 2134 }
2091 copy = PyUnicode_New(len, max_char); 2135 copy = PyUnicode_New(len, max_char);
2092 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2093 Py_DECREF(unicode); 2138 Py_DECREF(unicode);
2094 *p_unicode = copy; 2139 *p_unicode = copy;
2095 } 2140 }
2096 2141
2097 PyObject* 2142 PyObject*
2098 _PyUnicode_Copy(PyObject *unicode) 2143 _PyUnicode_Copy(PyObject *unicode)
2099 { 2144 {
2100 Py_ssize_t length; 2145 Py_ssize_t length;
2101 PyObject *copy; 2146 PyObject *copy;
2102 2147
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after
2243 2288
2244 Py_UCS4* 2289 Py_UCS4*
2245 PyUnicode_AsUCS4Copy(PyObject *string) 2290 PyUnicode_AsUCS4Copy(PyObject *string)
2246 { 2291 {
2247 return as_ucs4(string, NULL, 0, 1); 2292 return as_ucs4(string, NULL, 0, 1);
2248 } 2293 }
2249 2294
2250 #ifdef HAVE_WCHAR_H 2295 #ifdef HAVE_WCHAR_H
2251 2296
2252 PyObject * 2297 PyObject *
2253 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2298 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2254 { 2299 {
2255 if (w == NULL) { 2300 if (w == NULL) {
2256 if (size == 0) { 2301 if (size == 0)
2257 Py_INCREF(unicode_empty); 2302 _Py_RETURN_UNICODE_EMPTY();
2258 return unicode_empty;
2259 }
2260 PyErr_BadInternalCall(); 2303 PyErr_BadInternalCall();
2261 return NULL; 2304 return NULL;
2262 } 2305 }
2263 2306
2264 if (size == -1) { 2307 if (size == -1) {
2265 size = wcslen(w); 2308 size = wcslen(w);
2266 } 2309 }
2267 2310
2268 return PyUnicode_FromUnicode(w, size); 2311 return PyUnicode_FromUnicode(w, size);
2269 } 2312 }
2270 2313
2271 #endif /* HAVE_WCHAR_H */ 2314 #endif /* HAVE_WCHAR_H */
2272 2315
2273 static void 2316 static void
2274 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2317 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2275 int zeropad, int width, int precision, char c) 2318 char c)
2276 { 2319 {
2277 *fmt++ = '%'; 2320 *fmt++ = '%';
2278 if (width) {
2279 if (zeropad)
2280 *fmt++ = '0';
2281 fmt += sprintf(fmt, "%d", width);
2282 }
2283 if (precision)
2284 fmt += sprintf(fmt, ".%d", precision);
2285 if (longflag) 2321 if (longflag)
2286 *fmt++ = 'l'; 2322 *fmt++ = 'l';
2287 else if (longlongflag) { 2323 else if (longlongflag) {
2288 /* longlongflag should only ever be nonzero on machines with 2324 /* longlongflag should only ever be nonzero on machines with
2289 HAVE_LONG_LONG defined */ 2325 HAVE_LONG_LONG defined */
2290 #ifdef HAVE_LONG_LONG 2326 #ifdef HAVE_LONG_LONG
2291 char *f = PY_FORMAT_LONG_LONG; 2327 char *f = PY_FORMAT_LONG_LONG;
2292 while (*f) 2328 while (*f)
2293 *fmt++ = *f++; 2329 *fmt++ = *f++;
2294 #else 2330 #else
2295 /* we shouldn't ever get here */ 2331 /* we shouldn't ever get here */
2296 assert(0); 2332 assert(0);
2297 *fmt++ = 'l'; 2333 *fmt++ = 'l';
2298 #endif 2334 #endif
2299 } 2335 }
2300 else if (size_tflag) { 2336 else if (size_tflag) {
2301 char *f = PY_FORMAT_SIZE_T; 2337 char *f = PY_FORMAT_SIZE_T;
2302 while (*f) 2338 while (*f)
2303 *fmt++ = *f++; 2339 *fmt++ = *f++;
2304 } 2340 }
2305 *fmt++ = c; 2341 *fmt++ = c;
2306 *fmt = '\0'; 2342 *fmt = '\0';
2307 } 2343 }
2308 2344
2309 /* helper for PyUnicode_FromFormatV() */ 2345 /* maximum number of characters required for output of %lld or %p.
2346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2348 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2349
2350 static int
2351 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352 Py_ssize_t width, Py_ssize_t precision)
2353 {
2354 Py_ssize_t length, fill, arglen;
2355 Py_UCS4 maxchar;
2356
2357 if (PyUnicode_READY(str) == -1)
2358 return -1;
2359
2360 length = PyUnicode_GET_LENGTH(str);
2361 if ((precision == -1 || precision >= length)
2362 && width <= length)
2363 return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365 if (precision != -1)
2366 length = Py_MIN(precision, length);
2367
2368 arglen = Py_MAX(length, width);
2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371 else
2372 maxchar = writer->maxchar;
2373
2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375 return -1;
2376
2377 if (width > length) {
2378 fill = width - length;
2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380 return -1;
2381 writer->pos += fill;
2382 }
2383
2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385 str, 0, length);
2386 writer->pos += length;
2387 return 0;
2388 }
2389
2390 static int
2391 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392 Py_ssize_t width, Py_ssize_t precision)
2393 {
2394 /* UTF-8 */
2395 Py_ssize_t length;
2396 PyObject *unicode;
2397 int res;
2398
2399 length = strlen(str);
2400 if (precision != -1)
2401 length = Py_MIN(length, precision);
2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403 if (unicode == NULL)
2404 return -1;
2405
2406 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407 Py_DECREF(unicode);
2408 return res;
2409 }
2310 2410
2311 static const char* 2411 static const char*
2312 parse_format_flags(const char *f, 2412 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2313 int *p_width, int *p_precision, 2413 const char *f, va_list *vargs)
2314 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2414 {
2315 { 2415 const char *p;
2316 int width, precision, longflag, longlongflag, size_tflag; 2416 Py_ssize_t len;
2417 int zeropad;
2418 Py_ssize_t width;
2419 Py_ssize_t precision;
2420 int longflag;
2421 int longlongflag;
2422 int size_tflag;
2423 Py_ssize_t fill;
2424
2425 p = f;
2426 f++;
2427 zeropad = 0;
2428 if (*f == '0') {
2429 zeropad = 1;
2430 f++;
2431 }
2317 2432
2318 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2433 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2319 f++; 2434 width = -1;
2320 width = 0; 2435 if (Py_ISDIGIT((unsigned)*f)) {
2321 while (Py_ISDIGIT((unsigned)*f)) 2436 width = *f - '0';
2322 width = (width*10) + *f++ - '0'; 2437 f++;
2323 precision = 0; 2438 while (Py_ISDIGIT((unsigned)*f)) {
2439 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2440 PyErr_SetString(PyExc_ValueError,
2441 "width too big");
2442 return NULL;
2443 }
2444 width = (width * 10) + (*f - '0');
2445 f++;
2446 }
2447 }
2448 precision = -1;
2324 if (*f == '.') { 2449 if (*f == '.') {
2325 f++; 2450 f++;
2326 while (Py_ISDIGIT((unsigned)*f)) 2451 if (Py_ISDIGIT((unsigned)*f)) {
2327 precision = (precision*10) + *f++ - '0'; 2452 precision = (*f - '0');
2453 f++;
2454 while (Py_ISDIGIT((unsigned)*f)) {
2455 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2456 PyErr_SetString(PyExc_ValueError,
2457 "precision too big");
2458 return NULL;
2459 }
2460 precision = (precision * 10) + (*f - '0');
2461 f++;
2462 }
2463 }
2328 if (*f == '%') { 2464 if (*f == '%') {
2329 /* "%.3%s" => f points to "3" */ 2465 /* "%.3%s" => f points to "3" */
2330 f--; 2466 f--;
2331 } 2467 }
2332 } 2468 }
2333 if (*f == '\0') { 2469 if (*f == '\0') {
2334 /* bogus format "%.1" => go backward, f points to "1" */ 2470 /* bogus format "%.123" => go backward, f points to "3" */
2335 f--; 2471 f--;
2336 } 2472 }
2337 if (p_width != NULL)
2338 *p_width = width;
2339 if (p_precision != NULL)
2340 *p_precision = precision;
2341 2473
2342 /* Handle %ld, %lu, %lld and %llu. */ 2474 /* Handle %ld, %lu, %lld and %llu. */
2343 longflag = 0; 2475 longflag = 0;
2344 longlongflag = 0; 2476 longlongflag = 0;
2345 size_tflag = 0; 2477 size_tflag = 0;
2346
2347 if (*f == 'l') { 2478 if (*f == 'l') {
2348 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2479 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2349 longflag = 1; 2480 longflag = 1;
2350 ++f; 2481 ++f;
2351 } 2482 }
2352 #ifdef HAVE_LONG_LONG 2483 #ifdef HAVE_LONG_LONG
2353 else if (f[1] == 'l' && 2484 else if (f[1] == 'l' &&
2354 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2485 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2355 longlongflag = 1; 2486 longlongflag = 1;
2356 f += 2; 2487 f += 2;
2357 } 2488 }
2358 #endif 2489 #endif
2359 } 2490 }
2360 /* handle the size_t flag. */ 2491 /* handle the size_t flag. */
2361 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2492 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2362 size_tflag = 1; 2493 size_tflag = 1;
2363 ++f; 2494 ++f;
2364 } 2495 }
2365 if (p_longflag != NULL) 2496
2366 *p_longflag = longflag; 2497 if (f[1] == '\0')
2367 if (p_longlongflag != NULL) 2498 writer->overallocate = 0;
2368 *p_longlongflag = longlongflag; 2499
2369 if (p_size_tflag != NULL) 2500 switch (*f) {
2370 *p_size_tflag = size_tflag; 2501 case 'c':
2502 {
2503 int ordinal = va_arg(*vargs, int);
2504 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2505 PyErr_SetString(PyExc_OverflowError,
2506 "character argument not in range(0x110000)");
2507 return NULL;
2508 }
2509 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2510 return NULL;
2511 break;
2512 }
2513
2514 case 'i':
2515 case 'd':
2516 case 'u':
2517 case 'x':
2518 {
2519 /* used by sprintf */
2520 char fmt[10]; /* should be enough for "%0lld\0" */
2521 char buffer[MAX_LONG_LONG_CHARS];
2522 Py_ssize_t arglen;
2523
2524 if (*f == 'u') {
2525 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2526
2527 if (longflag)
2528 len = sprintf(buffer, fmt,
2529 va_arg(*vargs, unsigned long));
2530 #ifdef HAVE_LONG_LONG
2531 else if (longlongflag)
2532 len = sprintf(buffer, fmt,
2533 va_arg(*vargs, unsigned PY_LONG_LONG));
2534 #endif
2535 else if (size_tflag)
2536 len = sprintf(buffer, fmt,
2537 va_arg(*vargs, size_t));
2538 else
2539 len = sprintf(buffer, fmt,
2540 va_arg(*vargs, unsigned int));
2541 }
2542 else if (*f == 'x') {
2543 makefmt(fmt, 0, 0, 0, 'x');
2544 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2545 }
2546 else {
2547 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2548
2549 if (longflag)
2550 len = sprintf(buffer, fmt,
2551 va_arg(*vargs, long));
2552 #ifdef HAVE_LONG_LONG
2553 else if (longlongflag)
2554 len = sprintf(buffer, fmt,
2555 va_arg(*vargs, PY_LONG_LONG));
2556 #endif
2557 else if (size_tflag)
2558 len = sprintf(buffer, fmt,
2559 va_arg(*vargs, Py_ssize_t));
2560 else
2561 len = sprintf(buffer, fmt,
2562 va_arg(*vargs, int));
2563 }
2564 assert(len >= 0);
2565
2566 if (precision < len)
2567 precision = len;
2568
2569 arglen = Py_MAX(precision, width);
2570 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2571 return NULL;
2572
2573 if (width > precision) {
2574 Py_UCS4 fillchar;
2575 fill = width - precision;
2576 fillchar = zeropad?'0':' ';
2577 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2578 return NULL;
2579 writer->pos += fill;
2580 }
2581 if (precision > len) {
2582 fill = precision - len;
2583 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2584 return NULL;
2585 writer->pos += fill;
2586 }
2587
2588 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2589 return NULL;
2590 break;
2591 }
2592
2593 case 'p':
2594 {
2595 char number[MAX_LONG_LONG_CHARS];
2596
2597 len = sprintf(number, "%p", va_arg(*vargs, void*));
2598 assert(len >= 0);
2599
2600 /* %p is ill-defined: ensure leading 0x. */
2601 if (number[1] == 'X')
2602 number[1] = 'x';
2603 else if (number[1] != 'x') {
2604 memmove(number + 2, number,
2605 strlen(number) + 1);
2606 number[0] = '0';
2607 number[1] = 'x';
2608 len += 2;
2609 }
2610
2611 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2612 return NULL;
2613 break;
2614 }
2615
2616 case 's':
2617 {
2618 /* UTF-8 */
2619 const char *s = va_arg(*vargs, const char*);
2620 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2621 return NULL;
2622 break;
2623 }
2624
2625 case 'U':
2626 {
2627 PyObject *obj = va_arg(*vargs, PyObject *);
2628 assert(obj && _PyUnicode_CHECK(obj));
2629
2630 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2631 return NULL;
2632 break;
2633 }
2634
2635 case 'V':
2636 {
2637 PyObject *obj = va_arg(*vargs, PyObject *);
2638 const char *str = va_arg(*vargs, const char *);
2639 if (obj) {
2640 assert(_PyUnicode_CHECK(obj));
2641 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2642 return NULL;
2643 }
2644 else {
2645 assert(str != NULL);
2646 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2647 return NULL;
2648 }
2649 break;
2650 }
2651
2652 case 'S':
2653 {
2654 PyObject *obj = va_arg(*vargs, PyObject *);
2655 PyObject *str;
2656 assert(obj);
2657 str = PyObject_Str(obj);
2658 if (!str)
2659 return NULL;
2660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2661 Py_DECREF(str);
2662 return NULL;
2663 }
2664 Py_DECREF(str);
2665 break;
2666 }
2667
2668 case 'R':
2669 {
2670 PyObject *obj = va_arg(*vargs, PyObject *);
2671 PyObject *repr;
2672 assert(obj);
2673 repr = PyObject_Repr(obj);
2674 if (!repr)
2675 return NULL;
2676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2677 Py_DECREF(repr);
2678 return NULL;
2679 }
2680 Py_DECREF(repr);
2681 break;
2682 }
2683
2684 case 'A':
2685 {
2686 PyObject *obj = va_arg(*vargs, PyObject *);
2687 PyObject *ascii;
2688 assert(obj);
2689 ascii = PyObject_ASCII(obj);
2690 if (!ascii)
2691 return NULL;
2692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2693 Py_DECREF(ascii);
2694 return NULL;
2695 }
2696 Py_DECREF(ascii);
2697 break;
2698 }
2699
2700 case '%':
2701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2702 return NULL;
2703 break;
2704
2705 default:
2706 /* if we stumble upon an unknown formatting code, copy the rest
2707 of the format string to the output string. (we cannot just
2708 skip the code, since there's no way to know what's in the
2709 argument list) */
2710 len = strlen(p);
2711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2712 return NULL;
2713 f = p+len;
2714 return f;
2715 }
2716
2717 f++;
2371 return f; 2718 return f;
2372 } 2719 }
2373
2374 /* maximum number of characters required for output of %ld. 21 characters
2375 allows for 64-bit integers (in decimal) and an optional sign. */
2376 #define MAX_LONG_CHARS 21
2377 /* maximum number of characters required for output of %lld.
2378 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2379 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2380 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2381 2720
2382 PyObject * 2721 PyObject *
2383 PyUnicode_FromFormatV(const char *format, va_list vargs) 2722 PyUnicode_FromFormatV(const char *format, va_list vargs)
2384 { 2723 {
2385 va_list count; 2724 va_list vargs2;
2386 Py_ssize_t callcount = 0; 2725 const char *f;
2387 PyObject **callresults = NULL; 2726 _PyUnicodeWriter writer;
2388 PyObject **callresult = NULL; 2727
2389 Py_ssize_t n = 0; 2728 _PyUnicodeWriter_Init(&writer);
2390 int width = 0; 2729 writer.min_length = strlen(format) + 100;
2391 int precision = 0; 2730 writer.overallocate = 1;
2392 int zeropad; 2731
2393 const char* f; 2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2394 PyObject *string; 2733 Copy it to be able to pass a reference to a subfunction. */
2395 /* used by sprintf */ 2734 Py_VA_COPY(vargs2, vargs);
2396 char fmt[61]; /* should be enough for %0width.precisionlld */ 2735
2397 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2736 for (f = format; *f; ) {
2398 Py_UCS4 argmaxchar;
2399 Py_ssize_t numbersize = 0;
2400 char *numberresults = NULL;
2401 char *numberresult = NULL;
2402 Py_ssize_t i;
2403 int kind;
2404 void *data;
2405
2406 Py_VA_COPY(count, vargs);
2407 /* step 1: count the number of %S/%R/%A/%s format specifications
2408 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2409 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2410 * result in an array)
2411 * also estimate a upper bound for all the number formats in the string,
2412 * numbers will be formatted in step 3 and be kept in a '\0'-separated
2413 * buffer before putting everything together. */
2414 for (f = format; *f; f++) {
2415 if (*f == '%') { 2737 if (*f == '%') {
2416 int longlongflag; 2738 f = unicode_fromformat_arg(&writer, f, &vargs2);
2417 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2739 if (f == NULL)
2418 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2740 goto fail;
2419 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2741 }
2420 ++callcount; 2742 else {
2421 2743 const char *p;
2422 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2744 Py_ssize_t len;
2423 #ifdef HAVE_LONG_LONG 2745
2424 if (longlongflag) { 2746 p = f;
2425 if (width < MAX_LONG_LONG_CHARS) 2747 do
2426 width = MAX_LONG_LONG_CHARS; 2748 {
2749 if ((unsigned char)*p > 127) {
2750 PyErr_Format(PyExc_ValueError,
2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752 "string, got a non-ASCII byte: 0x%02x",
2753 (unsigned char)*p);
2754 return NULL;
2427 } 2755 }
2428 else 2756 p++;
2429 #endif
2430 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2431 including sign. Decimal takes the most space. This
2432 isn't enough for octal. If a width is specified we
2433 need more (which we allocate later). */
2434 if (width < MAX_LONG_CHARS)
2435 width = MAX_LONG_CHARS;
2436
2437 /* account for the size + '\0' to separate numbers
2438 inside of the numberresults buffer */
2439 numbersize += (width + 1);
2440 } 2757 }
2441 } 2758 while (*p != '\0' && *p != '%');
2442 else if ((unsigned char)*f > 127) { 2759 len = p - f;
2443 PyErr_Format(PyExc_ValueError, 2760
2444 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2761 if (*p == '\0')
2445 "string, got a non-ASCII byte: 0x%02x", 2762 writer.overallocate = 0;
2446 (unsigned char)*f); 2763
2447 return NULL; 2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2448 } 2765 goto fail;
2449 } 2766
2450 /* step 2: allocate memory for the results of 2767 f = p;
2451 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2768 }
2452 if (callcount) { 2769 }
2453 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2770 return _PyUnicodeWriter_Finish(&writer);
2454 if (!callresults) { 2771
2455 PyErr_NoMemory();
2456 return NULL;
2457 }
2458 callresult = callresults;
2459 }
2460 /* step 2.5: allocate memory for the results of formating numbers */
2461 if (numbersize) {
2462 numberresults = PyObject_Malloc(numbersize);
2463 if (!numberresults) {
2464 PyErr_NoMemory();
2465 goto fail;
2466 }
2467 numberresult = numberresults;
2468 }
2469
2470 /* step 3: format numbers and figure out how large a buffer we need */
2471 for (f = format; *f; f++) {
2472 if (*f == '%') {
2473 const char* p;
2474 int longflag;
2475 int longlongflag;
2476 int size_tflag;
2477 int numprinted;
2478
2479 p = f;
2480 zeropad = (f[1] == '0');
2481 f = parse_format_flags(f, &width, &precision,
2482 &longflag, &longlongflag, &size_tflag);
2483 switch (*f) {
2484 case 'c':
2485 {
2486 Py_UCS4 ordinal = va_arg(count, int);
2487 maxchar = MAX_MAXCHAR(maxchar, ordinal);
2488 n++;
2489 break;
2490 }
2491 case '%':
2492 n++;
2493 break;
2494 case 'i':
2495 case 'd':
2496 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2497 width, precision, *f);
2498 if (longflag)
2499 numprinted = sprintf(numberresult, fmt,
2500 va_arg(count, long));
2501 #ifdef HAVE_LONG_LONG
2502 else if (longlongflag)
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, PY_LONG_LONG));
2505 #endif
2506 else if (size_tflag)
2507 numprinted = sprintf(numberresult, fmt,
2508 va_arg(count, Py_ssize_t));
2509 else
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, int));
2512 n += numprinted;
2513 /* advance by +1 to skip over the '\0' */
2514 numberresult += (numprinted + 1);
2515 assert(*(numberresult - 1) == '\0');
2516 assert(*(numberresult - 2) != '\0');
2517 assert(numprinted >= 0);
2518 assert(numberresult <= numberresults + numbersize);
2519 break;
2520 case 'u':
2521 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2522 width, precision, 'u');
2523 if (longflag)
2524 numprinted = sprintf(numberresult, fmt,
2525 va_arg(count, unsigned long));
2526 #ifdef HAVE_LONG_LONG
2527 else if (longlongflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned PY_LONG_LONG));
2530 #endif
2531 else if (size_tflag)
2532 numprinted = sprintf(numberresult, fmt,
2533 va_arg(count, size_t));
2534 else
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned int));
2537 n += numprinted;
2538 numberresult += (numprinted + 1);
2539 assert(*(numberresult - 1) == '\0');
2540 assert(*(numberresult - 2) != '\0');
2541 assert(numprinted >= 0);
2542 assert(numberresult <= numberresults + numbersize);
2543 break;
2544 case 'x':
2545 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2546 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2547 n += numprinted;
2548 numberresult += (numprinted + 1);
2549 assert(*(numberresult - 1) == '\0');
2550 assert(*(numberresult - 2) != '\0');
2551 assert(numprinted >= 0);
2552 assert(numberresult <= numberresults + numbersize);
2553 break;
2554 case 'p':
2555 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2556 /* %p is ill-defined: ensure leading 0x. */
2557 if (numberresult[1] == 'X')
2558 numberresult[1] = 'x';
2559 else if (numberresult[1] != 'x') {
2560 memmove(numberresult + 2, numberresult,
2561 strlen(numberresult) + 1);
2562 numberresult[0] = '0';
2563 numberresult[1] = 'x';
2564 numprinted += 2;
2565 }
2566 n += numprinted;
2567 numberresult += (numprinted + 1);
2568 assert(*(numberresult - 1) == '\0');
2569 assert(*(numberresult - 2) != '\0');
2570 assert(numprinted >= 0);
2571 assert(numberresult <= numberresults + numbersize);
2572 break;
2573 case 's':
2574 {
2575 /* UTF-8 */
2576 const char *s = va_arg(count, const char*);
2577 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2578 if (!str)
2579 goto fail;
2580 /* since PyUnicode_DecodeUTF8 returns already flexible
2581 unicode objects, there is no need to call ready on them */
2582 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2583 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2584 n += PyUnicode_GET_LENGTH(str);
2585 /* Remember the str and switch to the next slot */
2586 *callresult++ = str;
2587 break;
2588 }
2589 case 'U':
2590 {
2591 PyObject *obj = va_arg(count, PyObject *);
2592 assert(obj && _PyUnicode_CHECK(obj));
2593 if (PyUnicode_READY(obj) == -1)
2594 goto fail;
2595 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2596 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2597 n += PyUnicode_GET_LENGTH(obj);
2598 break;
2599 }
2600 case 'V':
2601 {
2602 PyObject *obj = va_arg(count, PyObject *);
2603 const char *str = va_arg(count, const char *);
2604 PyObject *str_obj;
2605 assert(obj || str);
2606 assert(!obj || _PyUnicode_CHECK(obj));
2607 if (obj) {
2608 if (PyUnicode_READY(obj) == -1)
2609 goto fail;
2610 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2611 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2612 n += PyUnicode_GET_LENGTH(obj);
2613 *callresult++ = NULL;
2614 }
2615 else {
2616 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2617 if (!str_obj)
2618 goto fail;
2619 if (PyUnicode_READY(str_obj) == -1) {
2620 Py_DECREF(str_obj);
2621 goto fail;
2622 }
2623 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2624 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2625 n += PyUnicode_GET_LENGTH(str_obj);
2626 *callresult++ = str_obj;
2627 }
2628 break;
2629 }
2630 case 'S':
2631 {
2632 PyObject *obj = va_arg(count, PyObject *);
2633 PyObject *str;
2634 assert(obj);
2635 str = PyObject_Str(obj);
2636 if (!str)
2637 goto fail;
2638 if (PyUnicode_READY(str) == -1) {
2639 Py_DECREF(str);
2640 goto fail;
2641 }
2642 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2643 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2644 n += PyUnicode_GET_LENGTH(str);
2645 /* Remember the str and switch to the next slot */
2646 *callresult++ = str;
2647 break;
2648 }
2649 case 'R':
2650 {
2651 PyObject *obj = va_arg(count, PyObject *);
2652 PyObject *repr;
2653 assert(obj);
2654 repr = PyObject_Repr(obj);
2655 if (!repr)
2656 goto fail;
2657 if (PyUnicode_READY(repr) == -1) {
2658 Py_DECREF(repr);
2659 goto fail;
2660 }
2661 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2662 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2663 n += PyUnicode_GET_LENGTH(repr);
2664 /* Remember the repr and switch to the next slot */
2665 *callresult++ = repr;
2666 break;
2667 }
2668 case 'A':
2669 {
2670 PyObject *obj = va_arg(count, PyObject *);
2671 PyObject *ascii;
2672 assert(obj);
2673 ascii = PyObject_ASCII(obj);
2674 if (!ascii)
2675 goto fail;
2676 if (PyUnicode_READY(ascii) == -1) {
2677 Py_DECREF(ascii);
2678 goto fail;
2679 }
2680 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2681 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2682 n += PyUnicode_GET_LENGTH(ascii);
2683 /* Remember the repr and switch to the next slot */
2684 *callresult++ = ascii;
2685 break;
2686 }
2687 default:
2688 /* if we stumble upon an unknown
2689 formatting code, copy the rest of
2690 the format string to the output
2691 string. (we cannot just skip the
2692 code, since there's no way to know
2693 what's in the argument list) */
2694 n += strlen(p);
2695 goto expand;
2696 }
2697 } else
2698 n++;
2699 }
2700 expand:
2701 /* step 4: fill the buffer */
2702 /* Since we've analyzed how much space we need,
2703 we don't have to resize the string.
2704 There can be no errors beyond this point. */
2705 string = PyUnicode_New(n, maxchar);
2706 if (!string)
2707 goto fail;
2708 kind = PyUnicode_KIND(string);
2709 data = PyUnicode_DATA(string);
2710 callresult = callresults;
2711 numberresult = numberresults;
2712
2713 for (i = 0, f = format; *f; f++) {
2714 if (*f == '%') {
2715 const char* p;
2716
2717 p = f;
2718 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2719 /* checking for == because the last argument could be a empty
2720 string, which causes i to point to end, the assert at the end of
2721 the loop */
2722 assert(i <= PyUnicode_GET_LENGTH(string));
2723
2724 switch (*f) {
2725 case 'c':
2726 {
2727 const int ordinal = va_arg(vargs, int);
2728 PyUnicode_WRITE(kind, data, i++, ordinal);
2729 break;
2730 }
2731 case 'i':
2732 case 'd':
2733 case 'u':
2734 case 'x':
2735 case 'p':
2736 {
2737 Py_ssize_t written;
2738 /* unused, since we already have the result */
2739 if (*f == 'p')
2740 (void) va_arg(vargs, void *);
2741 else
2742 (void) va_arg(vargs, int);
2743 /* extract the result from numberresults and append. */
2744 written = unicode_write_cstr(string, i, numberresult);
2745 /* skip over the separating '\0' */
2746 i += written;
2747 numberresult += written;
2748 assert(*numberresult == '\0');
2749 numberresult++;
2750 assert(numberresult <= numberresults + numbersize);
2751 break;
2752 }
2753 case 's':
2754 {
2755 /* unused, since we already have the result */
2756 Py_ssize_t size;
2757 (void) va_arg(vargs, char *);
2758 size = PyUnicode_GET_LENGTH(*callresult);
2759 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2760 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2761 i += size;
2762 /* We're done with the unicode()/repr() => forget it */
2763 Py_DECREF(*callresult);
2764 /* switch to next unicode()/repr() result */
2765 ++callresult;
2766 break;
2767 }
2768 case 'U':
2769 {
2770 PyObject *obj = va_arg(vargs, PyObject *);
2771 Py_ssize_t size;
2772 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2773 size = PyUnicode_GET_LENGTH(obj);
2774 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2775 i += size;
2776 break;
2777 }
2778 case 'V':
2779 {
2780 Py_ssize_t size;
2781 PyObject *obj = va_arg(vargs, PyObject *);
2782 va_arg(vargs, const char *);
2783 if (obj) {
2784 size = PyUnicode_GET_LENGTH(obj);
2785 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2786 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2787 i += size;
2788 } else {
2789 size = PyUnicode_GET_LENGTH(*callresult);
2790 assert(PyUnicode_KIND(*callresult) <=
2791 PyUnicode_KIND(string));
2792 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2793 i += size;
2794 Py_DECREF(*callresult);
2795 }
2796 ++callresult;
2797 break;
2798 }
2799 case 'S':
2800 case 'R':
2801 case 'A':
2802 {
2803 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2804 /* unused, since we already have the result */
2805 (void) va_arg(vargs, PyObject *);
2806 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2807 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2808 i += size;
2809 /* We're done with the unicode()/repr() => forget it */
2810 Py_DECREF(*callresult);
2811 /* switch to next unicode()/repr() result */
2812 ++callresult;
2813 break;
2814 }
2815 case '%':
2816 PyUnicode_WRITE(kind, data, i++, '%');
2817 break;
2818 default:
2819 i += unicode_write_cstr(string, i, p);
2820 assert(i == PyUnicode_GET_LENGTH(string));
2821 goto end;
2822 }
2823 }
2824 else {
2825 assert(i < PyUnicode_GET_LENGTH(string));
2826 PyUnicode_WRITE(kind, data, i++, *f);
2827 }
2828 }
2829 assert(i == PyUnicode_GET_LENGTH(string));
2830
2831 end:
2832 if (callresults)
2833 PyObject_Free(callresults);
2834 if (numberresults)
2835 PyObject_Free(numberresults);
2836 return unicode_result(string);
2837 fail: 2772 fail:
2838 if (callresults) { 2773 _PyUnicodeWriter_Dealloc(&writer);
2839 PyObject **callresult2 = callresults;
2840 while (callresult2 < callresult) {
2841 Py_XDECREF(*callresult2);
2842 ++callresult2;
2843 }
2844 PyObject_Free(callresults);
2845 }
2846 if (numberresults)
2847 PyObject_Free(numberresults);
2848 return NULL; 2774 return NULL;
2849 } 2775 }
2850 2776
2851 PyObject * 2777 PyObject *
2852 PyUnicode_FromFormat(const char *format, ...) 2778 PyUnicode_FromFormat(const char *format, ...)
2853 { 2779 {
2854 PyObject* ret; 2780 PyObject* ret;
2855 va_list vargs; 2781 va_list vargs;
2856 2782
2857 #ifdef HAVE_STDARG_PROTOTYPES 2783 #ifdef HAVE_STDARG_PROTOTYPES
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
2930 PyErr_NoMemory(); 2856 PyErr_NoMemory();
2931 return NULL; 2857 return NULL;
2932 } 2858 }
2933 2859
2934 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2935 if (buffer == NULL) { 2861 if (buffer == NULL) {
2936 PyErr_NoMemory(); 2862 PyErr_NoMemory();
2937 return NULL; 2863 return NULL;
2938 } 2864 }
2939 buflen = unicode_aswidechar(unicode, buffer, buflen); 2865 buflen = unicode_aswidechar(unicode, buffer, buflen);
2940 if (buflen == -1) 2866 if (buflen == -1) {
2941 return NULL; 2867 PyMem_FREE(buffer);
2868 return NULL;
2869 }
2942 if (size != NULL) 2870 if (size != NULL)
2943 *size = buflen; 2871 *size = buflen;
2944 return buffer; 2872 return buffer;
2945 } 2873 }
2946 2874
2947 #endif /* HAVE_WCHAR_H */ 2875 #endif /* HAVE_WCHAR_H */
2948 2876
2949 PyObject * 2877 PyObject *
2950 PyUnicode_FromOrdinal(int ordinal) 2878 PyUnicode_FromOrdinal(int ordinal)
2951 { 2879 {
2952 PyObject *v; 2880 PyObject *v;
2881 void *data;
2882 int kind;
2883
2953 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2954 PyErr_SetString(PyExc_ValueError, 2885 PyErr_SetString(PyExc_ValueError,
2955 "chr() arg not in range(0x110000)"); 2886 "chr() arg not in range(0x110000)");
2956 return NULL; 2887 return NULL;
2957 } 2888 }
2958 2889
2959 if (ordinal < 256) 2890 if ((Py_UCS4)ordinal < 256)
2960 return get_latin1_char(ordinal); 2891 return get_latin1_char((unsigned char)ordinal);
2961 2892
2962 v = PyUnicode_New(1, ordinal); 2893 v = PyUnicode_New(1, ordinal);
2963 if (v == NULL) 2894 if (v == NULL)
2964 return NULL; 2895 return NULL;
2965 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
2966 assert(_PyUnicode_CheckConsistency(v, 1)); 2899 assert(_PyUnicode_CheckConsistency(v, 1));
2967 return v; 2900 return v;
2968 } 2901 }
2969 2902
2970 PyObject * 2903 PyObject *
2971 PyUnicode_FromObject(register PyObject *obj) 2904 PyUnicode_FromObject(PyObject *obj)
2972 { 2905 {
2973 /* XXX Perhaps we should make this API an alias of 2906 /* XXX Perhaps we should make this API an alias of
2974 PyObject_Str() instead ?! */ 2907 PyObject_Str() instead ?! */
2975 if (PyUnicode_CheckExact(obj)) { 2908 if (PyUnicode_CheckExact(obj)) {
2976 if (PyUnicode_READY(obj) == -1) 2909 if (PyUnicode_READY(obj) == -1)
2977 return NULL; 2910 return NULL;
2978 Py_INCREF(obj); 2911 Py_INCREF(obj);
2979 return obj; 2912 return obj;
2980 } 2913 }
2981 if (PyUnicode_Check(obj)) { 2914 if (PyUnicode_Check(obj)) {
2982 /* For a Unicode subtype that's not a Unicode object, 2915 /* For a Unicode subtype that's not a Unicode object,
2983 return a true Unicode object with the same data. */ 2916 return a true Unicode object with the same data. */
2984 return _PyUnicode_Copy(obj); 2917 return _PyUnicode_Copy(obj);
2985 } 2918 }
2986 PyErr_Format(PyExc_TypeError, 2919 PyErr_Format(PyExc_TypeError,
2987 "Can't convert '%.100s' object to str implicitly", 2920 "Can't convert '%.100s' object to str implicitly",
2988 Py_TYPE(obj)->tp_name); 2921 Py_TYPE(obj)->tp_name);
2989 return NULL; 2922 return NULL;
2990 } 2923 }
2991 2924
2992 PyObject * 2925 PyObject *
2993 PyUnicode_FromEncodedObject(register PyObject *obj, 2926 PyUnicode_FromEncodedObject(PyObject *obj,
2994 const char *encoding, 2927 const char *encoding,
2995 const char *errors) 2928 const char *errors)
2996 { 2929 {
2997 Py_buffer buffer; 2930 Py_buffer buffer;
2998 PyObject *v; 2931 PyObject *v;
2999 2932
3000 if (obj == NULL) { 2933 if (obj == NULL) {
3001 PyErr_BadInternalCall(); 2934 PyErr_BadInternalCall();
3002 return NULL; 2935 return NULL;
3003 } 2936 }
3004 2937
3005 /* Decoding bytes objects is the most common case and should be fast */ 2938 /* Decoding bytes objects is the most common case and should be fast */
3006 if (PyBytes_Check(obj)) { 2939 if (PyBytes_Check(obj)) {
3007 if (PyBytes_GET_SIZE(obj) == 0) { 2940 if (PyBytes_GET_SIZE(obj) == 0)
3008 Py_INCREF(unicode_empty); 2941 _Py_RETURN_UNICODE_EMPTY();
3009 v = unicode_empty; 2942 v = PyUnicode_Decode(
3010 } 2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3011 else { 2944 encoding, errors);
3012 v = PyUnicode_Decode(
3013 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3014 encoding, errors);
3015 }
3016 return v; 2945 return v;
3017 } 2946 }
3018 2947
3019 if (PyUnicode_Check(obj)) { 2948 if (PyUnicode_Check(obj)) {
3020 PyErr_SetString(PyExc_TypeError, 2949 PyErr_SetString(PyExc_TypeError,
3021 "decoding str is not supported"); 2950 "decoding str is not supported");
3022 return NULL; 2951 return NULL;
3023 } 2952 }
3024 2953
3025 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3026 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3027 PyErr_Format(PyExc_TypeError, 2956 PyErr_Format(PyExc_TypeError,
3028 "coercing to str: need bytes, bytearray " 2957 "coercing to str: need bytes, bytearray "
3029 "or buffer-like object, %.80s found", 2958 "or buffer-like object, %.80s found",
3030 Py_TYPE(obj)->tp_name); 2959 Py_TYPE(obj)->tp_name);
3031 return NULL; 2960 return NULL;
3032 } 2961 }
3033 2962
3034 if (buffer.len == 0) { 2963 if (buffer.len == 0) {
3035 Py_INCREF(unicode_empty); 2964 PyBuffer_Release(&buffer);
3036 v = unicode_empty; 2965 _Py_RETURN_UNICODE_EMPTY();
3037 } 2966 }
3038 else 2967
3039 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3040
3041 PyBuffer_Release(&buffer); 2969 PyBuffer_Release(&buffer);
3042 return v; 2970 return v;
3043 } 2971 }
3044 2972
3045 /* Convert encoding to lower case and replace '_' with '-' in order to 2973 /* Convert encoding to lower case and replace '_' with '-' in order to
3046 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2974 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
3047 1 on success. */ 2975 1 on success. */
3048 static int 2976 int
3049 normalize_encoding(const char *encoding, 2977 _Py_normalize_encoding(const char *encoding,
3050 char *lower, 2978 char *lower,
3051 size_t lower_len) 2979 size_t lower_len)
3052 { 2980 {
3053 const char *e; 2981 const char *e;
3054 char *l; 2982 char *l;
3055 char *l_end; 2983 char *l_end;
3056 2984
3057 if (encoding == NULL) { 2985 if (encoding == NULL) {
2986 /* 6 == strlen("utf-8") + 1 */
2987 if (lower_len < 6)
2988 return 0;
3058 strcpy(lower, "utf-8"); 2989 strcpy(lower, "utf-8");
3059 return 1; 2990 return 1;
3060 } 2991 }
3061 e = encoding; 2992 e = encoding;
3062 l = lower; 2993 l = lower;
3063 l_end = &lower[lower_len - 1]; 2994 l_end = &lower[lower_len - 1];
3064 while (*e) { 2995 while (*e) {
3065 if (l == l_end) 2996 if (l == l_end)
3066 return 0; 2997 return 0;
3067 if (Py_ISUPPER(*e)) { 2998 if (Py_ISUPPER(*e)) {
(...skipping 15 matching lines...) Expand all
3083 PyUnicode_Decode(const char *s, 3014 PyUnicode_Decode(const char *s,
3084 Py_ssize_t size, 3015 Py_ssize_t size,
3085 const char *encoding, 3016 const char *encoding,
3086 const char *errors) 3017 const char *errors)
3087 { 3018 {
3088 PyObject *buffer = NULL, *unicode; 3019 PyObject *buffer = NULL, *unicode;
3089 Py_buffer info; 3020 Py_buffer info;
3090 char lower[11]; /* Enough for any encoding shortcut */ 3021 char lower[11]; /* Enough for any encoding shortcut */
3091 3022
3092 /* Shortcuts for common default encodings */ 3023 /* Shortcuts for common default encodings */
3093 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3024 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3094 if ((strcmp(lower, "utf-8") == 0) || 3025 if ((strcmp(lower, "utf-8") == 0) ||
3095 (strcmp(lower, "utf8") == 0)) 3026 (strcmp(lower, "utf8") == 0))
3096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3027 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3097 else if ((strcmp(lower, "latin-1") == 0) || 3028 else if ((strcmp(lower, "latin-1") == 0) ||
3098 (strcmp(lower, "latin1") == 0) || 3029 (strcmp(lower, "latin1") == 0) ||
3099 (strcmp(lower, "iso-8859-1") == 0)) 3030 (strcmp(lower, "iso-8859-1") == 0) ||
3031 (strcmp(lower, "iso8859-1") == 0))
3100 return PyUnicode_DecodeLatin1(s, size, errors); 3032 return PyUnicode_DecodeLatin1(s, size, errors);
3101 #ifdef HAVE_MBCS 3033 #ifdef HAVE_MBCS
3102 else if (strcmp(lower, "mbcs") == 0) 3034 else if (strcmp(lower, "mbcs") == 0)
3103 return PyUnicode_DecodeMBCS(s, size, errors); 3035 return PyUnicode_DecodeMBCS(s, size, errors);
3104 #endif 3036 #endif
3105 else if (strcmp(lower, "ascii") == 0) 3037 else if (strcmp(lower, "ascii") == 0)
3106 return PyUnicode_DecodeASCII(s, size, errors); 3038 return PyUnicode_DecodeASCII(s, size, errors);
3107 else if (strcmp(lower, "utf-16") == 0) 3039 else if (strcmp(lower, "utf-16") == 0)
3108 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3040 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3109 else if (strcmp(lower, "utf-32") == 0) 3041 else if (strcmp(lower, "utf-32") == 0)
3110 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3042 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3111 } 3043 }
3112 3044
3113 /* Decode via the codec registry */ 3045 /* Decode via the codec registry */
3114 buffer = NULL; 3046 buffer = NULL;
3115 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3047 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3116 goto onError; 3048 goto onError;
3117 buffer = PyMemoryView_FromBuffer(&info); 3049 buffer = PyMemoryView_FromBuffer(&info);
3118 if (buffer == NULL) 3050 if (buffer == NULL)
3119 goto onError; 3051 goto onError;
3120 unicode = PyCodec_Decode(buffer, encoding, errors); 3052 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3121 if (unicode == NULL) 3053 if (unicode == NULL)
3122 goto onError; 3054 goto onError;
3123 if (!PyUnicode_Check(unicode)) { 3055 if (!PyUnicode_Check(unicode)) {
3124 PyErr_Format(PyExc_TypeError, 3056 PyErr_Format(PyExc_TypeError,
3125 "decoder did not return a str object (type=%.400s)", 3057 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3126 Py_TYPE(unicode)->tp_name); 3058 "use codecs.decode() to decode to arbitrary types",
3059 encoding,
3060 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3127 Py_DECREF(unicode); 3061 Py_DECREF(unicode);
3128 goto onError; 3062 goto onError;
3129 } 3063 }
3130 Py_DECREF(buffer); 3064 Py_DECREF(buffer);
3131 return unicode_result(unicode); 3065 return unicode_result(unicode);
3132 3066
3133 onError: 3067 onError:
3134 Py_XDECREF(buffer); 3068 Py_XDECREF(buffer);
3135 return NULL; 3069 return NULL;
3136 } 3070 }
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
3174 3108
3175 if (encoding == NULL) 3109 if (encoding == NULL)
3176 encoding = PyUnicode_GetDefaultEncoding(); 3110 encoding = PyUnicode_GetDefaultEncoding();
3177 3111
3178 /* Decode via the codec registry */ 3112 /* Decode via the codec registry */
3179 v = PyCodec_Decode(unicode, encoding, errors); 3113 v = PyCodec_Decode(unicode, encoding, errors);
3180 if (v == NULL) 3114 if (v == NULL)
3181 goto onError; 3115 goto onError;
3182 if (!PyUnicode_Check(v)) { 3116 if (!PyUnicode_Check(v)) {
3183 PyErr_Format(PyExc_TypeError, 3117 PyErr_Format(PyExc_TypeError,
3184 "decoder did not return a str object (type=%.400s)", 3118 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3185 Py_TYPE(v)->tp_name); 3119 "use codecs.decode() to decode to arbitrary types",
3120 encoding,
3121 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3186 Py_DECREF(v); 3122 Py_DECREF(v);
3187 goto onError; 3123 goto onError;
3188 } 3124 }
3189 return unicode_result(v); 3125 return unicode_result(v);
3190 3126
3191 onError: 3127 onError:
3192 return NULL; 3128 return NULL;
3193 } 3129 }
3194 3130
3195 PyObject * 3131 PyObject *
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after
3303 return -1; 3239 return -1;
3304 } 3240 }
3305 3241
3306 PyObject * 3242 PyObject *
3307 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3243 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3308 { 3244 {
3309 Py_ssize_t wlen, wlen2; 3245 Py_ssize_t wlen, wlen2;
3310 wchar_t *wstr; 3246 wchar_t *wstr;
3311 PyObject *bytes = NULL; 3247 PyObject *bytes = NULL;
3312 char *errmsg; 3248 char *errmsg;
3313 PyObject *reason; 3249 PyObject *reason = NULL;
3314 PyObject *exc; 3250 PyObject *exc;
3315 size_t error_pos; 3251 size_t error_pos;
3316 int surrogateescape; 3252 int surrogateescape;
3317 3253
3318 if (locale_error_handler(errors, &surrogateescape) < 0) 3254 if (locale_error_handler(errors, &surrogateescape) < 0)
3319 return NULL; 3255 return NULL;
3320 3256
3321 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3257 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3322 if (wstr == NULL) 3258 if (wstr == NULL)
3323 return NULL; 3259 return NULL;
3324 3260
3325 wlen2 = wcslen(wstr); 3261 wlen2 = wcslen(wstr);
3326 if (wlen2 != wlen) { 3262 if (wlen2 != wlen) {
3327 PyMem_Free(wstr); 3263 PyMem_Free(wstr);
3328 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3264 PyErr_SetString(PyExc_TypeError, "embedded null character");
3329 return NULL; 3265 return NULL;
3330 } 3266 }
3331 3267
3332 if (surrogateescape) { 3268 if (surrogateescape) {
3333 /* locale encoding with surrogateescape */ 3269 /* "surrogateescape" error handler */
3334 char *str; 3270 char *str;
3335 3271
3336 str = _Py_wchar2char(wstr, &error_pos); 3272 str = _Py_wchar2char(wstr, &error_pos);
3337 if (str == NULL) { 3273 if (str == NULL) {
3338 if (error_pos == (size_t)-1) { 3274 if (error_pos == (size_t)-1) {
3339 PyErr_NoMemory(); 3275 PyErr_NoMemory();
3340 PyMem_Free(wstr); 3276 PyMem_Free(wstr);
3341 return NULL; 3277 return NULL;
3342 } 3278 }
3343 else { 3279 else {
3344 goto encode_error; 3280 goto encode_error;
3345 } 3281 }
3346 } 3282 }
3347 PyMem_Free(wstr); 3283 PyMem_Free(wstr);
3348 3284
3349 bytes = PyBytes_FromString(str); 3285 bytes = PyBytes_FromString(str);
3350 PyMem_Free(str); 3286 PyMem_Free(str);
3351 } 3287 }
3352 else { 3288 else {
3289 /* strict mode */
3353 size_t len, len2; 3290 size_t len, len2;
3354 3291
3355 len = wcstombs(NULL, wstr, 0); 3292 len = wcstombs(NULL, wstr, 0);
3356 if (len == (size_t)-1) { 3293 if (len == (size_t)-1) {
3357 error_pos = (size_t)-1; 3294 error_pos = (size_t)-1;
3358 goto encode_error; 3295 goto encode_error;
3359 } 3296 }
3360 3297
3361 bytes = PyBytes_FromStringAndSize(NULL, len); 3298 bytes = PyBytes_FromStringAndSize(NULL, len);
3362 if (bytes == NULL) { 3299 if (bytes == NULL) {
(...skipping 18 matching lines...) Expand all
3381 error_pos = wcstombs_errorpos(wstr); 3318 error_pos = wcstombs_errorpos(wstr);
3382 3319
3383 PyMem_Free(wstr); 3320 PyMem_Free(wstr);
3384 Py_XDECREF(bytes); 3321 Py_XDECREF(bytes);
3385 3322
3386 if (errmsg != NULL) { 3323 if (errmsg != NULL) {
3387 size_t errlen; 3324 size_t errlen;
3388 wstr = _Py_char2wchar(errmsg, &errlen); 3325 wstr = _Py_char2wchar(errmsg, &errlen);
3389 if (wstr != NULL) { 3326 if (wstr != NULL) {
3390 reason = PyUnicode_FromWideChar(wstr, errlen); 3327 reason = PyUnicode_FromWideChar(wstr, errlen);
3391 PyMem_Free(wstr); 3328 PyMem_RawFree(wstr);
3392 } else 3329 } else
3393 errmsg = NULL; 3330 errmsg = NULL;
3394 } 3331 }
3395 if (errmsg == NULL) 3332 if (errmsg == NULL)
3396 reason = PyUnicode_FromString( 3333 reason = PyUnicode_FromString(
3397 "wcstombs() encountered an unencodable " 3334 "wcstombs() encountered an unencodable "
3398 "wide character"); 3335 "wide character");
3399 if (reason == NULL) 3336 if (reason == NULL)
3400 return NULL; 3337 return NULL;
3401 3338
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
3448 { 3385 {
3449 PyObject *v; 3386 PyObject *v;
3450 char lower[11]; /* Enough for any encoding shortcut */ 3387 char lower[11]; /* Enough for any encoding shortcut */
3451 3388
3452 if (!PyUnicode_Check(unicode)) { 3389 if (!PyUnicode_Check(unicode)) {
3453 PyErr_BadArgument(); 3390 PyErr_BadArgument();
3454 return NULL; 3391 return NULL;
3455 } 3392 }
3456 3393
3457 /* Shortcuts for common default encodings */ 3394 /* Shortcuts for common default encodings */
3458 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3395 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3459 if ((strcmp(lower, "utf-8") == 0) || 3396 if ((strcmp(lower, "utf-8") == 0) ||
3460 (strcmp(lower, "utf8") == 0)) 3397 (strcmp(lower, "utf8") == 0))
3461 { 3398 {
3462 if (errors == NULL || strcmp(errors, "strict") == 0) 3399 if (errors == NULL || strcmp(errors, "strict") == 0)
3463 return _PyUnicode_AsUTF8String(unicode, NULL); 3400 return _PyUnicode_AsUTF8String(unicode, NULL);
3464 else 3401 else
3465 return _PyUnicode_AsUTF8String(unicode, errors); 3402 return _PyUnicode_AsUTF8String(unicode, errors);
3466 } 3403 }
3467 else if ((strcmp(lower, "latin-1") == 0) || 3404 else if ((strcmp(lower, "latin-1") == 0) ||
3468 (strcmp(lower, "latin1") == 0) || 3405 (strcmp(lower, "latin1") == 0) ||
3469 (strcmp(lower, "iso-8859-1") == 0)) 3406 (strcmp(lower, "iso-8859-1") == 0) ||
3407 (strcmp(lower, "iso8859-1") == 0))
3470 return _PyUnicode_AsLatin1String(unicode, errors); 3408 return _PyUnicode_AsLatin1String(unicode, errors);
3471 #ifdef HAVE_MBCS 3409 #ifdef HAVE_MBCS
3472 else if (strcmp(lower, "mbcs") == 0) 3410 else if (strcmp(lower, "mbcs") == 0)
3473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3411 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3474 #endif 3412 #endif
3475 else if (strcmp(lower, "ascii") == 0) 3413 else if (strcmp(lower, "ascii") == 0)
3476 return _PyUnicode_AsASCIIString(unicode, errors); 3414 return _PyUnicode_AsASCIIString(unicode, errors);
3477 } 3415 }
3478 3416
3479 /* Encode via the codec registry */ 3417 /* Encode via the codec registry */
3480 v = PyCodec_Encode(unicode, encoding, errors); 3418 v = _PyCodec_EncodeText(unicode, encoding, errors);
3481 if (v == NULL) 3419 if (v == NULL)
3482 return NULL; 3420 return NULL;
3483 3421
3484 /* The normal path */ 3422 /* The normal path */
3485 if (PyBytes_Check(v)) 3423 if (PyBytes_Check(v))
3486 return v; 3424 return v;
3487 3425
3488 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3426 /* If the codec returns a buffer, raise a warning and convert to bytes */
3489 if (PyByteArray_Check(v)) { 3427 if (PyByteArray_Check(v)) {
3490 int error; 3428 int error;
3491 PyObject *b; 3429 PyObject *b;
3492 3430
3493 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3431 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3494 "encoder %s returned bytearray instead of bytes", 3432 "encoder %s returned bytearray instead of bytes; "
3433 "use codecs.encode() to encode to arbitrary types",
3495 encoding); 3434 encoding);
3496 if (error) { 3435 if (error) {
3497 Py_DECREF(v); 3436 Py_DECREF(v);
3498 return NULL; 3437 return NULL;
3499 } 3438 }
3500 3439
3501 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3502 Py_DECREF(v); 3441 Py_DECREF(v);
3503 return b; 3442 return b;
3504 } 3443 }
3505 3444
3506 PyErr_Format(PyExc_TypeError, 3445 PyErr_Format(PyExc_TypeError,
3507 "encoder did not return a bytes object (type=%.400s)", 3446 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3508 Py_TYPE(v)->tp_name); 3447 "use codecs.encode() to encode to arbitrary types",
3448 encoding,
3449 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3509 Py_DECREF(v); 3450 Py_DECREF(v);
3510 return NULL; 3451 return NULL;
3511 } 3452 }
3512 3453
3513 PyObject * 3454 PyObject *
3514 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3455 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3515 const char *encoding, 3456 const char *encoding,
3516 const char *errors) 3457 const char *errors)
3517 { 3458 {
3518 PyObject *v; 3459 PyObject *v;
3519 3460
3520 if (!PyUnicode_Check(unicode)) { 3461 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument(); 3462 PyErr_BadArgument();
3522 goto onError; 3463 goto onError;
3523 } 3464 }
3524 3465
3525 if (encoding == NULL) 3466 if (encoding == NULL)
3526 encoding = PyUnicode_GetDefaultEncoding(); 3467 encoding = PyUnicode_GetDefaultEncoding();
3527 3468
3528 /* Encode via the codec registry */ 3469 /* Encode via the codec registry */
3529 v = PyCodec_Encode(unicode, encoding, errors); 3470 v = PyCodec_Encode(unicode, encoding, errors);
3530 if (v == NULL) 3471 if (v == NULL)
3531 goto onError; 3472 goto onError;
3532 if (!PyUnicode_Check(v)) { 3473 if (!PyUnicode_Check(v)) {
3533 PyErr_Format(PyExc_TypeError, 3474 PyErr_Format(PyExc_TypeError,
3534 "encoder did not return an str object (type=%.400s)", 3475 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3535 Py_TYPE(v)->tp_name); 3476 "use codecs.encode() to encode to arbitrary types",
3477 encoding,
3478 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3536 Py_DECREF(v); 3479 Py_DECREF(v);
3537 goto onError; 3480 goto onError;
3538 } 3481 }
3539 return v; 3482 return v;
3540 3483
3541 onError: 3484 onError:
3542 return NULL; 3485 return NULL;
3543 } 3486 }
3544 3487
3545 static size_t 3488 static size_t
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
3588 PyObject *reason, *exc; 3531 PyObject *reason, *exc;
3589 3532
3590 if (locale_error_handler(errors, &surrogateescape) < 0) 3533 if (locale_error_handler(errors, &surrogateescape) < 0)
3591 return NULL; 3534 return NULL;
3592 3535
3593 if (str[len] != '\0' || len != strlen(str)) { 3536 if (str[len] != '\0' || len != strlen(str)) {
3594 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3537 PyErr_SetString(PyExc_TypeError, "embedded null character");
3595 return NULL; 3538 return NULL;
3596 } 3539 }
3597 3540
3598 if (surrogateescape) 3541 if (surrogateescape) {
3599 { 3542 /* "surrogateescape" error handler */
3600 wstr = _Py_char2wchar(str, &wlen); 3543 wstr = _Py_char2wchar(str, &wlen);
3601 if (wstr == NULL) { 3544 if (wstr == NULL) {
3602 if (wlen == (size_t)-1) 3545 if (wlen == (size_t)-1)
3603 PyErr_NoMemory(); 3546 PyErr_NoMemory();
3604 else 3547 else
3605 PyErr_SetFromErrno(PyExc_OSError); 3548 PyErr_SetFromErrno(PyExc_OSError);
3606 return NULL; 3549 return NULL;
3607 } 3550 }
3608 3551
3609 unicode = PyUnicode_FromWideChar(wstr, wlen); 3552 unicode = PyUnicode_FromWideChar(wstr, wlen);
3610 PyMem_Free(wstr); 3553 PyMem_RawFree(wstr);
3611 } 3554 }
3612 else { 3555 else {
3556 /* strict mode */
3613 #ifndef HAVE_BROKEN_MBSTOWCS 3557 #ifndef HAVE_BROKEN_MBSTOWCS
3614 wlen = mbstowcs(NULL, str, 0); 3558 wlen = mbstowcs(NULL, str, 0);
3615 #else 3559 #else
3616 wlen = len; 3560 wlen = len;
3617 #endif 3561 #endif
3618 if (wlen == (size_t)-1) 3562 if (wlen == (size_t)-1)
3619 goto decode_error; 3563 goto decode_error;
3620 if (wlen+1 <= smallbuf_len) { 3564 if (wlen+1 <= smallbuf_len) {
3621 wstr = smallbuf; 3565 wstr = smallbuf;
3622 } 3566 }
3623 else { 3567 else {
3624 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3568 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3625 return PyErr_NoMemory(); 3569 return PyErr_NoMemory();
3626 3570
3627 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3571 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3628 if (!wstr) 3572 if (!wstr)
3629 return PyErr_NoMemory(); 3573 return PyErr_NoMemory();
3630 } 3574 }
3631 3575
3632 /* This shouldn't fail now */
3633 wlen2 = mbstowcs(wstr, str, wlen+1); 3576 wlen2 = mbstowcs(wstr, str, wlen+1);
3634 if (wlen2 == (size_t)-1) { 3577 if (wlen2 == (size_t)-1) {
3635 if (wstr != smallbuf) 3578 if (wstr != smallbuf)
3636 PyMem_Free(wstr); 3579 PyMem_Free(wstr);
3637 goto decode_error; 3580 goto decode_error;
3638 } 3581 }
3639 #ifdef HAVE_BROKEN_MBSTOWCS 3582 #ifdef HAVE_BROKEN_MBSTOWCS
3640 assert(wlen2 == wlen); 3583 assert(wlen2 == wlen);
3641 #endif 3584 #endif
3642 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3585 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3643 if (wstr != smallbuf) 3586 if (wstr != smallbuf)
3644 PyMem_Free(wstr); 3587 PyMem_Free(wstr);
3645 } 3588 }
3646 return unicode; 3589 return unicode;
3647 3590
3648 decode_error: 3591 decode_error:
3649 errmsg = strerror(errno); 3592 errmsg = strerror(errno);
3650 assert(errmsg != NULL); 3593 assert(errmsg != NULL);
3651 3594
3652 error_pos = mbstowcs_errorpos(str, len); 3595 error_pos = mbstowcs_errorpos(str, len);
3653 if (errmsg != NULL) { 3596 if (errmsg != NULL) {
3654 size_t errlen; 3597 size_t errlen;
3655 wstr = _Py_char2wchar(errmsg, &errlen); 3598 wstr = _Py_char2wchar(errmsg, &errlen);
3656 if (wstr != NULL) { 3599 if (wstr != NULL) {
3657 reason = PyUnicode_FromWideChar(wstr, errlen); 3600 reason = PyUnicode_FromWideChar(wstr, errlen);
3658 PyMem_Free(wstr); 3601 PyMem_RawFree(wstr);
3659 } else 3602 } else
3660 errmsg = NULL; 3603 errmsg = NULL;
3661 } 3604 }
3662 if (errmsg == NULL) 3605 if (errmsg == NULL)
3663 reason = PyUnicode_FromString( 3606 reason = PyUnicode_FromString(
3664 "mbstowcs() encountered an invalid multibyte sequence"); 3607 "mbstowcs() encountered an invalid multibyte sequence");
3665 if (reason == NULL) 3608 if (reason == NULL)
3666 return NULL; 3609 return NULL;
3667 3610
3668 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3611 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
3716 "surrogateescape"); 3659 "surrogateescape");
3717 } 3660 }
3718 else { 3661 else {
3719 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3662 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3720 } 3663 }
3721 #endif 3664 #endif
3722 } 3665 }
3723 3666
3724 3667
3725 int 3668 int
3726 _PyUnicode_HasNULChars(PyObject* s) 3669 _PyUnicode_HasNULChars(PyObject* str)
3727 { 3670 {
3728 static PyObject *nul = NULL; 3671 Py_ssize_t pos;
3729 3672
3730 if (nul == NULL) 3673 if (PyUnicode_READY(str) == -1)
3731 nul = PyUnicode_FromStringAndSize("\0", 1);
3732 if (nul == NULL)
3733 return -1; 3674 return -1;
3734 return PyUnicode_Contains(s, nul); 3675 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3735 } 3676 PyUnicode_GET_LENGTH(str), '\0', 1);
3736 3677 if (pos == -1)
3678 return 0;
3679 else
3680 return 1;
3681 }
3737 3682
3738 int 3683 int
3739 PyUnicode_FSConverter(PyObject* arg, void* addr) 3684 PyUnicode_FSConverter(PyObject* arg, void* addr)
3740 { 3685 {
3741 PyObject *output = NULL; 3686 PyObject *output = NULL;
3742 Py_ssize_t size; 3687 Py_ssize_t size;
3743 void *data; 3688 void *data;
3744 if (arg == NULL) { 3689 if (arg == NULL) {
3745 Py_DECREF(*(PyObject**)addr); 3690 Py_DECREF(*(PyObject**)addr);
3746 return 1; 3691 return 1;
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
3831 if (PyUnicode_READY(unicode) == -1) 3776 if (PyUnicode_READY(unicode) == -1)
3832 return NULL; 3777 return NULL;
3833 3778
3834 if (PyUnicode_UTF8(unicode) == NULL) { 3779 if (PyUnicode_UTF8(unicode) == NULL) {
3835 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3780 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3836 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3781 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3837 if (bytes == NULL) 3782 if (bytes == NULL)
3838 return NULL; 3783 return NULL;
3839 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3784 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3840 if (_PyUnicode_UTF8(unicode) == NULL) { 3785 if (_PyUnicode_UTF8(unicode) == NULL) {
3786 PyErr_NoMemory();
3841 Py_DECREF(bytes); 3787 Py_DECREF(bytes);
3842 return NULL; 3788 return NULL;
3843 } 3789 }
3844 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3790 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3845 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3791 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3846 PyBytes_AS_STRING(bytes), 3792 PyBytes_AS_STRING(bytes),
3847 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3793 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3848 Py_DECREF(bytes); 3794 Py_DECREF(bytes);
3849 } 3795 }
3850 3796
3851 if (psize) 3797 if (psize)
3852 *psize = PyUnicode_UTF8_LENGTH(unicode); 3798 *psize = PyUnicode_UTF8_LENGTH(unicode);
3853 return PyUnicode_UTF8(unicode); 3799 return PyUnicode_UTF8(unicode);
3854 } 3800 }
3855 3801
3856 char* 3802 char*
3857 PyUnicode_AsUTF8(PyObject *unicode) 3803 PyUnicode_AsUTF8(PyObject *unicode)
3858 { 3804 {
3859 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3805 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3860 } 3806 }
3861
3862 #ifdef Py_DEBUG
3863 static int unicode_as_unicode_calls = 0;
3864 #endif
3865
3866 3807
3867 Py_UNICODE * 3808 Py_UNICODE *
3868 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3809 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3869 { 3810 {
3870 const unsigned char *one_byte; 3811 const unsigned char *one_byte;
3871 #if SIZEOF_WCHAR_T == 4 3812 #if SIZEOF_WCHAR_T == 4
3872 const Py_UCS2 *two_bytes; 3813 const Py_UCS2 *two_bytes;
3873 #else 3814 #else
3874 const Py_UCS4 *four_bytes; 3815 const Py_UCS4 *four_bytes;
3875 const Py_UCS4 *ucs4_end; 3816 const Py_UCS4 *ucs4_end;
3876 Py_ssize_t num_surrogates; 3817 Py_ssize_t num_surrogates;
3877 #endif 3818 #endif
3878 wchar_t *w; 3819 wchar_t *w;
3879 wchar_t *wchar_end; 3820 wchar_t *wchar_end;
3880 3821
3881 if (!PyUnicode_Check(unicode)) { 3822 if (!PyUnicode_Check(unicode)) {
3882 PyErr_BadArgument(); 3823 PyErr_BadArgument();
3883 return NULL; 3824 return NULL;
3884 } 3825 }
3885 if (_PyUnicode_WSTR(unicode) == NULL) { 3826 if (_PyUnicode_WSTR(unicode) == NULL) {
3886 /* Non-ASCII compact unicode object */ 3827 /* Non-ASCII compact unicode object */
3887 assert(_PyUnicode_KIND(unicode) != 0); 3828 assert(_PyUnicode_KIND(unicode) != 0);
3888 assert(PyUnicode_IS_READY(unicode)); 3829 assert(PyUnicode_IS_READY(unicode));
3889
3890 #ifdef Py_DEBUG
3891 ++unicode_as_unicode_calls;
3892 #endif
3893 3830
3894 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3831 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3895 #if SIZEOF_WCHAR_T == 2 3832 #if SIZEOF_WCHAR_T == 2
3896 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3833 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3897 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3834 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3898 num_surrogates = 0; 3835 num_surrogates = 0;
3899 3836
3900 for (; four_bytes < ucs4_end; ++four_bytes) { 3837 for (; four_bytes < ucs4_end; ++four_bytes) {
3901 if (*four_bytes > 0xFFFF) 3838 if (*four_bytes > 0xFFFF)
3902 ++num_surrogates; 3839 ++num_surrogates;
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
3996 } 3933 }
3997 return PyUnicode_GET_SIZE(unicode); 3934 return PyUnicode_GET_SIZE(unicode);
3998 3935
3999 onError: 3936 onError:
4000 return -1; 3937 return -1;
4001 } 3938 }
4002 3939
4003 Py_ssize_t 3940 Py_ssize_t
4004 PyUnicode_GetLength(PyObject *unicode) 3941 PyUnicode_GetLength(PyObject *unicode)
4005 { 3942 {
4006 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3943 if (!PyUnicode_Check(unicode)) {
4007 PyErr_BadArgument(); 3944 PyErr_BadArgument();
4008 return -1; 3945 return -1;
4009 } 3946 }
4010 3947 if (PyUnicode_READY(unicode) == -1)
3948 return -1;
4011 return PyUnicode_GET_LENGTH(unicode); 3949 return PyUnicode_GET_LENGTH(unicode);
4012 } 3950 }
4013 3951
4014 Py_UCS4 3952 Py_UCS4
4015 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3953 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4016 { 3954 {
3955 void *data;
3956 int kind;
3957
4017 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4018 PyErr_BadArgument(); 3959 PyErr_BadArgument();
4019 return (Py_UCS4)-1; 3960 return (Py_UCS4)-1;
4020 } 3961 }
4021 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3962 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4022 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3963 PyErr_SetString(PyExc_IndexError, "string index out of range");
4023 return (Py_UCS4)-1; 3964 return (Py_UCS4)-1;
4024 } 3965 }
4025 return PyUnicode_READ_CHAR(unicode, index); 3966 data = PyUnicode_DATA(unicode);
3967 kind = PyUnicode_KIND(unicode);
3968 return PyUnicode_READ(kind, data, index);
4026 } 3969 }
4027 3970
4028 int 3971 int
4029 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3972 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4030 { 3973 {
4031 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3974 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4032 PyErr_BadArgument(); 3975 PyErr_BadArgument();
4033 return -1; 3976 return -1;
4034 } 3977 }
4035 assert(PyUnicode_IS_READY(unicode)); 3978 assert(PyUnicode_IS_READY(unicode));
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
4074 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4017 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4075 goto onError; 4018 goto onError;
4076 } 4019 }
4077 return; 4020 return;
4078 4021
4079 onError: 4022 onError:
4080 Py_DECREF(*exceptionObject); 4023 Py_DECREF(*exceptionObject);
4081 *exceptionObject = NULL; 4024 *exceptionObject = NULL;
4082 } 4025 }
4083 4026
4027 #ifdef HAVE_MBCS
4084 /* error handling callback helper: 4028 /* error handling callback helper:
4085 build arguments, call the callback and check the arguments, 4029 build arguments, call the callback and check the arguments,
4086 if no exception occurred, copy the replacement to the output 4030 if no exception occurred, copy the replacement to the output
4087 and adjust various state variables. 4031 and adjust various state variables.
4088 return 0 on success, -1 on error 4032 return 0 on success, -1 on error
4089 */ 4033 */
4090 4034
4091 static int 4035 static int
4092 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4036 unicode_decode_call_errorhandler_wchar(
4093 const char *encoding, const char *reason, 4037 const char *errors, PyObject **errorHandler,
4094 const char **input, const char **inend, Py_ssiz e_t *startinpos, 4038 const char *encoding, const char *reason,
4095 Py_ssize_t *endinpos, PyObject **exceptionObjec t, const char **inptr, 4039 const char **input, const char **inend, Py_ssize_t *startinpos,
4096 PyObject **output, Py_ssize_t *outpos) 4040 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4041 PyObject **output, Py_ssize_t *outpos)
4097 { 4042 {
4098 static char *argparse = "O!n;decoding error handler must return (str, int) t uple"; 4043 static char *argparse = "O!n;decoding error handler must return (str, int) t uple";
4099 4044
4100 PyObject *restuple = NULL; 4045 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL; 4046 PyObject *repunicode = NULL;
4102 Py_ssize_t outsize; 4047 Py_ssize_t outsize;
4103 Py_ssize_t insize; 4048 Py_ssize_t insize;
4104 Py_ssize_t requiredsize; 4049 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos; 4050 Py_ssize_t newpos;
4106 PyObject *inputobj = NULL; 4051 PyObject *inputobj = NULL;
4107 int res = -1; 4052 wchar_t *repwstr;
4108 4053 Py_ssize_t repwlen;
4109 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4054
4110 outsize = PyUnicode_GET_LENGTH(*output); 4055 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4111 else 4056 outsize = _PyUnicode_WSTR_LENGTH(*output);
4112 outsize = _PyUnicode_WSTR_LENGTH(*output);
4113 4057
4114 if (*errorHandler == NULL) { 4058 if (*errorHandler == NULL) {
4115 *errorHandler = PyCodec_LookupError(errors); 4059 *errorHandler = PyCodec_LookupError(errors);
4116 if (*errorHandler == NULL) 4060 if (*errorHandler == NULL)
4117 goto onError; 4061 goto onError;
4118 } 4062 }
4119 4063
4120 make_decode_exception(exceptionObject, 4064 make_decode_exception(exceptionObject,
4121 encoding, 4065 encoding,
4122 *input, *inend - *input, 4066 *input, *inend - *input,
4123 *startinpos, *endinpos, 4067 *startinpos, *endinpos,
4124 reason); 4068 reason);
4125 if (*exceptionObject == NULL) 4069 if (*exceptionObject == NULL)
4126 goto onError; 4070 goto onError;
4127 4071
4128 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L); 4072 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L);
4129 if (restuple == NULL) 4073 if (restuple == NULL)
4130 goto onError; 4074 goto onError;
4131 if (!PyTuple_Check(restuple)) { 4075 if (!PyTuple_Check(restuple)) {
4132 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4076 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4133 goto onError; 4077 goto onError;
4134 } 4078 }
4135 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos)) 4079 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos))
4136 goto onError; 4080 goto onError;
4137 if (PyUnicode_READY(repunicode) == -1)
4138 goto onError;
4139 4081
4140 /* Copy back the bytes variables, which might have been modified by the 4082 /* Copy back the bytes variables, which might have been modified by the
4141 callback */ 4083 callback */
4142 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4084 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4143 if (!inputobj) 4085 if (!inputobj)
4144 goto onError; 4086 goto onError;
4145 if (!PyBytes_Check(inputobj)) { 4087 if (!PyBytes_Check(inputobj)) {
4146 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes" ); 4088 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes" );
4147 } 4089 }
4148 *input = PyBytes_AS_STRING(inputobj); 4090 *input = PyBytes_AS_STRING(inputobj);
4149 insize = PyBytes_GET_SIZE(inputobj); 4091 insize = PyBytes_GET_SIZE(inputobj);
4150 *inend = *input + insize; 4092 *inend = *input + insize;
4151 /* we can DECREF safely, as the exception has another reference, 4093 /* we can DECREF safely, as the exception has another reference,
4152 so the object won't go away. */ 4094 so the object won't go away. */
4153 Py_DECREF(inputobj); 4095 Py_DECREF(inputobj);
4154 4096
4155 if (newpos<0) 4097 if (newpos<0)
4156 newpos = insize+newpos; 4098 newpos = insize+newpos;
4157 if (newpos<0 || newpos>insize) { 4099 if (newpos<0 || newpos>insize) {
4158 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b ounds", newpos); 4100 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b ounds", newpos);
4159 goto onError; 4101 goto onError;
4160 } 4102 }
4161 4103
4162 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4104 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4163 /* need more space? (at least enough for what we 4105 if (repwstr == NULL)
4164 have+the replacement+the rest of the string (starting 4106 goto onError;
4165 at the new input position), so we won't have to check space 4107 /* need more space? (at least enough for what we
4166 when there are no errors in the rest of the string) */ 4108 have+the replacement+the rest of the string (starting
4167 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4109 at the new input position), so we won't have to check space
4168 requiredsize = *outpos + replen + insize-newpos; 4110 when there are no errors in the rest of the string) */
4169 if (requiredsize > outsize) { 4111 requiredsize = *outpos + repwlen + insize-newpos;
4170 if (requiredsize<2*outsize) 4112 if (requiredsize > outsize) {
4171 requiredsize = 2*outsize; 4113 if (requiredsize < 2*outsize)
4172 if (unicode_resize(output, requiredsize) < 0) 4114 requiredsize = 2*outsize;
4173 goto onError; 4115 if (unicode_resize(output, requiredsize) < 0)
4174 }
4175 if (unicode_widen(output, *outpos,
4176 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4177 goto onError; 4116 goto onError;
4178 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4117 }
4179 *outpos += replen; 4118 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4180 } 4119 *outpos += repwlen;
4181 else { 4120
4182 wchar_t *repwstr;
4183 Py_ssize_t repwlen;
4184 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4185 if (repwstr == NULL)
4186 goto onError;
4187 /* need more space? (at least enough for what we
4188 have+the replacement+the rest of the string (starting
4189 at the new input position), so we won't have to check space
4190 when there are no errors in the rest of the string) */
4191 requiredsize = *outpos + repwlen + insize-newpos;
4192 if (requiredsize > outsize) {
4193 if (requiredsize < 2*outsize)
4194 requiredsize = 2*outsize;
4195 if (unicode_resize(output, requiredsize) < 0)
4196 goto onError;
4197 }
4198 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4199 *outpos += repwlen;
4200 }
4201 *endinpos = newpos; 4121 *endinpos = newpos;
4202 *inptr = *input + newpos; 4122 *inptr = *input + newpos;
4203 4123
4204 /* we made it! */ 4124 /* we made it! */
4205 res = 0; 4125 Py_XDECREF(restuple);
4126 return 0;
4206 4127
4207 onError: 4128 onError:
4208 Py_XDECREF(restuple); 4129 Py_XDECREF(restuple);
4209 return res; 4130 return -1;
4131 }
4132 #endif /* HAVE_MBCS */
4133
4134 static int
4135 unicode_decode_call_errorhandler_writer(
4136 const char *errors, PyObject **errorHandler,
4137 const char *encoding, const char *reason,
4138 const char **input, const char **inend, Py_ssize_t *startinpos,
4139 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4140 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4141 {
4142 static char *argparse = "O!n;decoding error handler must return (str, int) t uple";
4143
4144 PyObject *restuple = NULL;
4145 PyObject *repunicode = NULL;
4146 Py_ssize_t insize;
4147 Py_ssize_t newpos;
4148 Py_ssize_t replen;
4149 PyObject *inputobj = NULL;
4150
4151 if (*errorHandler == NULL) {
4152 *errorHandler = PyCodec_LookupError(errors);
4153 if (*errorHandler == NULL)
4154 goto onError;
4155 }
4156
4157 make_decode_exception(exceptionObject,
4158 encoding,
4159 *input, *inend - *input,
4160 *startinpos, *endinpos,
4161 reason);
4162 if (*exceptionObject == NULL)
4163 goto onError;
4164
4165 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L);
4166 if (restuple == NULL)
4167 goto onError;
4168 if (!PyTuple_Check(restuple)) {
4169 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4170 goto onError;
4171 }
4172 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos))
4173 goto onError;
4174
4175 /* Copy back the bytes variables, which might have been modified by the
4176 callback */
4177 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4178 if (!inputobj)
4179 goto onError;
4180 if (!PyBytes_Check(inputobj)) {
4181 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes" );
4182 }
4183 *input = PyBytes_AS_STRING(inputobj);
4184 insize = PyBytes_GET_SIZE(inputobj);
4185 *inend = *input + insize;
4186 /* we can DECREF safely, as the exception has another reference,
4187 so the object won't go away. */
4188 Py_DECREF(inputobj);
4189
4190 if (newpos<0)
4191 newpos = insize+newpos;
4192 if (newpos<0 || newpos>insize) {
4193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b ounds", newpos);
4194 goto onError;
4195 }
4196
4197 if (PyUnicode_READY(repunicode) < 0)
4198 goto onError;
4199 replen = PyUnicode_GET_LENGTH(repunicode);
4200 writer->min_length += replen;
4201 if (replen > 1)
4202 writer->overallocate = 1;
4203 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4204 goto onError;
4205
4206 *endinpos = newpos;
4207 *inptr = *input + newpos;
4208
4209 /* we made it! */
4210 Py_XDECREF(restuple);
4211 return 0;
4212
4213 onError:
4214 Py_XDECREF(restuple);
4215 return -1;
4210 } 4216 }
4211 4217
4212 /* --- UTF-7 Codec -------------------------------------------------------- */ 4218 /* --- UTF-7 Codec -------------------------------------------------------- */
4213 4219
4214 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4220 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4215 4221
4216 /* Three simple macros defining base-64. */ 4222 /* Three simple macros defining base-64. */
4217 4223
4218 /* Is c a base-64 character? */ 4224 /* Is c a base-64 character? */
4219 4225
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
4307 4313
4308 PyObject * 4314 PyObject *
4309 PyUnicode_DecodeUTF7Stateful(const char *s, 4315 PyUnicode_DecodeUTF7Stateful(const char *s,
4310 Py_ssize_t size, 4316 Py_ssize_t size,
4311 const char *errors, 4317 const char *errors,
4312 Py_ssize_t *consumed) 4318 Py_ssize_t *consumed)
4313 { 4319 {
4314 const char *starts = s; 4320 const char *starts = s;
4315 Py_ssize_t startinpos; 4321 Py_ssize_t startinpos;
4316 Py_ssize_t endinpos; 4322 Py_ssize_t endinpos;
4317 Py_ssize_t outpos;
4318 const char *e; 4323 const char *e;
4319 PyObject *unicode; 4324 _PyUnicodeWriter writer;
4320 const char *errmsg = ""; 4325 const char *errmsg = "";
4321 int inShift = 0; 4326 int inShift = 0;
4322 Py_ssize_t shiftOutStart; 4327 Py_ssize_t shiftOutStart;
4323 unsigned int base64bits = 0; 4328 unsigned int base64bits = 0;
4324 unsigned long base64buffer = 0; 4329 unsigned long base64buffer = 0;
4325 Py_UCS4 surrogate = 0; 4330 Py_UCS4 surrogate = 0;
4326 PyObject *errorHandler = NULL; 4331 PyObject *errorHandler = NULL;
4327 PyObject *exc = NULL; 4332 PyObject *exc = NULL;
4328 4333
4329 /* Start off assuming it's all ASCII. Widen later as necessary. */
4330 unicode = PyUnicode_New(size, 127);
4331 if (!unicode)
4332 return NULL;
4333 if (size == 0) { 4334 if (size == 0) {
4334 if (consumed) 4335 if (consumed)
4335 *consumed = 0; 4336 *consumed = 0;
4336 return unicode; 4337 _Py_RETURN_UNICODE_EMPTY();
4337 } 4338 }
4338 4339
4339 shiftOutStart = outpos = 0; 4340 /* Start off assuming it's all ASCII. Widen later as necessary. */
4341 _PyUnicodeWriter_Init(&writer);
4342 writer.min_length = size;
4343
4344 shiftOutStart = 0;
4340 e = s + size; 4345 e = s + size;
4341 4346
4342 while (s < e) { 4347 while (s < e) {
4343 Py_UCS4 ch; 4348 Py_UCS4 ch;
4344 restart: 4349 restart:
4345 ch = (unsigned char) *s; 4350 ch = (unsigned char) *s;
4346 4351
4347 if (inShift) { /* in a base-64 section */ 4352 if (inShift) { /* in a base-64 section */
4348 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4349 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4350 base64bits += 6; 4355 base64bits += 6;
4351 s++; 4356 s++;
4352 if (base64bits >= 16) { 4357 if (base64bits >= 16) {
4353 /* we have enough bits for a UTF-16 value */ 4358 /* we have enough bits for a UTF-16 value */
4354 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4355 base64bits -= 16; 4360 base64bits -= 16;
4356 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4362 assert(outCh <= 0xffff);
4357 if (surrogate) { 4363 if (surrogate) {
4358 /* expecting a second surrogate */ 4364 /* expecting a second surrogate */
4359 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4365 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4360 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4366 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4361 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4367 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4362 goto onError; 4368 goto onError;
4363 surrogate = 0; 4369 surrogate = 0;
4364 continue; 4370 continue;
4365 } 4371 }
4366 else { 4372 else {
4367 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4373 if (_PyUnicodeWriter_WriteCharInline(&writer, surrog ate) < 0)
4368 goto onError; 4374 goto onError;
4369 surrogate = 0; 4375 surrogate = 0;
4370 } 4376 }
4371 } 4377 }
4372 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4378 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4373 /* first surrogate */ 4379 /* first surrogate */
4374 surrogate = outCh; 4380 surrogate = outCh;
4375 } 4381 }
4376 else { 4382 else {
4377 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4383 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0 )
4378 goto onError; 4384 goto onError;
4379 } 4385 }
4380 } 4386 }
4381 } 4387 }
4382 else { /* now leaving a base-64 section */ 4388 else { /* now leaving a base-64 section */
4383 inShift = 0; 4389 inShift = 0;
4384 s++; 4390 s++;
4385 if (surrogate) { 4391 if (surrogate) {
4386 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4392 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0 )
4387 goto onError; 4393 goto onError;
4388 surrogate = 0; 4394 surrogate = 0;
4389 } 4395 }
4390 if (base64bits > 0) { /* left-over bits */ 4396 if (base64bits > 0) { /* left-over bits */
4391 if (base64bits >= 6) { 4397 if (base64bits >= 6) {
4392 /* We've seen at least one base-64 character */ 4398 /* We've seen at least one base-64 character */
4393 errmsg = "partial character in shift sequence"; 4399 errmsg = "partial character in shift sequence";
4394 goto utf7Error; 4400 goto utf7Error;
4395 } 4401 }
4396 else { 4402 else {
4397 /* Some bits remain; they should be zero */ 4403 /* Some bits remain; they should be zero */
4398 if (base64buffer != 0) { 4404 if (base64buffer != 0) {
4399 errmsg = "non-zero padding bits in shift sequence"; 4405 errmsg = "non-zero padding bits in shift sequence";
4400 goto utf7Error; 4406 goto utf7Error;
4401 } 4407 }
4402 } 4408 }
4403 } 4409 }
4404 if (ch != '-') { 4410 if (ch != '-') {
4405 /* '-' is absorbed; other terminating 4411 /* '-' is absorbed; other terminating
4406 characters are preserved */ 4412 characters are preserved */
4407 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4413 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4408 goto onError; 4414 goto onError;
4409 } 4415 }
4410 } 4416 }
4411 } 4417 }
4412 else if ( ch == '+' ) { 4418 else if ( ch == '+' ) {
4413 startinpos = s-starts; 4419 startinpos = s-starts;
4414 s++; /* consume '+' */ 4420 s++; /* consume '+' */
4415 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4421 if (s < e && *s == '-') { /* '+-' encodes '+' */
4416 s++; 4422 s++;
4417 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4423 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4418 goto onError; 4424 goto onError;
4419 } 4425 }
4420 else { /* begin base64-encoded section */ 4426 else { /* begin base64-encoded section */
4421 inShift = 1; 4427 inShift = 1;
4422 shiftOutStart = outpos; 4428 shiftOutStart = writer.pos;
4423 base64bits = 0; 4429 base64bits = 0;
4430 base64buffer = 0;
4424 } 4431 }
4425 } 4432 }
4426 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4427 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4434 s++;
4435 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4428 goto onError; 4436 goto onError;
4429 s++;
4430 } 4437 }
4431 else { 4438 else {
4432 startinpos = s-starts; 4439 startinpos = s-starts;
4433 s++; 4440 s++;
4434 errmsg = "unexpected special character"; 4441 errmsg = "unexpected special character";
4435 goto utf7Error; 4442 goto utf7Error;
4436 } 4443 }
4437 continue; 4444 continue;
4438 utf7Error: 4445 utf7Error:
4439 endinpos = s-starts; 4446 endinpos = s-starts;
4440 if (unicode_decode_call_errorhandler( 4447 if (unicode_decode_call_errorhandler_writer(
4441 errors, &errorHandler, 4448 errors, &errorHandler,
4442 "utf7", errmsg, 4449 "utf7", errmsg,
4443 &starts, &e, &startinpos, &endinpos, &exc, &s, 4450 &starts, &e, &startinpos, &endinpos, &exc, &s,
4444 &unicode, &outpos)) 4451 &writer))
4445 goto onError; 4452 goto onError;
4446 } 4453 }
4447 4454
4448 /* end of string */ 4455 /* end of string */
4449 4456
4450 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4451 /* if we're in an inconsistent state, that's an error */ 4458 /* if we're in an inconsistent state, that's an error */
4452 if (surrogate || 4459 if (surrogate ||
4453 (base64bits >= 6) || 4460 (base64bits >= 6) ||
4454 (base64bits > 0 && base64buffer != 0)) { 4461 (base64bits > 0 && base64buffer != 0)) {
4455 endinpos = size; 4462 endinpos = size;
4456 if (unicode_decode_call_errorhandler( 4463 if (unicode_decode_call_errorhandler_writer(
4457 errors, &errorHandler, 4464 errors, &errorHandler,
4458 "utf7", "unterminated shift sequence", 4465 "utf7", "unterminated shift sequence",
4459 &starts, &e, &startinpos, &endinpos, &exc, &s, 4466 &starts, &e, &startinpos, &endinpos, &exc, &s,
4460 &unicode, &outpos)) 4467 &writer))
4461 goto onError; 4468 goto onError;
4462 if (s < e) 4469 if (s < e)
4463 goto restart; 4470 goto restart;
4464 } 4471 }
4465 } 4472 }
4466 4473
4467 /* return state */ 4474 /* return state */
4468 if (consumed) { 4475 if (consumed) {
4469 if (inShift) { 4476 if (inShift) {
4470 outpos = shiftOutStart; /* back off output */ 4477 writer.pos = shiftOutStart; /* back off output */
4471 *consumed = startinpos; 4478 *consumed = startinpos;
4472 } 4479 }
4473 else { 4480 else {
4474 *consumed = s-starts; 4481 *consumed = s-starts;
4475 } 4482 }
4476 } 4483 }
4477 4484
4478 if (unicode_resize(&unicode, outpos) < 0)
4479 goto onError;
4480
4481 Py_XDECREF(errorHandler); 4485 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc); 4486 Py_XDECREF(exc);
4483 return unicode_result(unicode); 4487 return _PyUnicodeWriter_Finish(&writer);
4484 4488
4485 onError: 4489 onError:
4486 Py_XDECREF(errorHandler); 4490 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc); 4491 Py_XDECREF(exc);
4488 Py_DECREF(unicode); 4492 _PyUnicodeWriter_Dealloc(&writer);
4489 return NULL; 4493 return NULL;
4490 } 4494 }
4491 4495
4492 4496
4493 PyObject * 4497 PyObject *
4494 _PyUnicode_EncodeUTF7(PyObject *str, 4498 _PyUnicode_EncodeUTF7(PyObject *str,
4495 int base64SetO, 4499 int base64SetO,
4496 int base64WhiteSpace, 4500 int base64WhiteSpace,
4497 const char *errors) 4501 const char *errors)
4498 { 4502 {
4499 int kind; 4503 int kind;
4500 void *data; 4504 void *data;
4501 Py_ssize_t len; 4505 Py_ssize_t len;
4502 PyObject *v; 4506 PyObject *v;
4503 Py_ssize_t allocated;
4504 int inShift = 0; 4507 int inShift = 0;
4505 Py_ssize_t i; 4508 Py_ssize_t i;
4506 unsigned int base64bits = 0; 4509 unsigned int base64bits = 0;
4507 unsigned long base64buffer = 0; 4510 unsigned long base64buffer = 0;
4508 char * out; 4511 char * out;
4509 char * start; 4512 char * start;
4510 4513
4511 if (PyUnicode_READY(str) == -1) 4514 if (PyUnicode_READY(str) == -1)
4512 return NULL; 4515 return NULL;
4513 kind = PyUnicode_KIND(str); 4516 kind = PyUnicode_KIND(str);
4514 data = PyUnicode_DATA(str); 4517 data = PyUnicode_DATA(str);
4515 len = PyUnicode_GET_LENGTH(str); 4518 len = PyUnicode_GET_LENGTH(str);
4516 4519
4517 if (len == 0) 4520 if (len == 0)
4518 return PyBytes_FromStringAndSize(NULL, 0); 4521 return PyBytes_FromStringAndSize(NULL, 0);
4519 4522
4520 /* It might be possible to tighten this worst case */ 4523 /* It might be possible to tighten this worst case */
4521 allocated = 8 * len; 4524 if (len > PY_SSIZE_T_MAX / 8)
4522 if (allocated / 8 != len)
4523 return PyErr_NoMemory(); 4525 return PyErr_NoMemory();
4524 4526 v = PyBytes_FromStringAndSize(NULL, len * 8);
4525 v = PyBytes_FromStringAndSize(NULL, allocated);
4526 if (v == NULL) 4527 if (v == NULL)
4527 return NULL; 4528 return NULL;
4528 4529
4529 start = out = PyBytes_AS_STRING(v); 4530 start = out = PyBytes_AS_STRING(v);
4530 for (i = 0; i < len; ++i) { 4531 for (i = 0; i < len; ++i) {
4531 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4532 4533
4533 if (inShift) { 4534 if (inShift) {
4534 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4535 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4535 /* shifting out */ 4536 /* shifting out */
(...skipping 28 matching lines...) Expand all
4564 goto encode_char; 4565 goto encode_char;
4565 } 4566 }
4566 } 4567 }
4567 continue; 4568 continue;
4568 encode_char: 4569 encode_char:
4569 if (ch >= 0x10000) { 4570 if (ch >= 0x10000) {
4570 assert(ch <= MAX_UNICODE); 4571 assert(ch <= MAX_UNICODE);
4571 4572
4572 /* code first surrogate */ 4573 /* code first surrogate */
4573 base64bits += 16; 4574 base64bits += 16;
4574 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4575 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4575 while (base64bits >= 6) { 4576 while (base64bits >= 6) {
4576 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4577 base64bits -= 6; 4578 base64bits -= 6;
4578 } 4579 }
4579 /* prepare second surrogate */ 4580 /* prepare second surrogate */
4580 ch = Py_UNICODE_LOW_SURROGATE(ch); 4581 ch = Py_UNICODE_LOW_SURROGATE(ch);
4581 } 4582 }
4582 base64bits += 16; 4583 base64bits += 16;
4583 base64buffer = (base64buffer << 16) | ch; 4584 base64buffer = (base64buffer << 16) | ch;
4584 while (base64bits >= 6) { 4585 while (base64bits >= 6) {
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
4636 #include "stringlib/undef.h" 4637 #include "stringlib/undef.h"
4637 4638
4638 #include "stringlib/ucs2lib.h" 4639 #include "stringlib/ucs2lib.h"
4639 #include "stringlib/codecs.h" 4640 #include "stringlib/codecs.h"
4640 #include "stringlib/undef.h" 4641 #include "stringlib/undef.h"
4641 4642
4642 #include "stringlib/ucs4lib.h" 4643 #include "stringlib/ucs4lib.h"
4643 #include "stringlib/codecs.h" 4644 #include "stringlib/codecs.h"
4644 #include "stringlib/undef.h" 4645 #include "stringlib/undef.h"
4645 4646
4646 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
4647 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4648
4649 /* Mask to quickly check whether a C 'long' contains a 4647 /* Mask to quickly check whether a C 'long' contains a
4650 non-ASCII, UTF8-encoded char. */ 4648 non-ASCII, UTF8-encoded char. */
4651 #if (SIZEOF_LONG == 8) 4649 #if (SIZEOF_LONG == 8)
4652 # define ASCII_CHAR_MASK 0x8080808080808080L 4650 # define ASCII_CHAR_MASK 0x8080808080808080UL
4653 #elif (SIZEOF_LONG == 4) 4651 #elif (SIZEOF_LONG == 4)
4654 # define ASCII_CHAR_MASK 0x80808080L 4652 # define ASCII_CHAR_MASK 0x80808080UL
4655 #else 4653 #else
4656 # error C 'long' size should be either 4 or 8! 4654 # error C 'long' size should be either 4 or 8!
4657 #endif 4655 #endif
4658 4656
4659 static Py_ssize_t 4657 static Py_ssize_t
4660 ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4658 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4661 { 4659 {
4662 const char *p = start; 4660 const char *p = start;
4663 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK); 4661 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4664 4662
4663 /*
4664 * Issue #17237: m68k is a bit different from most architectures in
4665 * that objects do not use "natural alignment" - for example, int and
4666 * long are only aligned at 2-byte boundaries. Therefore the assert()
4667 * won't work; also, tests have shown that skipping the "optimised
4668 * version" will even speed up m68k.
4669 */
4670 #if !defined(__m68k__)
4665 #if SIZEOF_LONG <= SIZEOF_VOID_P 4671 #if SIZEOF_LONG <= SIZEOF_VOID_P
4666 assert(!((size_t) dest & LONG_PTR_MASK)); 4672 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4667 if (!((size_t) p & LONG_PTR_MASK)) { 4673 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4668 /* Fast path, see in STRINGLIB(utf8_decode) for 4674 /* Fast path, see in STRINGLIB(utf8_decode) for
4669 an explanation. */ 4675 an explanation. */
4670 /* Help register allocation */ 4676 /* Help allocation */
4671 register const char *_p = p; 4677 const char *_p = p;
4672 register Py_UCS1 * q = dest; 4678 Py_UCS1 * q = dest;
4673 while (_p < aligned_end) { 4679 while (_p < aligned_end) {
4674 unsigned long value = *(const unsigned long *) _p; 4680 unsigned long value = *(const unsigned long *) _p;
4675 if (value & ASCII_CHAR_MASK) 4681 if (value & ASCII_CHAR_MASK)
4676 break; 4682 break;
4677 *((unsigned long *)q) = value; 4683 *((unsigned long *)q) = value;
4678 _p += SIZEOF_LONG; 4684 _p += SIZEOF_LONG;
4679 q += SIZEOF_LONG; 4685 q += SIZEOF_LONG;
4680 } 4686 }
4681 p = _p; 4687 p = _p;
4682 while (p < end) { 4688 while (p < end) {
4683 if ((unsigned char)*p & 0x80) 4689 if ((unsigned char)*p & 0x80)
4684 break; 4690 break;
4685 *q++ = *p++; 4691 *q++ = *p++;
4686 } 4692 }
4687 return p - start; 4693 return p - start;
4688 } 4694 }
4689 #endif 4695 #endif
4696 #endif
4690 while (p < end) { 4697 while (p < end) {
4691 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4698 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4692 for an explanation. */ 4699 for an explanation. */
4693 if (!((size_t) p & LONG_PTR_MASK)) { 4700 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4694 /* Help register allocation */ 4701 /* Help allocation */
4695 register const char *_p = p; 4702 const char *_p = p;
4696 while (_p < aligned_end) { 4703 while (_p < aligned_end) {
4697 unsigned long value = *(unsigned long *) _p; 4704 unsigned long value = *(unsigned long *) _p;
4698 if (value & ASCII_CHAR_MASK) 4705 if (value & ASCII_CHAR_MASK)
4699 break; 4706 break;
4700 _p += SIZEOF_LONG; 4707 _p += SIZEOF_LONG;
4701 } 4708 }
4702 p = _p; 4709 p = _p;
4703 if (_p == end) 4710 if (_p == end)
4704 break; 4711 break;
4705 } 4712 }
4706 if ((unsigned char)*p & 0x80) 4713 if ((unsigned char)*p & 0x80)
4707 break; 4714 break;
4708 ++p; 4715 ++p;
4709 } 4716 }
4710 memcpy(dest, start, p - start); 4717 memcpy(dest, start, p - start);
4711 return p - start; 4718 return p - start;
4712 } 4719 }
4713 4720
4714 PyObject * 4721 PyObject *
4715 PyUnicode_DecodeUTF8Stateful(const char *s, 4722 PyUnicode_DecodeUTF8Stateful(const char *s,
4716 Py_ssize_t size, 4723 Py_ssize_t size,
4717 const char *errors, 4724 const char *errors,
4718 Py_ssize_t *consumed) 4725 Py_ssize_t *consumed)
4719 { 4726 {
4720 PyObject *unicode; 4727 _PyUnicodeWriter writer;
4721 const char *starts = s; 4728 const char *starts = s;
4722 const char *end = s + size; 4729 const char *end = s + size;
4723 Py_ssize_t outpos;
4724 4730
4725 Py_ssize_t startinpos; 4731 Py_ssize_t startinpos;
4726 Py_ssize_t endinpos; 4732 Py_ssize_t endinpos;
4727 const char *errmsg = ""; 4733 const char *errmsg = "";
4728 PyObject *errorHandler = NULL; 4734 PyObject *errorHandler = NULL;
4729 PyObject *exc = NULL; 4735 PyObject *exc = NULL;
4730 4736
4731 if (size == 0) { 4737 if (size == 0) {
4732 if (consumed) 4738 if (consumed)
4733 *consumed = 0; 4739 *consumed = 0;
4734 Py_INCREF(unicode_empty); 4740 _Py_RETURN_UNICODE_EMPTY();
4735 return unicode_empty;
4736 } 4741 }
4737 4742
4738 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4739 if (size == 1 && (unsigned char)s[0] < 128) { 4744 if (size == 1 && (unsigned char)s[0] < 128) {
4740 if (consumed) 4745 if (consumed)
4741 *consumed = 1; 4746 *consumed = 1;
4742 return get_latin1_char((unsigned char)s[0]); 4747 return get_latin1_char((unsigned char)s[0]);
4743 } 4748 }
4744 4749
4745 unicode = PyUnicode_New(size, 127); 4750 _PyUnicodeWriter_Init(&writer);
4746 if (!unicode) 4751 writer.min_length = size;
4747 return NULL; 4752 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4748 4753 goto onError;
4749 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4754
4750 s += outpos; 4755 writer.pos = ascii_decode(s, end, writer.data);
4756 s += writer.pos;
4751 while (s < end) { 4757 while (s < end) {
4752 Py_UCS4 ch; 4758 Py_UCS4 ch;
4753 int kind = PyUnicode_KIND(unicode); 4759 int kind = writer.kind;
4754 if (kind == PyUnicode_1BYTE_KIND) { 4760 if (kind == PyUnicode_1BYTE_KIND) {
4755 if (PyUnicode_IS_ASCII(unicode)) 4761 if (PyUnicode_IS_ASCII(writer.buffer))
4756 ch = asciilib_utf8_decode(&s, end, 4762 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4757 PyUnicode_1BYTE_DATA(unicode), &outpos);
4758 else 4763 else
4759 ch = ucs1lib_utf8_decode(&s, end, 4764 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4760 PyUnicode_1BYTE_DATA(unicode), &outpos);
4761 } else if (kind == PyUnicode_2BYTE_KIND) { 4765 } else if (kind == PyUnicode_2BYTE_KIND) {
4762 ch = ucs2lib_utf8_decode(&s, end, 4766 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4763 PyUnicode_2BYTE_DATA(unicode), &outpos);
4764 } else { 4767 } else {
4765 assert(kind == PyUnicode_4BYTE_KIND); 4768 assert(kind == PyUnicode_4BYTE_KIND);
4766 ch = ucs4lib_utf8_decode(&s, end, 4769 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4767 PyUnicode_4BYTE_DATA(unicode), &outpos);
4768 } 4770 }
4769 4771
4770 switch (ch) { 4772 switch (ch) {
4771 case 0: 4773 case 0:
4772 if (s == end || consumed) 4774 if (s == end || consumed)
4773 goto End; 4775 goto End;
4774 errmsg = "unexpected end of data"; 4776 errmsg = "unexpected end of data";
4775 startinpos = s - starts; 4777 startinpos = s - starts;
4776 endinpos = startinpos + 1; 4778 endinpos = end - starts;
4777 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4778 endinpos++;
4779 break; 4779 break;
4780 case 1: 4780 case 1:
4781 errmsg = "invalid start byte"; 4781 errmsg = "invalid start byte";
4782 startinpos = s - starts; 4782 startinpos = s - starts;
4783 endinpos = startinpos + 1; 4783 endinpos = startinpos + 1;
4784 break; 4784 break;
4785 case 2: 4785 case 2:
4786 case 3:
4787 case 4:
4786 errmsg = "invalid continuation byte"; 4788 errmsg = "invalid continuation byte";
4787 startinpos = s - starts; 4789 startinpos = s - starts;
4788 endinpos = startinpos + 1; 4790 endinpos = startinpos + ch - 1;
4789 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4790 endinpos++;
4791 break; 4791 break;
4792 default: 4792 default:
4793 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4793 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4794 goto onError; 4794 goto onError;
4795 continue; 4795 continue;
4796 } 4796 }
4797 4797
4798 if (unicode_decode_call_errorhandler( 4798 if (unicode_decode_call_errorhandler_writer(
4799 errors, &errorHandler, 4799 errors, &errorHandler,
4800 "utf-8", errmsg, 4800 "utf-8", errmsg,
4801 &starts, &end, &startinpos, &endinpos, &exc, &s, 4801 &starts, &end, &startinpos, &endinpos, &exc, &s,
4802 &unicode, &outpos)) 4802 &writer))
4803 goto onError; 4803 goto onError;
4804 } 4804 }
4805 4805
4806 End: 4806 End:
4807 if (unicode_resize(&unicode, outpos) < 0)
4808 goto onError;
4809
4810 if (consumed) 4807 if (consumed)
4811 *consumed = s - starts; 4808 *consumed = s - starts;
4812 4809
4813 Py_XDECREF(errorHandler); 4810 Py_XDECREF(errorHandler);
4814 Py_XDECREF(exc); 4811 Py_XDECREF(exc);
4815 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4812 return _PyUnicodeWriter_Finish(&writer);
4816 return unicode;
4817 4813
4818 onError: 4814 onError:
4819 Py_XDECREF(errorHandler); 4815 Py_XDECREF(errorHandler);
4820 Py_XDECREF(exc); 4816 Py_XDECREF(exc);
4821 Py_XDECREF(unicode); 4817 _PyUnicodeWriter_Dealloc(&writer);
4822 return NULL; 4818 return NULL;
4823 } 4819 }
4824 4820
4825 #ifdef __APPLE__ 4821 #ifdef __APPLE__
4826 4822
4827 /* Simplified UTF-8 decoder using surrogateescape error handler, 4823 /* Simplified UTF-8 decoder using surrogateescape error handler,
4828 used to decode the command line arguments on Mac OS X. */ 4824 used to decode the command line arguments on Mac OS X.
4825
4826 Return a pointer to a newly allocated wide character string (use
4827 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4829 4828
4830 wchar_t* 4829 wchar_t*
4831 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4830 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4832 { 4831 {
4833 const char *e; 4832 const char *e;
4834 wchar_t *unicode; 4833 wchar_t *unicode;
4835 Py_ssize_t outpos; 4834 Py_ssize_t outpos;
4836 4835
4837 /* Note: size will always be longer than the resulting Unicode 4836 /* Note: size will always be longer than the resulting Unicode
4838 character count */ 4837 character count */
4839 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4838 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4840 PyErr_NoMemory(); 4839 return NULL;
4841 return NULL; 4840 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4842 }
4843 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4844 if (!unicode) 4841 if (!unicode)
4845 return NULL; 4842 return NULL;
4846 4843
4847 /* Unpack UTF-8 encoded data */ 4844 /* Unpack UTF-8 encoded data */
4848 e = s + size; 4845 e = s + size;
4849 outpos = 0; 4846 outpos = 0;
4850 while (s < e) { 4847 while (s < e) {
4851 Py_UCS4 ch; 4848 Py_UCS4 ch;
4852 #if SIZEOF_WCHAR_T == 4 4849 #if SIZEOF_WCHAR_T == 4
4853 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4850 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
4956 PyObject * 4953 PyObject *
4957 PyUnicode_DecodeUTF32Stateful(const char *s, 4954 PyUnicode_DecodeUTF32Stateful(const char *s,
4958 Py_ssize_t size, 4955 Py_ssize_t size,
4959 const char *errors, 4956 const char *errors,
4960 int *byteorder, 4957 int *byteorder,
4961 Py_ssize_t *consumed) 4958 Py_ssize_t *consumed)
4962 { 4959 {
4963 const char *starts = s; 4960 const char *starts = s;
4964 Py_ssize_t startinpos; 4961 Py_ssize_t startinpos;
4965 Py_ssize_t endinpos; 4962 Py_ssize_t endinpos;
4966 Py_ssize_t outpos; 4963 _PyUnicodeWriter writer;
4967 PyObject *unicode;
4968 const unsigned char *q, *e; 4964 const unsigned char *q, *e;
4969 int bo = 0; /* assume native ordering by default */ 4965 int le, bo = 0; /* assume native ordering by default */
4966 const char *encoding;
4970 const char *errmsg = ""; 4967 const char *errmsg = "";
4971 /* Offsets from q for retrieving bytes in the right order. */
4972 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
4973 int iorder[] = {0, 1, 2, 3};
4974 #else
4975 int iorder[] = {3, 2, 1, 0};
4976 #endif
4977 PyObject *errorHandler = NULL; 4968 PyObject *errorHandler = NULL;
4978 PyObject *exc = NULL; 4969 PyObject *exc = NULL;
4979 4970
4980 q = (unsigned char *)s; 4971 q = (unsigned char *)s;
4981 e = q + size; 4972 e = q + size;
4982 4973
4983 if (byteorder) 4974 if (byteorder)
4984 bo = *byteorder; 4975 bo = *byteorder;
4985 4976
4986 /* Check for BOM marks (U+FEFF) in the input and adjust current 4977 /* Check for BOM marks (U+FEFF) in the input and adjust current
4987 byte order setting accordingly. In native mode, the leading BOM 4978 byte order setting accordingly. In native mode, the leading BOM
4988 mark is skipped, in all other modes, it is copied to the output 4979 mark is skipped, in all other modes, it is copied to the output
4989 stream as-is (giving a ZWNBSP character). */ 4980 stream as-is (giving a ZWNBSP character). */
4990 if (bo == 0) { 4981 if (bo == 0 && size >= 4) {
4991 if (size >= 4) { 4982 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4992 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4983 if (bom == 0x0000FEFF) {
4993 (q[iorder[1]] << 8) | q[iorder[0]]; 4984 bo = -1;
4994 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 4985 q += 4;
4995 if (bom == 0x0000FEFF) { 4986 }
4987 else if (bom == 0xFFFE0000) {
4988 bo = 1;
4989 q += 4;
4990 }
4991 if (byteorder)
4992 *byteorder = bo;
4993 }
4994
4995 if (q == e) {
4996 if (consumed)
4997 *consumed = size;
4998 _Py_RETURN_UNICODE_EMPTY();
4999 }
5000
5001 #ifdef WORDS_BIGENDIAN
5002 le = bo < 0;
5003 #else
5004 le = bo <= 0;
5005 #endif
5006 encoding = le ? "utf-32-le" : "utf-32-be";
5007
5008 _PyUnicodeWriter_Init(&writer);
5009 writer.min_length = (e - q + 3) / 4;
5010 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5011 goto onError;
5012
5013 while (1) {
5014 Py_UCS4 ch = 0;
5015 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5016
5017 if (e - q >= 4) {
5018 enum PyUnicode_Kind kind = writer.kind;
5019 void *data = writer.data;
5020 const unsigned char *last = e - 4;
5021 Py_ssize_t pos = writer.pos;
5022 if (le) {
5023 do {
5024 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5025 if (ch > maxch)
5026 break;
5027 if (kind != PyUnicode_1BYTE_KIND &&
5028 Py_UNICODE_IS_SURROGATE(ch))
5029 break;
5030 PyUnicode_WRITE(kind, data, pos++, ch);
5031 q += 4;
5032 } while (q <= last);
5033 }
5034 else {
5035 do {
5036 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5037 if (ch > maxch)
5038 break;
5039 if (kind != PyUnicode_1BYTE_KIND &&
5040 Py_UNICODE_IS_SURROGATE(ch))
5041 break;
5042 PyUnicode_WRITE(kind, data, pos++, ch);
5043