Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(16688)

Delta Between Two Patch Sets: Objects/unicodeobject.c

Issue 20538: Segfault in UTF-7 incremental decoder
Left Patch Set: Created 5 years, 7 months ago
Right Patch Set: Created 5 years, 7 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Lib/test/test_codecs.py ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>. 4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 5
6 Major speed upgrades to the method implementations at the Reykjavik 6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 8
9 Copyright (c) Corporation for National Research Initiatives. 9 Copyright (c) Corporation for National Research Initiatives.
10 10
(...skipping 29 matching lines...) Expand all
40 40
41 #define PY_SSIZE_T_CLEAN 41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h" 42 #include "Python.h"
43 #include "ucnhash.h" 43 #include "ucnhash.h"
44 #include "bytes_methods.h" 44 #include "bytes_methods.h"
45 45
46 #ifdef MS_WINDOWS 46 #ifdef MS_WINDOWS
47 #include <windows.h> 47 #include <windows.h>
48 #endif 48 #endif
49 49
50 /* Endianness switches; defaults to little endian */ 50 /*[clinic input]
51 51 class str "PyUnicodeObject *" "&PyUnicode_Type"
52 #ifdef WORDS_BIGENDIAN 52 [clinic start generated code]*/
53 # define BYTEORDER_IS_BIG_ENDIAN 53 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
54 #else
55 # define BYTEORDER_IS_LITTLE_ENDIAN
56 #endif
57 54
58 /* --- Globals ------------------------------------------------------------ 55 /* --- Globals ------------------------------------------------------------
59 56
60 NOTE: In the interpreter's initialization phase, some globals are currently 57 NOTE: In the interpreter's initialization phase, some globals are currently
61 initialized dynamically as needed. In the process Unicode objects may 58 initialized dynamically as needed. In the process Unicode objects may
62 be created before the Unicode type is ready. 59 be created before the Unicode type is ready.
63 60
64 */ 61 */
65 62
66 63
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 (assert(_PyUnicode_CHECK(op)), \ 120 (assert(_PyUnicode_CHECK(op)), \
124 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
125 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
126 #define _PyUnicode_SHARE_WSTR(op) \ 123 #define _PyUnicode_SHARE_WSTR(op) \
127 (assert(_PyUnicode_CHECK(op)), \ 124 (assert(_PyUnicode_CHECK(op)), \
128 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
129 126
130 /* true if the Unicode object has an allocated UTF-8 memory block 127 /* true if the Unicode object has an allocated UTF-8 memory block
131 (not shared with other data) */ 128 (not shared with other data) */
132 #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 129 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
133 (assert(_PyUnicode_CHECK(op)), \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \
134 (!PyUnicode_IS_COMPACT_ASCII(op) \
135 && _PyUnicode_UTF8(op) \ 131 && _PyUnicode_UTF8(op) \
136 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
137 133
138 /* true if the Unicode object has an allocated wstr memory block 134 /* true if the Unicode object has an allocated wstr memory block
139 (not shared with other data) */ 135 (not shared with other data) */
140 #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 136 #define _PyUnicode_HAS_WSTR_MEMORY(op) \
141 (assert(_PyUnicode_CHECK(op)), \ 137 ((_PyUnicode_WSTR(op) && \
142 (_PyUnicode_WSTR(op) && \
143 (!PyUnicode_IS_READY(op) || \ 138 (!PyUnicode_IS_READY(op) || \
144 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
145 140
146 /* Generic helper macro to convert characters of different types. 141 /* Generic helper macro to convert characters of different types.
147 from_type and to_type have to be valid type names, begin and end 142 from_type and to_type have to be valid type names, begin and end
148 are pointers to the source characters which should be of type 143 are pointers to the source characters which should be of type
149 "from_type *". to is a pointer of type "to_type *" and points to the 144 "from_type *". to is a pointer of type "to_type *" and points to the
150 buffer where the result characters are written to. */ 145 buffer where the result characters are written to. */
151 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 146 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
152 do { \ 147 do { \
153 to_type *_to = (to_type *) to; \ 148 to_type *_to = (to_type *)(to); \
154 const from_type *_iter = (begin); \ 149 const from_type *_iter = (from_type *)(begin); \
155 const from_type *_end = (end); \ 150 const from_type *_end = (from_type *)(end); \
156 Py_ssize_t n = (_end) - (_iter); \ 151 Py_ssize_t n = (_end) - (_iter); \
157 const from_type *_unrolled_end = \ 152 const from_type *_unrolled_end = \
158 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 153 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
159 while (_iter < (_unrolled_end)) { \ 154 while (_iter < (_unrolled_end)) { \
160 _to[0] = (to_type) _iter[0]; \ 155 _to[0] = (to_type) _iter[0]; \
161 _to[1] = (to_type) _iter[1]; \ 156 _to[1] = (to_type) _iter[1]; \
162 _to[2] = (to_type) _iter[2]; \ 157 _to[2] = (to_type) _iter[2]; \
163 _to[3] = (to_type) _iter[3]; \ 158 _to[3] = (to_type) _iter[3]; \
164 _iter += 4; _to += 4; \ 159 _iter += 4; _to += 4; \
165 } \ 160 } \
(...skipping 25 matching lines...) Expand all
191 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
192 } \ 187 } \
193 } \ 188 } \
194 } while (0) 189 } while (0)
195 190
196 #define _Py_RETURN_UNICODE_EMPTY() \ 191 #define _Py_RETURN_UNICODE_EMPTY() \
197 do { \ 192 do { \
198 _Py_INCREF_UNICODE_EMPTY(); \ 193 _Py_INCREF_UNICODE_EMPTY(); \
199 return unicode_empty; \ 194 return unicode_empty; \
200 } while (0) 195 } while (0)
196
197 /* Forward declaration */
198 Py_LOCAL_INLINE(int)
199 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
201 200
202 /* List of static strings. */ 201 /* List of static strings. */
203 static _Py_Identifier *static_strings = NULL; 202 static _Py_Identifier *static_strings = NULL;
204 203
205 /* Single character Unicode strings in the Latin-1 range are being 204 /* Single character Unicode strings in the Latin-1 range are being
206 shared as well. */ 205 shared as well. */
207 static PyObject *unicode_latin1[256] = {NULL}; 206 static PyObject *unicode_latin1[256] = {NULL};
208 207
209 /* Fast detection of the most frequent whitespace characters */ 208 /* Fast detection of the most frequent whitespace characters */
210 const unsigned char _Py_ascii_whitespace[] = { 209 const unsigned char _Py_ascii_whitespace[] = {
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after
420 return 1; 419 return 1;
421 } 420 }
422 #endif 421 #endif
423 422
424 static PyObject* 423 static PyObject*
425 unicode_result_wchar(PyObject *unicode) 424 unicode_result_wchar(PyObject *unicode)
426 { 425 {
427 #ifndef Py_DEBUG 426 #ifndef Py_DEBUG
428 Py_ssize_t len; 427 Py_ssize_t len;
429 428
430 assert(Py_REFCNT(unicode) == 1);
431
432 len = _PyUnicode_WSTR_LENGTH(unicode); 429 len = _PyUnicode_WSTR_LENGTH(unicode);
433 if (len == 0) { 430 if (len == 0) {
434 Py_DECREF(unicode); 431 Py_DECREF(unicode);
435 _Py_RETURN_UNICODE_EMPTY(); 432 _Py_RETURN_UNICODE_EMPTY();
436 } 433 }
437 434
438 if (len == 1) { 435 if (len == 1) {
439 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
440 if ((Py_UCS4)ch < 256) { 437 if ((Py_UCS4)ch < 256) {
441 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
442 Py_DECREF(unicode); 439 Py_DECREF(unicode);
443 return latin1_char; 440 return latin1_char;
444 } 441 }
445 } 442 }
446 443
447 if (_PyUnicode_Ready(unicode) < 0) { 444 if (_PyUnicode_Ready(unicode) < 0) {
448 Py_XDECREF(unicode); 445 Py_DECREF(unicode);
449 return NULL; 446 return NULL;
450 } 447 }
451 #else 448 #else
449 assert(Py_REFCNT(unicode) == 1);
450
452 /* don't make the result ready in debug mode to ensure that the caller 451 /* don't make the result ready in debug mode to ensure that the caller
453 makes the string ready before using it */ 452 makes the string ready before using it */
454 assert(_PyUnicode_CheckConsistency(unicode, 1)); 453 assert(_PyUnicode_CheckConsistency(unicode, 1));
455 #endif 454 #endif
456 return unicode; 455 return unicode;
457 } 456 }
458 457
459 static PyObject* 458 static PyObject*
460 unicode_result_ready(PyObject *unicode) 459 unicode_result_ready(PyObject *unicode)
461 { 460 {
462 Py_ssize_t length; 461 Py_ssize_t length;
463 462
464 length = PyUnicode_GET_LENGTH(unicode); 463 length = PyUnicode_GET_LENGTH(unicode);
465 if (length == 0) { 464 if (length == 0) {
466 if (unicode != unicode_empty) { 465 if (unicode != unicode_empty) {
467 Py_DECREF(unicode); 466 Py_DECREF(unicode);
468 _Py_RETURN_UNICODE_EMPTY(); 467 _Py_RETURN_UNICODE_EMPTY();
469 } 468 }
470 return unicode_empty; 469 return unicode_empty;
471 } 470 }
472 471
473 if (length == 1) { 472 if (length == 1) {
474 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
475 if (ch < 256) { 476 if (ch < 256) {
476 PyObject *latin1_char = unicode_latin1[ch]; 477 PyObject *latin1_char = unicode_latin1[ch];
477 if (latin1_char != NULL) { 478 if (latin1_char != NULL) {
478 if (unicode != latin1_char) { 479 if (unicode != latin1_char) {
479 Py_INCREF(latin1_char); 480 Py_INCREF(latin1_char);
480 Py_DECREF(unicode); 481 Py_DECREF(unicode);
481 } 482 }
482 return latin1_char; 483 return latin1_char;
483 } 484 }
484 else { 485 else {
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
537 #elif LONG_BIT >= 32 538 #elif LONG_BIT >= 32
538 #define BLOOM_WIDTH 32 539 #define BLOOM_WIDTH 32
539 #else 540 #else
540 #error "LONG_BIT is smaller than 32" 541 #error "LONG_BIT is smaller than 32"
541 #endif 542 #endif
542 543
543 #define BLOOM_MASK unsigned long 544 #define BLOOM_MASK unsigned long
544 545
545 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 546 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
546 547
547 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
548 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 548 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
549 549
550 #define BLOOM_LINEBREAK(ch) \ 550 #define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
553 553
554 Py_LOCAL_INLINE(BLOOM_MASK) 554 Py_LOCAL_INLINE(BLOOM_MASK)
555 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 555 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
556 { 556 {
557 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
557 /* calculate simple bloom-style bitmask for a given unicode string */ 569 /* calculate simple bloom-style bitmask for a given unicode string */
558 570
559 BLOOM_MASK mask; 571 BLOOM_MASK mask;
560 Py_ssize_t i;
561 572
562 mask = 0; 573 mask = 0;
563 for (i = 0; i < len; i++) 574 switch (kind) {
564 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 575 case PyUnicode_1BYTE_KIND:
565 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
566 return mask; 587 return mask;
567 } 588
568 589 #undef BLOOM_UPDATE
569 #define BLOOM_MEMBER(mask, chr, str) \ 590 }
570 (BLOOM(mask, chr) \
571 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
572 591
573 /* Compilation of templated routines */ 592 /* Compilation of templated routines */
574 593
575 #include "stringlib/asciilib.h" 594 #include "stringlib/asciilib.h"
576 #include "stringlib/fastsearch.h" 595 #include "stringlib/fastsearch.h"
577 #include "stringlib/partition.h" 596 #include "stringlib/partition.h"
578 #include "stringlib/split.h" 597 #include "stringlib/split.h"
579 #include "stringlib/count.h" 598 #include "stringlib/count.h"
580 #include "stringlib/find.h" 599 #include "stringlib/find.h"
581 #include "stringlib/find_max_char.h" 600 #include "stringlib/find_max_char.h"
582 #include "stringlib/localeutil.h" 601 #include "stringlib/localeutil.h"
583 #include "stringlib/undef.h" 602 #include "stringlib/undef.h"
584 603
585 #include "stringlib/ucs1lib.h" 604 #include "stringlib/ucs1lib.h"
586 #include "stringlib/fastsearch.h" 605 #include "stringlib/fastsearch.h"
587 #include "stringlib/partition.h" 606 #include "stringlib/partition.h"
588 #include "stringlib/split.h" 607 #include "stringlib/split.h"
589 #include "stringlib/count.h" 608 #include "stringlib/count.h"
590 #include "stringlib/find.h" 609 #include "stringlib/find.h"
610 #include "stringlib/replace.h"
591 #include "stringlib/find_max_char.h" 611 #include "stringlib/find_max_char.h"
592 #include "stringlib/localeutil.h" 612 #include "stringlib/localeutil.h"
593 #include "stringlib/undef.h" 613 #include "stringlib/undef.h"
594 614
595 #include "stringlib/ucs2lib.h" 615 #include "stringlib/ucs2lib.h"
596 #include "stringlib/fastsearch.h" 616 #include "stringlib/fastsearch.h"
597 #include "stringlib/partition.h" 617 #include "stringlib/partition.h"
598 #include "stringlib/split.h" 618 #include "stringlib/split.h"
599 #include "stringlib/count.h" 619 #include "stringlib/count.h"
600 #include "stringlib/find.h" 620 #include "stringlib/find.h"
621 #include "stringlib/replace.h"
601 #include "stringlib/find_max_char.h" 622 #include "stringlib/find_max_char.h"
602 #include "stringlib/localeutil.h" 623 #include "stringlib/localeutil.h"
603 #include "stringlib/undef.h" 624 #include "stringlib/undef.h"
604 625
605 #include "stringlib/ucs4lib.h" 626 #include "stringlib/ucs4lib.h"
606 #include "stringlib/fastsearch.h" 627 #include "stringlib/fastsearch.h"
607 #include "stringlib/partition.h" 628 #include "stringlib/partition.h"
608 #include "stringlib/split.h" 629 #include "stringlib/split.h"
609 #include "stringlib/count.h" 630 #include "stringlib/count.h"
610 #include "stringlib/find.h" 631 #include "stringlib/find.h"
632 #include "stringlib/replace.h"
611 #include "stringlib/find_max_char.h" 633 #include "stringlib/find_max_char.h"
612 #include "stringlib/localeutil.h" 634 #include "stringlib/localeutil.h"
613 #include "stringlib/undef.h" 635 #include "stringlib/undef.h"
614 636
615 #include "stringlib/unicodedefs.h" 637 #include "stringlib/unicodedefs.h"
616 #include "stringlib/fastsearch.h" 638 #include "stringlib/fastsearch.h"
617 #include "stringlib/count.h" 639 #include "stringlib/count.h"
618 #include "stringlib/find.h" 640 #include "stringlib/find.h"
619 #include "stringlib/undef.h" 641 #include "stringlib/undef.h"
620 642
(...skipping 26 matching lines...) Expand all
647 return -1; 669 return -1;
648 } 670 }
649 case PyUnicode_4BYTE_KIND: 671 case PyUnicode_4BYTE_KIND:
650 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
651 default: 673 default:
652 assert(0); 674 assert(0);
653 return -1; 675 return -1;
654 } 676 }
655 } 677 }
656 678
679 #ifdef Py_DEBUG
680 /* Fill the data of a Unicode string with invalid characters to detect bugs
681 earlier.
682
683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
685 invalid character in Unicode 6.0. */
686 static void
687 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
688 {
689 int kind = PyUnicode_KIND(unicode);
690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
691 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
692 if (length <= old_length)
693 return;
694 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
695 }
696 #endif
697
657 static PyObject* 698 static PyObject*
658 resize_compact(PyObject *unicode, Py_ssize_t length) 699 resize_compact(PyObject *unicode, Py_ssize_t length)
659 { 700 {
660 Py_ssize_t char_size; 701 Py_ssize_t char_size;
661 Py_ssize_t struct_size; 702 Py_ssize_t struct_size;
662 Py_ssize_t new_size; 703 Py_ssize_t new_size;
663 int share_wstr; 704 int share_wstr;
664 PyObject *new_unicode; 705 PyObject *new_unicode;
706 #ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708 #endif
709
665 assert(unicode_modifiable(unicode)); 710 assert(unicode_modifiable(unicode));
666 assert(PyUnicode_IS_READY(unicode)); 711 assert(PyUnicode_IS_READY(unicode));
667 assert(PyUnicode_IS_COMPACT(unicode)); 712 assert(PyUnicode_IS_COMPACT(unicode));
668 713
669 char_size = PyUnicode_KIND(unicode); 714 char_size = PyUnicode_KIND(unicode);
670 if (PyUnicode_IS_ASCII(unicode)) 715 if (PyUnicode_IS_ASCII(unicode))
671 struct_size = sizeof(PyASCIIObject); 716 struct_size = sizeof(PyASCIIObject);
672 else 717 else
673 struct_size = sizeof(PyCompactUnicodeObject); 718 struct_size = sizeof(PyCompactUnicodeObject);
674 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
(...skipping 19 matching lines...) Expand all
694 _PyUnicode_LENGTH(unicode) = length; 739 _PyUnicode_LENGTH(unicode) = length;
695 if (share_wstr) { 740 if (share_wstr) {
696 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
697 if (!PyUnicode_IS_ASCII(unicode)) 742 if (!PyUnicode_IS_ASCII(unicode))
698 _PyUnicode_WSTR_LENGTH(unicode) = length; 743 _PyUnicode_WSTR_LENGTH(unicode) = length;
699 } 744 }
700 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
701 PyObject_DEL(_PyUnicode_WSTR(unicode)); 746 PyObject_DEL(_PyUnicode_WSTR(unicode));
702 _PyUnicode_WSTR(unicode) = NULL; 747 _PyUnicode_WSTR(unicode) = NULL;
703 } 748 }
749 #ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751 #endif
704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0); 753 length, 0);
706 assert(_PyUnicode_CheckConsistency(unicode, 0)); 754 assert(_PyUnicode_CheckConsistency(unicode, 0));
707 return unicode; 755 return unicode;
708 } 756 }
709 757
710 static int 758 static int
711 resize_inplace(PyObject *unicode, Py_ssize_t length) 759 resize_inplace(PyObject *unicode, Py_ssize_t length)
712 { 760 {
713 wchar_t *wstr; 761 wchar_t *wstr;
714 Py_ssize_t new_size; 762 Py_ssize_t new_size;
715 assert(!PyUnicode_IS_COMPACT(unicode)); 763 assert(!PyUnicode_IS_COMPACT(unicode));
716 assert(Py_REFCNT(unicode) == 1); 764 assert(Py_REFCNT(unicode) == 1);
717 765
718 if (PyUnicode_IS_READY(unicode)) { 766 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size; 767 Py_ssize_t char_size;
720 int share_wstr, share_utf8; 768 int share_wstr, share_utf8;
721 void *data; 769 void *data;
770 #ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772 #endif
722 773
723 data = _PyUnicode_DATA_ANY(unicode); 774 data = _PyUnicode_DATA_ANY(unicode);
724 char_size = PyUnicode_KIND(unicode); 775 char_size = PyUnicode_KIND(unicode);
725 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
726 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
727 778
728 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
729 PyErr_NoMemory(); 780 PyErr_NoMemory();
730 return -1; 781 return -1;
731 } 782 }
(...skipping 15 matching lines...) Expand all
747 if (share_wstr) { 798 if (share_wstr) {
748 _PyUnicode_WSTR(unicode) = data; 799 _PyUnicode_WSTR(unicode) = data;
749 _PyUnicode_WSTR_LENGTH(unicode) = length; 800 _PyUnicode_WSTR_LENGTH(unicode) = length;
750 } 801 }
751 if (share_utf8) { 802 if (share_utf8) {
752 _PyUnicode_UTF8(unicode) = data; 803 _PyUnicode_UTF8(unicode) = data;
753 _PyUnicode_UTF8_LENGTH(unicode) = length; 804 _PyUnicode_UTF8_LENGTH(unicode) = length;
754 } 805 }
755 _PyUnicode_LENGTH(unicode) = length; 806 _PyUnicode_LENGTH(unicode) = length;
756 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
808 #ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810 #endif
757 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
758 assert(_PyUnicode_CheckConsistency(unicode, 0)); 812 assert(_PyUnicode_CheckConsistency(unicode, 0));
759 return 0; 813 return 0;
760 } 814 }
761 } 815 }
762 assert(_PyUnicode_WSTR(unicode) != NULL); 816 assert(_PyUnicode_WSTR(unicode) != NULL);
763 817
764 /* check for integer overflow */ 818 /* check for integer overflow */
765 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
766 PyErr_NoMemory(); 820 PyErr_NoMemory();
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
799 return copy; 853 return copy;
800 } 854 }
801 else { 855 else {
802 PyObject *w; 856 PyObject *w;
803 857
804 w = (PyObject*)_PyUnicode_New(length); 858 w = (PyObject*)_PyUnicode_New(length);
805 if (w == NULL) 859 if (w == NULL)
806 return NULL; 860 return NULL;
807 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
808 copy_length = Py_MIN(copy_length, length); 862 copy_length = Py_MIN(copy_length, length);
809 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
810 copy_length); 864 copy_length * sizeof(wchar_t));
811 return w; 865 return w;
812 } 866 }
813 } 867 }
814 868
815 /* We allocate one more byte to make sure the string is 869 /* We allocate one more byte to make sure the string is
816 Ux0000 terminated; some code (e.g. new_identifier) 870 Ux0000 terminated; some code (e.g. new_identifier)
817 relies on that. 871 relies on that.
818 872
819 XXX This allocator could further be enhanced by assuring that the 873 XXX This allocator could further be enhanced by assuring that the
820 free list never reduces its size below 1. 874 free list never reduces its size below 1.
821 875
822 */ 876 */
823 877
824 static PyUnicodeObject * 878 static PyUnicodeObject *
825 _PyUnicode_New(Py_ssize_t length) 879 _PyUnicode_New(Py_ssize_t length)
826 { 880 {
827 register PyUnicodeObject *unicode; 881 PyUnicodeObject *unicode;
828 size_t new_size; 882 size_t new_size;
829 883
830 /* Optimization for empty strings */ 884 /* Optimization for empty strings */
831 if (length == 0 && unicode_empty != NULL) { 885 if (length == 0 && unicode_empty != NULL) {
832 Py_INCREF(unicode_empty); 886 Py_INCREF(unicode_empty);
833 return (PyUnicodeObject*)unicode_empty; 887 return (PyUnicodeObject*)unicode_empty;
834 } 888 }
835 889
836 /* Ensure we won't overflow the size. */ 890 /* Ensure we won't overflow the size. */
837 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
838 return (PyUnicodeObject *)PyErr_NoMemory(); 892 return (PyUnicodeObject *)PyErr_NoMemory();
839 } 893 }
840 if (length < 0) { 894 if (length < 0) {
841 PyErr_SetString(PyExc_SystemError, 895 PyErr_SetString(PyExc_SystemError,
842 "Negative size passed to _PyUnicode_New"); 896 "Negative size passed to _PyUnicode_New");
843 return NULL; 897 return NULL;
844 } 898 }
845 899
846 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
847 if (unicode == NULL) 901 if (unicode == NULL)
848 return NULL; 902 return NULL;
849 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
850 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 904
851 if (!_PyUnicode_WSTR(unicode)) {
852 Py_DECREF(unicode);
853 PyErr_NoMemory();
854 return NULL;
855 }
856
857 /* Initialize the first element to guard against cases where
858 * the caller fails before initializing str -- unicode_resize()
859 * reads str[0], and the Keep-Alive optimization can keep memory
860 * allocated for str alive across a call to unicode_dealloc(unicode).
861 * We don't want unicode_resize to read uninitialized memory in
862 * that case.
863 */
864 _PyUnicode_WSTR(unicode)[0] = 0;
865 _PyUnicode_WSTR(unicode)[length] = 0;
866 _PyUnicode_WSTR_LENGTH(unicode) = length; 905 _PyUnicode_WSTR_LENGTH(unicode) = length;
867 _PyUnicode_HASH(unicode) = -1; 906 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0; 907 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = 0; 908 _PyUnicode_STATE(unicode).kind = 0;
870 _PyUnicode_STATE(unicode).compact = 0; 909 _PyUnicode_STATE(unicode).compact = 0;
871 _PyUnicode_STATE(unicode).ready = 0; 910 _PyUnicode_STATE(unicode).ready = 0;
872 _PyUnicode_STATE(unicode).ascii = 0; 911 _PyUnicode_STATE(unicode).ascii = 0;
873 _PyUnicode_DATA_ANY(unicode) = NULL; 912 _PyUnicode_DATA_ANY(unicode) = NULL;
874 _PyUnicode_LENGTH(unicode) = 0; 913 _PyUnicode_LENGTH(unicode) = 0;
875 _PyUnicode_UTF8(unicode) = NULL; 914 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
919 Py_DECREF(unicode);
920 PyErr_NoMemory();
921 return NULL;
922 }
923
924 /* Initialize the first element to guard against cases where
925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
933
877 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
878 return unicode; 935 return unicode;
879 } 936 }
880 937
881 static const char* 938 static const char*
882 unicode_kind_name(PyObject *unicode) 939 unicode_kind_name(PyObject *unicode)
883 { 940 {
884 /* don't check consistency: unicode_kind_name() is called from 941 /* don't check consistency: unicode_kind_name() is called from
885 _PyUnicode_Dump() */ 942 _PyUnicode_Dump() */
886 if (!PyUnicode_IS_COMPACT(unicode)) 943 if (!PyUnicode_IS_COMPACT(unicode))
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after
1071 if (is_sharing) { 1128 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size; 1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 } 1131 }
1075 else { 1132 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL; 1134 _PyUnicode_WSTR(unicode) = NULL;
1078 } 1135 }
1079 } 1136 }
1080 #ifdef Py_DEBUG 1137 #ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier. 1138 unicode_fill_invalid((PyObject*)unicode, 0);
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086 #endif 1139 #endif
1087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1088 return obj; 1141 return obj;
1089 } 1142 }
1090 1143
1091 #if SIZEOF_WCHAR_T == 2 1144 #if SIZEOF_WCHAR_T == 2
1092 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1145 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1093 will decode surrogate pairs, the other conversions are implemented as macros 1146 will decode surrogate pairs, the other conversions are implemented as macros
1094 for efficiency. 1147 for efficiency.
1095 1148
(...skipping 409 matching lines...) Expand 10 before | Expand all | Expand 10 after
1505 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1506 #endif 1559 #endif
1507 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1508 } 1561 }
1509 _PyUnicode_STATE(unicode).ready = 1; 1562 _PyUnicode_STATE(unicode).ready = 1;
1510 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1563 assert(_PyUnicode_CheckConsistency(unicode, 1));
1511 return 0; 1564 return 0;
1512 } 1565 }
1513 1566
1514 static void 1567 static void
1515 unicode_dealloc(register PyObject *unicode) 1568 unicode_dealloc(PyObject *unicode)
1516 { 1569 {
1517 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1518 case SSTATE_NOT_INTERNED: 1571 case SSTATE_NOT_INTERNED:
1519 break; 1572 break;
1520 1573
1521 case SSTATE_INTERNED_MORTAL: 1574 case SSTATE_INTERNED_MORTAL:
1522 /* revive dead object temporarily for DelItem */ 1575 /* revive dead object temporarily for DelItem */
1523 Py_REFCNT(unicode) = 3; 1576 Py_REFCNT(unicode) = 3;
1524 if (PyDict_DelItem(interned, unicode) != 0) 1577 if (PyDict_DelItem(interned, unicode) != 0)
1525 Py_FatalError( 1578 Py_FatalError(
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
1637 } 1690 }
1638 unicode = *p_unicode; 1691 unicode = *p_unicode;
1639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1640 { 1693 {
1641 PyErr_BadInternalCall(); 1694 PyErr_BadInternalCall();
1642 return -1; 1695 return -1;
1643 } 1696 }
1644 return unicode_resize(p_unicode, length); 1697 return unicode_resize(p_unicode, length);
1645 } 1698 }
1646 1699
1647 static int
1648 unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
1650 {
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
1653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
1660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664 }
1665
1666 static int
1667 unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669 {
1670 assert(ch <= MAX_UNICODE);
1671 if (unicode_widen(p_unicode, *pos, ch) < 0)
1672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677 }
1678
1679 /* Copy an ASCII or latin1 char* string into a Python Unicode string. 1700 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1680 1701
1681 WARNING: The function doesn't copy the terminating null character and 1702 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an 1703 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */ 1704 ASCII string). */
1684 static void 1705 static void
1685 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1706 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len) 1707 const char *str, Py_ssize_t len)
1687 { 1708 {
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode); 1710 void *data = PyUnicode_DATA(unicode);
1690 const char *end = str + len; 1711 const char *end = str + len;
1691 1712
1692 switch (kind) { 1713 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: { 1714 case PyUnicode_1BYTE_KIND: {
1694 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1716 #ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723 #endif
1695 memcpy((char *) data + index, str, len); 1724 memcpy((char *) data + index, str, len);
1696 break; 1725 break;
1697 } 1726 }
1698 case PyUnicode_2BYTE_KIND: { 1727 case PyUnicode_2BYTE_KIND: {
1699 Py_UCS2 *start = (Py_UCS2 *)data + index; 1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1700 Py_UCS2 *ucs2 = start; 1729 Py_UCS2 *ucs2 = start;
1701 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1702 1731
1703 for (; str < end; ++ucs2, ++str) 1732 for (; str < end; ++ucs2, ++str)
1704 *ucs2 = (Py_UCS2)*str; 1733 *ucs2 = (Py_UCS2)*str;
1705 1734
1706 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1707 break; 1736 break;
1708 } 1737 }
1709 default: { 1738 default: {
1710 Py_UCS4 *start = (Py_UCS4 *)data + index; 1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1711 Py_UCS4 *ucs4 = start; 1740 Py_UCS4 *ucs4 = start;
1712 assert(kind == PyUnicode_4BYTE_KIND); 1741 assert(kind == PyUnicode_4BYTE_KIND);
1713 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1714 1743
1715 for (; str < end; ++ucs4, ++str) 1744 for (; str < end; ++ucs4, ++str)
1716 *ucs4 = (Py_UCS4)*str; 1745 *ucs4 = (Py_UCS4)*str;
1717 1746
1718 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1719 } 1748 }
1720 } 1749 }
1721 } 1750 }
1722
1723 1751
1724 static PyObject* 1752 static PyObject*
1725 get_latin1_char(unsigned char ch) 1753 get_latin1_char(unsigned char ch)
1726 { 1754 {
1727 PyObject *unicode = unicode_latin1[ch]; 1755 PyObject *unicode = unicode_latin1[ch];
1728 if (!unicode) { 1756 if (!unicode) {
1729 unicode = PyUnicode_New(1, ch); 1757 unicode = PyUnicode_New(1, ch);
1730 if (!unicode) 1758 if (!unicode)
1731 return NULL; 1759 return NULL;
1732 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1733 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1761 assert(_PyUnicode_CheckConsistency(unicode, 1));
1734 unicode_latin1[ch] = unicode; 1762 unicode_latin1[ch] = unicode;
1735 } 1763 }
1736 Py_INCREF(unicode); 1764 Py_INCREF(unicode);
1765 return unicode;
1766 }
1767
1768 static PyObject*
1769 unicode_char(Py_UCS4 ch)
1770 {
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
1775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
1778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1737 return unicode; 1793 return unicode;
1738 } 1794 }
1739 1795
1740 PyObject * 1796 PyObject *
1741 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1797 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1742 { 1798 {
1743 PyObject *unicode; 1799 PyObject *unicode;
1744 Py_UCS4 maxchar = 0; 1800 Py_UCS4 maxchar = 0;
1745 Py_ssize_t num_surrogates; 1801 Py_ssize_t num_surrogates;
1746 1802
(...skipping 181 matching lines...) Expand 10 before | Expand all | Expand 10 after
1928 1984
1929 static PyObject* 1985 static PyObject*
1930 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1986 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1931 { 1987 {
1932 PyObject *res; 1988 PyObject *res;
1933 Py_UCS2 max_char; 1989 Py_UCS2 max_char;
1934 1990
1935 if (size == 0) 1991 if (size == 0)
1936 _Py_RETURN_UNICODE_EMPTY(); 1992 _Py_RETURN_UNICODE_EMPTY();
1937 assert(size > 0); 1993 assert(size > 0);
1938 if (size == 1) { 1994 if (size == 1)
1939 Py_UCS4 ch = u[0]; 1995 return unicode_char(u[0]);
1940 if (ch < 256)
1941 return get_latin1_char((unsigned char)ch);
1942
1943 res = PyUnicode_New(1, ch);
1944 if (res == NULL)
1945 return NULL;
1946 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1947 assert(_PyUnicode_CheckConsistency(res, 1));
1948 return res;
1949 }
1950 1996
1951 max_char = ucs2lib_find_max_char(u, u + size); 1997 max_char = ucs2lib_find_max_char(u, u + size);
1952 res = PyUnicode_New(size, max_char); 1998 res = PyUnicode_New(size, max_char);
1953 if (!res) 1999 if (!res)
1954 return NULL; 2000 return NULL;
1955 if (max_char >= 256) 2001 if (max_char >= 256)
1956 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2002 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1957 else { 2003 else {
1958 _PyUnicode_CONVERT_BYTES( 2004 _PyUnicode_CONVERT_BYTES(
1959 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2005 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1960 } 2006 }
1961 assert(_PyUnicode_CheckConsistency(res, 1)); 2007 assert(_PyUnicode_CheckConsistency(res, 1));
1962 return res; 2008 return res;
1963 } 2009 }
1964 2010
1965 static PyObject* 2011 static PyObject*
1966 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2012 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1967 { 2013 {
1968 PyObject *res; 2014 PyObject *res;
1969 Py_UCS4 max_char; 2015 Py_UCS4 max_char;
1970 2016
1971 if (size == 0) 2017 if (size == 0)
1972 _Py_RETURN_UNICODE_EMPTY(); 2018 _Py_RETURN_UNICODE_EMPTY();
1973 assert(size > 0); 2019 assert(size > 0);
1974 if (size == 1) { 2020 if (size == 1)
1975 Py_UCS4 ch = u[0]; 2021 return unicode_char(u[0]);
1976 if (ch < 256)
1977 return get_latin1_char((unsigned char)ch);
1978
1979 res = PyUnicode_New(1, ch);
1980 if (res == NULL)
1981 return NULL;
1982 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1983 assert(_PyUnicode_CheckConsistency(res, 1));
1984 return res;
1985 }
1986 2022
1987 max_char = ucs4lib_find_max_char(u, u + size); 2023 max_char = ucs4lib_find_max_char(u, u + size);
1988 res = PyUnicode_New(size, max_char); 2024 res = PyUnicode_New(size, max_char);
1989 if (!res) 2025 if (!res)
1990 return NULL; 2026 return NULL;
1991 if (max_char < 256) 2027 if (max_char < 256)
1992 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2028 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1993 PyUnicode_1BYTE_DATA(res)); 2029 PyUnicode_1BYTE_DATA(res));
1994 else if (max_char < 0x10000) 2030 else if (max_char < 0x10000)
1995 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after
2251 2287
2252 Py_UCS4* 2288 Py_UCS4*
2253 PyUnicode_AsUCS4Copy(PyObject *string) 2289 PyUnicode_AsUCS4Copy(PyObject *string)
2254 { 2290 {
2255 return as_ucs4(string, NULL, 0, 1); 2291 return as_ucs4(string, NULL, 0, 1);
2256 } 2292 }
2257 2293
2258 #ifdef HAVE_WCHAR_H 2294 #ifdef HAVE_WCHAR_H
2259 2295
2260 PyObject * 2296 PyObject *
2261 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2297 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2262 { 2298 {
2263 if (w == NULL) { 2299 if (w == NULL) {
2264 if (size == 0) 2300 if (size == 0)
2265 _Py_RETURN_UNICODE_EMPTY(); 2301 _Py_RETURN_UNICODE_EMPTY();
2266 PyErr_BadInternalCall(); 2302 PyErr_BadInternalCall();
2267 return NULL; 2303 return NULL;
2268 } 2304 }
2269 2305
2270 if (size == -1) { 2306 if (size == -1) {
2271 size = wcslen(w); 2307 size = wcslen(w);
2272 } 2308 }
2273 2309
2274 return PyUnicode_FromUnicode(w, size); 2310 return PyUnicode_FromUnicode(w, size);
2275 } 2311 }
2276 2312
2277 #endif /* HAVE_WCHAR_H */ 2313 #endif /* HAVE_WCHAR_H */
2278 2314
2279 static void 2315 static void
2280 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2316 makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2281 int zeropad, int width, int precision, char c) 2317 char c)
2282 { 2318 {
2283 *fmt++ = '%'; 2319 *fmt++ = '%';
2284 if (width) {
2285 if (zeropad)
2286 *fmt++ = '0';
2287 fmt += sprintf(fmt, "%d", width);
2288 }
2289 if (precision)
2290 fmt += sprintf(fmt, ".%d", precision);
2291 if (longflag) 2320 if (longflag)
2292 *fmt++ = 'l'; 2321 *fmt++ = 'l';
2293 else if (longlongflag) { 2322 else if (longlongflag) {
2294 /* longlongflag should only ever be nonzero on machines with 2323 /* longlongflag should only ever be nonzero on machines with
2295 HAVE_LONG_LONG defined */ 2324 HAVE_LONG_LONG defined */
2296 #ifdef HAVE_LONG_LONG 2325 #ifdef HAVE_LONG_LONG
2297 char *f = PY_FORMAT_LONG_LONG; 2326 char *f = PY_FORMAT_LONG_LONG;
2298 while (*f) 2327 while (*f)
2299 *fmt++ = *f++; 2328 *fmt++ = *f++;
2300 #else 2329 #else
2301 /* we shouldn't ever get here */ 2330 /* we shouldn't ever get here */
2302 assert(0); 2331 assert(0);
2303 *fmt++ = 'l'; 2332 *fmt++ = 'l';
2304 #endif 2333 #endif
2305 } 2334 }
2306 else if (size_tflag) { 2335 else if (size_tflag) {
2307 char *f = PY_FORMAT_SIZE_T; 2336 char *f = PY_FORMAT_SIZE_T;
2308 while (*f) 2337 while (*f)
2309 *fmt++ = *f++; 2338 *fmt++ = *f++;
2310 } 2339 }
2311 *fmt++ = c; 2340 *fmt++ = c;
2312 *fmt = '\0'; 2341 *fmt = '\0';
2313 } 2342 }
2314 2343
2315 /* helper for PyUnicode_FromFormatV() */ 2344 /* maximum number of characters required for output of %lld or %p.
2345 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2346 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2347 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2348
2349 static int
2350 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2351 Py_ssize_t width, Py_ssize_t precision)
2352 {
2353 Py_ssize_t length, fill, arglen;
2354 Py_UCS4 maxchar;
2355
2356 if (PyUnicode_READY(str) == -1)
2357 return -1;
2358
2359 length = PyUnicode_GET_LENGTH(str);
2360 if ((precision == -1 || precision >= length)
2361 && width <= length)
2362 return _PyUnicodeWriter_WriteStr(writer, str);
2363
2364 if (precision != -1)
2365 length = Py_MIN(precision, length);
2366
2367 arglen = Py_MAX(length, width);
2368 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2369 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2370 else
2371 maxchar = writer->maxchar;
2372
2373 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2374 return -1;
2375
2376 if (width > length) {
2377 fill = width - length;
2378 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2379 return -1;
2380 writer->pos += fill;
2381 }
2382
2383 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2384 str, 0, length);
2385 writer->pos += length;
2386 return 0;
2387 }
2388
2389 static int
2390 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2391 Py_ssize_t width, Py_ssize_t precision)
2392 {
2393 /* UTF-8 */
2394 Py_ssize_t length;
2395 PyObject *unicode;
2396 int res;
2397
2398 length = strlen(str);
2399 if (precision != -1)
2400 length = Py_MIN(length, precision);
2401 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2402 if (unicode == NULL)
2403 return -1;
2404
2405 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2406 Py_DECREF(unicode);
2407 return res;
2408 }
2316 2409
2317 static const char* 2410 static const char*
2318 parse_format_flags(const char *f, 2411 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2319 int *p_width, int *p_precision, 2412 const char *f, va_list *vargs)
2320 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2413 {
2321 { 2414 const char *p;
2322 int width, precision, longflag, longlongflag, size_tflag; 2415 Py_ssize_t len;
2416 int zeropad;
2417 Py_ssize_t width;
2418 Py_ssize_t precision;
2419 int longflag;
2420 int longlongflag;
2421 int size_tflag;
2422 Py_ssize_t fill;
2423
2424 p = f;
2425 f++;
2426 zeropad = 0;
2427 if (*f == '0') {
2428 zeropad = 1;
2429 f++;
2430 }
2323 2431
2324 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2432 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2325 f++; 2433 width = -1;
2326 width = 0; 2434 if (Py_ISDIGIT((unsigned)*f)) {
2327 while (Py_ISDIGIT((unsigned)*f)) 2435 width = *f - '0';
2328 width = (width*10) + *f++ - '0'; 2436 f++;
2329 precision = 0; 2437 while (Py_ISDIGIT((unsigned)*f)) {
2438 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2439 PyErr_SetString(PyExc_ValueError,
2440 "width too big");
2441 return NULL;
2442 }
2443 width = (width * 10) + (*f - '0');
2444 f++;
2445 }
2446 }
2447 precision = -1;
2330 if (*f == '.') { 2448 if (*f == '.') {
2331 f++; 2449 f++;
2332 while (Py_ISDIGIT((unsigned)*f)) 2450 if (Py_ISDIGIT((unsigned)*f)) {
2333 precision = (precision*10) + *f++ - '0'; 2451 precision = (*f - '0');
2452 f++;
2453 while (Py_ISDIGIT((unsigned)*f)) {
2454 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2455 PyErr_SetString(PyExc_ValueError,
2456 "precision too big");
2457 return NULL;
2458 }
2459 precision = (precision * 10) + (*f - '0');
2460 f++;
2461 }
2462 }
2334 if (*f == '%') { 2463 if (*f == '%') {
2335 /* "%.3%s" => f points to "3" */ 2464 /* "%.3%s" => f points to "3" */
2336 f--; 2465 f--;
2337 } 2466 }
2338 } 2467 }
2339 if (*f == '\0') { 2468 if (*f == '\0') {
2340 /* bogus format "%.1" => go backward, f points to "1" */ 2469 /* bogus format "%.123" => go backward, f points to "3" */
2341 f--; 2470 f--;
2342 } 2471 }
2343 if (p_width != NULL)
2344 *p_width = width;
2345 if (p_precision != NULL)
2346 *p_precision = precision;
2347 2472
2348 /* Handle %ld, %lu, %lld and %llu. */ 2473 /* Handle %ld, %lu, %lld and %llu. */
2349 longflag = 0; 2474 longflag = 0;
2350 longlongflag = 0; 2475 longlongflag = 0;
2351 size_tflag = 0; 2476 size_tflag = 0;
2352
2353 if (*f == 'l') { 2477 if (*f == 'l') {
2354 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2478 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2355 longflag = 1; 2479 longflag = 1;
2356 ++f; 2480 ++f;
2357 } 2481 }
2358 #ifdef HAVE_LONG_LONG 2482 #ifdef HAVE_LONG_LONG
2359 else if (f[1] == 'l' && 2483 else if (f[1] == 'l' &&
2360 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2484 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2361 longlongflag = 1; 2485 longlongflag = 1;
2362 f += 2; 2486 f += 2;
2363 } 2487 }
2364 #endif 2488 #endif
2365 } 2489 }
2366 /* handle the size_t flag. */ 2490 /* handle the size_t flag. */
2367 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2491 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2368 size_tflag = 1; 2492 size_tflag = 1;
2369 ++f; 2493 ++f;
2370 } 2494 }
2371 if (p_longflag != NULL) 2495
2372 *p_longflag = longflag; 2496 if (f[1] == '\0')
2373 if (p_longlongflag != NULL) 2497 writer->overallocate = 0;
2374 *p_longlongflag = longlongflag; 2498
2375 if (p_size_tflag != NULL) 2499 switch (*f) {
2376 *p_size_tflag = size_tflag; 2500 case 'c':
2501 {
2502 int ordinal = va_arg(*vargs, int);
2503 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2504 PyErr_SetString(PyExc_OverflowError,
2505 "character argument not in range(0x110000)");
2506 return NULL;
2507 }
2508 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2509 return NULL;
2510 break;
2511 }
2512
2513 case 'i':
2514 case 'd':
2515 case 'u':
2516 case 'x':
2517 {
2518 /* used by sprintf */
2519 char fmt[10]; /* should be enough for "%0lld\0" */
2520 char buffer[MAX_LONG_LONG_CHARS];
2521 Py_ssize_t arglen;
2522
2523 if (*f == 'u') {
2524 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2525
2526 if (longflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned long));
2529 #ifdef HAVE_LONG_LONG
2530 else if (longlongflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, unsigned PY_LONG_LONG));
2533 #endif
2534 else if (size_tflag)
2535 len = sprintf(buffer, fmt,
2536 va_arg(*vargs, size_t));
2537 else
2538 len = sprintf(buffer, fmt,
2539 va_arg(*vargs, unsigned int));
2540 }
2541 else if (*f == 'x') {
2542 makefmt(fmt, 0, 0, 0, 'x');
2543 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2544 }
2545 else {
2546 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2547
2548 if (longflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, long));
2551 #ifdef HAVE_LONG_LONG
2552 else if (longlongflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, PY_LONG_LONG));
2555 #endif
2556 else if (size_tflag)
2557 len = sprintf(buffer, fmt,
2558 va_arg(*vargs, Py_ssize_t));
2559 else
2560 len = sprintf(buffer, fmt,
2561 va_arg(*vargs, int));
2562 }
2563 assert(len >= 0);
2564
2565 if (precision < len)
2566 precision = len;
2567
2568 arglen = Py_MAX(precision, width);
2569 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2570 return NULL;
2571
2572 if (width > precision) {
2573 Py_UCS4 fillchar;
2574 fill = width - precision;
2575 fillchar = zeropad?'0':' ';
2576 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2577 return NULL;
2578 writer->pos += fill;
2579 }
2580 if (precision > len) {
2581 fill = precision - len;
2582 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2583 return NULL;
2584 writer->pos += fill;
2585 }
2586
2587 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2588 return NULL;
2589 break;
2590 }
2591
2592 case 'p':
2593 {
2594 char number[MAX_LONG_LONG_CHARS];
2595
2596 len = sprintf(number, "%p", va_arg(*vargs, void*));
2597 assert(len >= 0);
2598
2599 /* %p is ill-defined: ensure leading 0x. */
2600 if (number[1] == 'X')
2601 number[1] = 'x';
2602 else if (number[1] != 'x') {
2603 memmove(number + 2, number,
2604 strlen(number) + 1);
2605 number[0] = '0';
2606 number[1] = 'x';
2607 len += 2;
2608 }
2609
2610 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2611 return NULL;
2612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
2619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2620 return NULL;
2621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
2629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
2638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
2640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2641 return NULL;
2642 }
2643 else {
2644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2646 return NULL;
2647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
2659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
2675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
2691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
2700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2701 return NULL;
2702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
2710 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
2377 return f; 2717 return f;
2378 } 2718 }
2379
2380 /* maximum number of characters required for output of %ld. 21 characters
2381 allows for 64-bit integers (in decimal) and an optional sign. */
2382 #define MAX_LONG_CHARS 21
2383 /* maximum number of characters required for output of %lld.
2384 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2385 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2386 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2387 2719
2388 PyObject * 2720 PyObject *
2389 PyUnicode_FromFormatV(const char *format, va_list vargs) 2721 PyUnicode_FromFormatV(const char *format, va_list vargs)
2390 { 2722 {
2391 va_list count; 2723 va_list vargs2;
2392 Py_ssize_t callcount = 0; 2724 const char *f;
2393 PyObject **callresults = NULL; 2725 _PyUnicodeWriter writer;
2394 PyObject **callresult = NULL; 2726
2395 Py_ssize_t n = 0; 2727 _PyUnicodeWriter_Init(&writer);
2396 int width = 0; 2728 writer.min_length = strlen(format) + 100;
2397 int precision = 0; 2729 writer.overallocate = 1;
2398 int zeropad; 2730
2399 const char* f; 2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2400 PyObject *string; 2732 Copy it to be able to pass a reference to a subfunction. */
2401 /* used by sprintf */ 2733 Py_VA_COPY(vargs2, vargs);
2402 char fmt[61]; /* should be enough for %0width.precisionlld */ 2734
2403 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2735 for (f = format; *f; ) {
2404 Py_UCS4 argmaxchar;
2405 Py_ssize_t numbersize = 0;
2406 char *numberresults = NULL;
2407 char *numberresult = NULL;
2408 Py_ssize_t i;
2409 int kind;
2410 void *data;
2411
2412 Py_VA_COPY(count, vargs);
2413 /* step 1: count the number of %S/%R/%A/%s format specifications
2414 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2415 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2416 * result in an array)
2417 * also estimate an upper bound for all the number formats in the string,
2418 * numbers will be formatted in step 3 and be kept in a '\0'-separated
2419 * buffer before putting everything together. */
2420 for (f = format; *f; f++) {
2421 if (*f == '%') { 2736 if (*f == '%') {
2422 int longlongflag; 2737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2423 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2738 if (f == NULL)
2424 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2739 goto fail;
2425 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2740 }
2426 ++callcount; 2741 else {
2427 2742 const char *p;
2428 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2743 Py_ssize_t len;
2429 #ifdef HAVE_LONG_LONG 2744
2430 if (longlongflag) { 2745 p = f;
2431 if (width < MAX_LONG_LONG_CHARS) 2746 do
2432 width = MAX_LONG_LONG_CHARS; 2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2433 } 2754 }
2434 else 2755 p++;
2435 #endif
2436 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2437 including sign. Decimal takes the most space. This
2438 isn't enough for octal. If a width is specified we
2439 need more (which we allocate later). */
2440 if (width < MAX_LONG_CHARS)
2441 width = MAX_LONG_CHARS;
2442
2443 /* account for the size + '\0' to separate numbers
2444 inside of the numberresults buffer */
2445 numbersize += (width + 1);
2446 } 2756 }
2447 } 2757 while (*p != '\0' && *p != '%');
2448 else if ((unsigned char)*f > 127) { 2758 len = p - f;
2449 PyErr_Format(PyExc_ValueError, 2759
2450 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2760 if (*p == '\0')
2451 "string, got a non-ASCII byte: 0x%02x", 2761 writer.overallocate = 0;
2452 (unsigned char)*f); 2762
2453 return NULL; 2763 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2454 } 2764 goto fail;
2455 } 2765
2456 /* step 2: allocate memory for the results of 2766 f = p;
2457 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2767 }
2458 if (callcount) { 2768 }
2459 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2769 return _PyUnicodeWriter_Finish(&writer);
2460 if (!callresults) { 2770
2461 PyErr_NoMemory();
2462 return NULL;
2463 }
2464 callresult = callresults;
2465 }
2466 /* step 2.5: allocate memory for the results of formatting numbers */
2467 if (numbersize) {
2468 numberresults = PyObject_Malloc(numbersize);
2469 if (!numberresults) {
2470 PyErr_NoMemory();
2471 goto fail;
2472 }
2473 numberresult = numberresults;
2474 }
2475
2476 /* step 3: format numbers and figure out how large a buffer we need */
2477 for (f = format; *f; f++) {
2478 if (*f == '%') {
2479 const char* p;
2480 int longflag;
2481 int longlongflag;
2482 int size_tflag;
2483 int numprinted;
2484
2485 p = f;
2486 zeropad = (f[1] == '0');
2487 f = parse_format_flags(f, &width, &precision,
2488 &longflag, &longlongflag, &size_tflag);
2489 switch (*f) {
2490 case 'c':
2491 {
2492 int ordinal = va_arg(count, int);
2493 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2494 PyErr_SetString(PyExc_OverflowError,
2495 "%c arg not in range(0x110000)");
2496 goto fail;
2497 }
2498 maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
2499 n++;
2500 break;
2501 }
2502 case '%':
2503 n++;
2504 break;
2505 case 'i':
2506 case 'd':
2507 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2508 width, precision, *f);
2509 if (longflag)
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, long));
2512 #ifdef HAVE_LONG_LONG
2513 else if (longlongflag)
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, PY_LONG_LONG));
2516 #endif
2517 else if (size_tflag)
2518 numprinted = sprintf(numberresult, fmt,
2519 va_arg(count, Py_ssize_t));
2520 else
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, int));
2523 n += numprinted;
2524 /* advance by +1 to skip over the '\0' */
2525 numberresult += (numprinted + 1);
2526 assert(*(numberresult - 1) == '\0');
2527 assert(*(numberresult - 2) != '\0');
2528 assert(numprinted >= 0);
2529 assert(numberresult <= numberresults + numbersize);
2530 break;
2531 case 'u':
2532 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2533 width, precision, 'u');
2534 if (longflag)
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned long));
2537 #ifdef HAVE_LONG_LONG
2538 else if (longlongflag)
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, unsigned PY_LONG_LONG));
2541 #endif
2542 else if (size_tflag)
2543 numprinted = sprintf(numberresult, fmt,
2544 va_arg(count, size_t));
2545 else
2546 numprinted = sprintf(numberresult, fmt,
2547 va_arg(count, unsigned int));
2548 n += numprinted;
2549 numberresult += (numprinted + 1);
2550 assert(*(numberresult - 1) == '\0');
2551 assert(*(numberresult - 2) != '\0');
2552 assert(numprinted >= 0);
2553 assert(numberresult <= numberresults + numbersize);
2554 break;
2555 case 'x':
2556 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2557 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2558 n += numprinted;
2559 numberresult += (numprinted + 1);
2560 assert(*(numberresult - 1) == '\0');
2561 assert(*(numberresult - 2) != '\0');
2562 assert(numprinted >= 0);
2563 assert(numberresult <= numberresults + numbersize);
2564 break;
2565 case 'p':
2566 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2567 /* %p is ill-defined: ensure leading 0x. */
2568 if (numberresult[1] == 'X')
2569 numberresult[1] = 'x';
2570 else if (numberresult[1] != 'x') {
2571 memmove(numberresult + 2, numberresult,
2572 strlen(numberresult) + 1);
2573 numberresult[0] = '0';
2574 numberresult[1] = 'x';
2575 numprinted += 2;
2576 }
2577 n += numprinted;
2578 numberresult += (numprinted + 1);
2579 assert(*(numberresult - 1) == '\0');
2580 assert(*(numberresult - 2) != '\0');
2581 assert(numprinted >= 0);
2582 assert(numberresult <= numberresults + numbersize);
2583 break;
2584 case 's':
2585 {
2586 /* UTF-8 */
2587 const char *s = va_arg(count, const char*);
2588 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2589 if (!str)
2590 goto fail;
2591 /* since PyUnicode_DecodeUTF8 returns already flexible
2592 unicode objects, there is no need to call ready on them */
2593 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2594 maxchar = Py_MAX(maxchar, argmaxchar);
2595 n += PyUnicode_GET_LENGTH(str);
2596 /* Remember the str and switch to the next slot */
2597 *callresult++ = str;
2598 break;
2599 }
2600 case 'U':
2601 {
2602 PyObject *obj = va_arg(count, PyObject *);
2603 assert(obj && _PyUnicode_CHECK(obj));
2604 if (PyUnicode_READY(obj) == -1)
2605 goto fail;
2606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2607 maxchar = Py_MAX(maxchar, argmaxchar);
2608 n += PyUnicode_GET_LENGTH(obj);
2609 break;
2610 }
2611 case 'V':
2612 {
2613 PyObject *obj = va_arg(count, PyObject *);
2614 const char *str = va_arg(count, const char *);
2615 PyObject *str_obj;
2616 assert(obj || str);
2617 assert(!obj || _PyUnicode_CHECK(obj));
2618 if (obj) {
2619 if (PyUnicode_READY(obj) == -1)
2620 goto fail;
2621 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2622 maxchar = Py_MAX(maxchar, argmaxchar);
2623 n += PyUnicode_GET_LENGTH(obj);
2624 *callresult++ = NULL;
2625 }
2626 else {
2627 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2628 if (!str_obj)
2629 goto fail;
2630 if (PyUnicode_READY(str_obj) == -1) {
2631 Py_DECREF(str_obj);
2632 goto fail;
2633 }
2634 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2635 maxchar = Py_MAX(maxchar, argmaxchar);
2636 n += PyUnicode_GET_LENGTH(str_obj);
2637 *callresult++ = str_obj;
2638 }
2639 break;
2640 }
2641 case 'S':
2642 {
2643 PyObject *obj = va_arg(count, PyObject *);
2644 PyObject *str;
2645 assert(obj);
2646 str = PyObject_Str(obj);
2647 if (!str)
2648 goto fail;
2649 if (PyUnicode_READY(str) == -1) {
2650 Py_DECREF(str);
2651 goto fail;
2652 }
2653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2654 maxchar = Py_MAX(maxchar, argmaxchar);
2655 n += PyUnicode_GET_LENGTH(str);
2656 /* Remember the str and switch to the next slot */
2657 *callresult++ = str;
2658 break;
2659 }
2660 case 'R':
2661 {
2662 PyObject *obj = va_arg(count, PyObject *);
2663 PyObject *repr;
2664 assert(obj);
2665 repr = PyObject_Repr(obj);
2666 if (!repr)
2667 goto fail;
2668 if (PyUnicode_READY(repr) == -1) {
2669 Py_DECREF(repr);
2670 goto fail;
2671 }
2672 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2673 maxchar = Py_MAX(maxchar, argmaxchar);
2674 n += PyUnicode_GET_LENGTH(repr);
2675 /* Remember the repr and switch to the next slot */
2676 *callresult++ = repr;
2677 break;
2678 }
2679 case 'A':
2680 {
2681 PyObject *obj = va_arg(count, PyObject *);
2682 PyObject *ascii;
2683 assert(obj);
2684 ascii = PyObject_ASCII(obj);
2685 if (!ascii)
2686 goto fail;
2687 if (PyUnicode_READY(ascii) == -1) {
2688 Py_DECREF(ascii);
2689 goto fail;
2690 }
2691 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2692 maxchar = Py_MAX(maxchar, argmaxchar);
2693 n += PyUnicode_GET_LENGTH(ascii);
2694 /* Remember the repr and switch to the next slot */
2695 *callresult++ = ascii;
2696 break;
2697 }
2698 default:
2699 /* if we stumble upon an unknown
2700 formatting code, copy the rest of
2701 the format string to the output
2702 string. (we cannot just skip the
2703 code, since there's no way to know
2704 what's in the argument list) */
2705 n += strlen(p);
2706 goto expand;
2707 }
2708 } else
2709 n++;
2710 }
2711 expand:
2712 /* step 4: fill the buffer */
2713 /* Since we've analyzed how much space we need,
2714 we don't have to resize the string.
2715 There can be no errors beyond this point. */
2716 string = PyUnicode_New(n, maxchar);
2717 if (!string)
2718 goto fail;
2719 kind = PyUnicode_KIND(string);
2720 data = PyUnicode_DATA(string);
2721 callresult = callresults;
2722 numberresult = numberresults;
2723
2724 for (i = 0, f = format; *f; f++) {
2725 if (*f == '%') {
2726 const char* p;
2727
2728 p = f;
2729 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2730 /* checking for == because the last argument could be a empty
2731 string, which causes i to point to end, the assert at the end of
2732 the loop */
2733 assert(i <= PyUnicode_GET_LENGTH(string));
2734
2735 switch (*f) {
2736 case 'c':
2737 {
2738 const int ordinal = va_arg(vargs, int);
2739 PyUnicode_WRITE(kind, data, i++, ordinal);
2740 break;
2741 }
2742 case 'i':
2743 case 'd':
2744 case 'u':
2745 case 'x':
2746 case 'p':
2747 {
2748 Py_ssize_t len;
2749 /* unused, since we already have the result */
2750 if (*f == 'p')
2751 (void) va_arg(vargs, void *);
2752 else
2753 (void) va_arg(vargs, int);
2754 /* extract the result from numberresults and append. */
2755 len = strlen(numberresult);
2756 unicode_write_cstr(string, i, numberresult, len);
2757 /* skip over the separating '\0' */
2758 i += len;
2759 numberresult += len;
2760 assert(*numberresult == '\0');
2761 numberresult++;
2762 assert(numberresult <= numberresults + numbersize);
2763 break;
2764 }
2765 case 's':
2766 {
2767 /* unused, since we already have the result */
2768 Py_ssize_t size;
2769 (void) va_arg(vargs, char *);
2770 size = PyUnicode_GET_LENGTH(*callresult);
2771 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2772 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2773 i += size;
2774 /* We're done with the unicode()/repr() => forget it */
2775 Py_DECREF(*callresult);
2776 /* switch to next unicode()/repr() result */
2777 ++callresult;
2778 break;
2779 }
2780 case 'U':
2781 {
2782 PyObject *obj = va_arg(vargs, PyObject *);
2783 Py_ssize_t size;
2784 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2785 size = PyUnicode_GET_LENGTH(obj);
2786 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2787 i += size;
2788 break;
2789 }
2790 case 'V':
2791 {
2792 Py_ssize_t size;
2793 PyObject *obj = va_arg(vargs, PyObject *);
2794 va_arg(vargs, const char *);
2795 if (obj) {
2796 size = PyUnicode_GET_LENGTH(obj);
2797 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2798 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2799 i += size;
2800 } else {
2801 size = PyUnicode_GET_LENGTH(*callresult);
2802 assert(PyUnicode_KIND(*callresult) <=
2803 PyUnicode_KIND(string));
2804 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2805 i += size;
2806 Py_DECREF(*callresult);
2807 }
2808 ++callresult;
2809 break;
2810 }
2811 case 'S':
2812 case 'R':
2813 case 'A':
2814 {
2815 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2816 /* unused, since we already have the result */
2817 (void) va_arg(vargs, PyObject *);
2818 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2819 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2820 i += size;
2821 /* We're done with the unicode()/repr() => forget it */
2822 Py_DECREF(*callresult);
2823 /* switch to next unicode()/repr() result */
2824 ++callresult;
2825 break;
2826 }
2827 case '%':
2828 PyUnicode_WRITE(kind, data, i++, '%');
2829 break;
2830 default:
2831 {
2832 Py_ssize_t len = strlen(p);
2833 unicode_write_cstr(string, i, p, len);
2834 i += len;
2835 assert(i == PyUnicode_GET_LENGTH(string));
2836 goto end;
2837 }
2838 }
2839 }
2840 else {
2841 assert(i < PyUnicode_GET_LENGTH(string));
2842 PyUnicode_WRITE(kind, data, i++, *f);
2843 }
2844 }
2845 assert(i == PyUnicode_GET_LENGTH(string));
2846
2847 end:
2848 if (callresults)
2849 PyObject_Free(callresults);
2850 if (numberresults)
2851 PyObject_Free(numberresults);
2852 return unicode_result(string);
2853 fail: 2771 fail:
2854 if (callresults) { 2772 _PyUnicodeWriter_Dealloc(&writer);
2855 PyObject **callresult2 = callresults;
2856 while (callresult2 < callresult) {
2857 Py_XDECREF(*callresult2);
2858 ++callresult2;
2859 }
2860 PyObject_Free(callresults);
2861 }
2862 if (numberresults)
2863 PyObject_Free(numberresults);
2864 return NULL; 2773 return NULL;
2865 } 2774 }
2866 2775
2867 PyObject * 2776 PyObject *
2868 PyUnicode_FromFormat(const char *format, ...) 2777 PyUnicode_FromFormat(const char *format, ...)
2869 { 2778 {
2870 PyObject* ret; 2779 PyObject* ret;
2871 va_list vargs; 2780 va_list vargs;
2872 2781
2873 #ifdef HAVE_STDARG_PROTOTYPES 2782 #ifdef HAVE_STDARG_PROTOTYPES
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after
2960 if (size != NULL) 2869 if (size != NULL)
2961 *size = buflen; 2870 *size = buflen;
2962 return buffer; 2871 return buffer;
2963 } 2872 }
2964 2873
2965 #endif /* HAVE_WCHAR_H */ 2874 #endif /* HAVE_WCHAR_H */
2966 2875
2967 PyObject * 2876 PyObject *
2968 PyUnicode_FromOrdinal(int ordinal) 2877 PyUnicode_FromOrdinal(int ordinal)
2969 { 2878 {
2970 PyObject *v;
2971 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2879 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2972 PyErr_SetString(PyExc_ValueError, 2880 PyErr_SetString(PyExc_ValueError,
2973 "chr() arg not in range(0x110000)"); 2881 "chr() arg not in range(0x110000)");
2974 return NULL; 2882 return NULL;
2975 } 2883 }
2976 2884
2977 if ((Py_UCS4)ordinal < 256) 2885 return unicode_char((Py_UCS4)ordinal);
2978 return get_latin1_char((unsigned char)ordinal);
2979
2980 v = PyUnicode_New(1, ordinal);
2981 if (v == NULL)
2982 return NULL;
2983 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2984 assert(_PyUnicode_CheckConsistency(v, 1));
2985 return v;
2986 } 2886 }
2987 2887
2988 PyObject * 2888 PyObject *
2989 PyUnicode_FromObject(register PyObject *obj) 2889 PyUnicode_FromObject(PyObject *obj)
2990 { 2890 {
2991 /* XXX Perhaps we should make this API an alias of 2891 /* XXX Perhaps we should make this API an alias of
2992 PyObject_Str() instead ?! */ 2892 PyObject_Str() instead ?! */
2993 if (PyUnicode_CheckExact(obj)) { 2893 if (PyUnicode_CheckExact(obj)) {
2994 if (PyUnicode_READY(obj) == -1) 2894 if (PyUnicode_READY(obj) == -1)
2995 return NULL; 2895 return NULL;
2996 Py_INCREF(obj); 2896 Py_INCREF(obj);
2997 return obj; 2897 return obj;
2998 } 2898 }
2999 if (PyUnicode_Check(obj)) { 2899 if (PyUnicode_Check(obj)) {
3000 /* For a Unicode subtype that's not a Unicode object, 2900 /* For a Unicode subtype that's not a Unicode object,
3001 return a true Unicode object with the same data. */ 2901 return a true Unicode object with the same data. */
3002 return _PyUnicode_Copy(obj); 2902 return _PyUnicode_Copy(obj);
3003 } 2903 }
3004 PyErr_Format(PyExc_TypeError, 2904 PyErr_Format(PyExc_TypeError,
3005 "Can't convert '%.100s' object to str implicitly", 2905 "Can't convert '%.100s' object to str implicitly",
3006 Py_TYPE(obj)->tp_name); 2906 Py_TYPE(obj)->tp_name);
3007 return NULL; 2907 return NULL;
3008 } 2908 }
3009 2909
3010 PyObject * 2910 PyObject *
3011 PyUnicode_FromEncodedObject(register PyObject *obj, 2911 PyUnicode_FromEncodedObject(PyObject *obj,
3012 const char *encoding, 2912 const char *encoding,
3013 const char *errors) 2913 const char *errors)
3014 { 2914 {
3015 Py_buffer buffer; 2915 Py_buffer buffer;
3016 PyObject *v; 2916 PyObject *v;
3017 2917
3018 if (obj == NULL) { 2918 if (obj == NULL) {
3019 PyErr_BadInternalCall(); 2919 PyErr_BadInternalCall();
3020 return NULL; 2920 return NULL;
3021 } 2921 }
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
3053 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2953 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3054 PyBuffer_Release(&buffer); 2954 PyBuffer_Release(&buffer);
3055 return v; 2955 return v;
3056 } 2956 }
3057 2957
3058 /* Convert encoding to lower case and replace '_' with '-' in order to 2958 /* Convert encoding to lower case and replace '_' with '-' in order to
3059 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2959 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
3060 1 on success. */ 2960 1 on success. */
3061 int 2961 int
3062 _Py_normalize_encoding(const char *encoding, 2962 _Py_normalize_encoding(const char *encoding,
3063 char *lower, 2963 char *lower,
3064 size_t lower_len) 2964 size_t lower_len)
3065 { 2965 {
3066 const char *e; 2966 const char *e;
3067 char *l; 2967 char *l;
3068 char *l_end; 2968 char *l_end;
3069 2969
3070 if (encoding == NULL) { 2970 if (encoding == NULL) {
2971 /* 6 == strlen("utf-8") + 1 */
2972 if (lower_len < 6)
2973 return 0;
3071 strcpy(lower, "utf-8"); 2974 strcpy(lower, "utf-8");
3072 return 1; 2975 return 1;
3073 } 2976 }
3074 e = encoding; 2977 e = encoding;
3075 l = lower; 2978 l = lower;
3076 l_end = &lower[lower_len - 1]; 2979 l_end = &lower[lower_len - 1];
3077 while (*e) { 2980 while (*e) {
3078 if (l == l_end) 2981 if (l == l_end)
3079 return 0; 2982 return 0;
3080 if (Py_ISUPPER(*e)) { 2983 if (Py_ISUPPER(*e)) {
(...skipping 21 matching lines...) Expand all
3102 Py_buffer info; 3005 Py_buffer info;
3103 char lower[11]; /* Enough for any encoding shortcut */ 3006 char lower[11]; /* Enough for any encoding shortcut */
3104 3007
3105 /* Shortcuts for common default encodings */ 3008 /* Shortcuts for common default encodings */
3106 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3009 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3107 if ((strcmp(lower, "utf-8") == 0) || 3010 if ((strcmp(lower, "utf-8") == 0) ||
3108 (strcmp(lower, "utf8") == 0)) 3011 (strcmp(lower, "utf8") == 0))
3109 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3012 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3110 else if ((strcmp(lower, "latin-1") == 0) || 3013 else if ((strcmp(lower, "latin-1") == 0) ||
3111 (strcmp(lower, "latin1") == 0) || 3014 (strcmp(lower, "latin1") == 0) ||
3112 (strcmp(lower, "iso-8859-1") == 0)) 3015 (strcmp(lower, "iso-8859-1") == 0) ||
3016 (strcmp(lower, "iso8859-1") == 0))
3113 return PyUnicode_DecodeLatin1(s, size, errors); 3017 return PyUnicode_DecodeLatin1(s, size, errors);
3114 #ifdef HAVE_MBCS 3018 #ifdef HAVE_MBCS
3115 else if (strcmp(lower, "mbcs") == 0) 3019 else if (strcmp(lower, "mbcs") == 0)
3116 return PyUnicode_DecodeMBCS(s, size, errors); 3020 return PyUnicode_DecodeMBCS(s, size, errors);
3117 #endif 3021 #endif
3118 else if (strcmp(lower, "ascii") == 0) 3022 else if (strcmp(lower, "ascii") == 0)
3119 return PyUnicode_DecodeASCII(s, size, errors); 3023 return PyUnicode_DecodeASCII(s, size, errors);
3120 else if (strcmp(lower, "utf-16") == 0) 3024 else if (strcmp(lower, "utf-16") == 0)
3121 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3025 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3122 else if (strcmp(lower, "utf-32") == 0) 3026 else if (strcmp(lower, "utf-32") == 0)
3123 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3027 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3124 } 3028 }
3125 3029
3126 /* Decode via the codec registry */ 3030 /* Decode via the codec registry */
3127 buffer = NULL; 3031 buffer = NULL;
3128 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3032 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3129 goto onError; 3033 goto onError;
3130 buffer = PyMemoryView_FromBuffer(&info); 3034 buffer = PyMemoryView_FromBuffer(&info);
3131 if (buffer == NULL) 3035 if (buffer == NULL)
3132 goto onError; 3036 goto onError;
3133 unicode = PyCodec_Decode(buffer, encoding, errors); 3037 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3134 if (unicode == NULL) 3038 if (unicode == NULL)
3135 goto onError; 3039 goto onError;
3136 if (!PyUnicode_Check(unicode)) { 3040 if (!PyUnicode_Check(unicode)) {
3137 PyErr_Format(PyExc_TypeError, 3041 PyErr_Format(PyExc_TypeError,
3138 "decoder did not return a str object (type=%.400s)", 3042 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3139 Py_TYPE(unicode)->tp_name); 3043 "use codecs.decode() to decode to arbitrary types",
3044 encoding,
3045 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3140 Py_DECREF(unicode); 3046 Py_DECREF(unicode);
3141 goto onError; 3047 goto onError;
3142 } 3048 }
3143 Py_DECREF(buffer); 3049 Py_DECREF(buffer);
3144 return unicode_result(unicode); 3050 return unicode_result(unicode);
3145 3051
3146 onError: 3052 onError:
3147 Py_XDECREF(buffer); 3053 Py_XDECREF(buffer);
3148 return NULL; 3054 return NULL;
3149 } 3055 }
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
3187 3093
3188 if (encoding == NULL) 3094 if (encoding == NULL)
3189 encoding = PyUnicode_GetDefaultEncoding(); 3095 encoding = PyUnicode_GetDefaultEncoding();
3190 3096
3191 /* Decode via the codec registry */ 3097 /* Decode via the codec registry */
3192 v = PyCodec_Decode(unicode, encoding, errors); 3098 v = PyCodec_Decode(unicode, encoding, errors);
3193 if (v == NULL) 3099 if (v == NULL)
3194 goto onError; 3100 goto onError;
3195 if (!PyUnicode_Check(v)) { 3101 if (!PyUnicode_Check(v)) {
3196 PyErr_Format(PyExc_TypeError, 3102 PyErr_Format(PyExc_TypeError,
3197 "decoder did not return a str object (type=%.400s)", 3103 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3198 Py_TYPE(v)->tp_name); 3104 "use codecs.decode() to decode to arbitrary types",
3105 encoding,
3106 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3199 Py_DECREF(v); 3107 Py_DECREF(v);
3200 goto onError; 3108 goto onError;
3201 } 3109 }
3202 return unicode_result(v); 3110 return unicode_result(v);
3203 3111
3204 onError: 3112 onError:
3205 return NULL; 3113 return NULL;
3206 } 3114 }
3207 3115
3208 PyObject * 3116 PyObject *
(...skipping 127 matching lines...) Expand 10 before | Expand all | Expand 10 after
3336 return NULL; 3244 return NULL;
3337 3245
3338 wlen2 = wcslen(wstr); 3246 wlen2 = wcslen(wstr);
3339 if (wlen2 != wlen) { 3247 if (wlen2 != wlen) {
3340 PyMem_Free(wstr); 3248 PyMem_Free(wstr);
3341 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3249 PyErr_SetString(PyExc_TypeError, "embedded null character");
3342 return NULL; 3250 return NULL;
3343 } 3251 }
3344 3252
3345 if (surrogateescape) { 3253 if (surrogateescape) {
3346 /* locale encoding with surrogateescape */ 3254 /* "surrogateescape" error handler */
3347 char *str; 3255 char *str;
3348 3256
3349 str = _Py_wchar2char(wstr, &error_pos); 3257 str = _Py_wchar2char(wstr, &error_pos);
3350 if (str == NULL) { 3258 if (str == NULL) {
3351 if (error_pos == (size_t)-1) { 3259 if (error_pos == (size_t)-1) {
3352 PyErr_NoMemory(); 3260 PyErr_NoMemory();
3353 PyMem_Free(wstr); 3261 PyMem_Free(wstr);
3354 return NULL; 3262 return NULL;
3355 } 3263 }
3356 else { 3264 else {
3357 goto encode_error; 3265 goto encode_error;
3358 } 3266 }
3359 } 3267 }
3360 PyMem_Free(wstr); 3268 PyMem_Free(wstr);
3361 3269
3362 bytes = PyBytes_FromString(str); 3270 bytes = PyBytes_FromString(str);
3363 PyMem_Free(str); 3271 PyMem_Free(str);
3364 } 3272 }
3365 else { 3273 else {
3274 /* strict mode */
3366 size_t len, len2; 3275 size_t len, len2;
3367 3276
3368 len = wcstombs(NULL, wstr, 0); 3277 len = wcstombs(NULL, wstr, 0);
3369 if (len == (size_t)-1) { 3278 if (len == (size_t)-1) {
3370 error_pos = (size_t)-1; 3279 error_pos = (size_t)-1;
3371 goto encode_error; 3280 goto encode_error;
3372 } 3281 }
3373 3282
3374 bytes = PyBytes_FromStringAndSize(NULL, len); 3283 bytes = PyBytes_FromStringAndSize(NULL, len);
3375 if (bytes == NULL) { 3284 if (bytes == NULL) {
(...skipping 18 matching lines...) Expand all
3394 error_pos = wcstombs_errorpos(wstr); 3303 error_pos = wcstombs_errorpos(wstr);
3395 3304
3396 PyMem_Free(wstr); 3305 PyMem_Free(wstr);
3397 Py_XDECREF(bytes); 3306 Py_XDECREF(bytes);
3398 3307
3399 if (errmsg != NULL) { 3308 if (errmsg != NULL) {
3400 size_t errlen; 3309 size_t errlen;
3401 wstr = _Py_char2wchar(errmsg, &errlen); 3310 wstr = _Py_char2wchar(errmsg, &errlen);
3402 if (wstr != NULL) { 3311 if (wstr != NULL) {
3403 reason = PyUnicode_FromWideChar(wstr, errlen); 3312 reason = PyUnicode_FromWideChar(wstr, errlen);
3404 PyMem_Free(wstr); 3313 PyMem_RawFree(wstr);
3405 } else 3314 } else
3406 errmsg = NULL; 3315 errmsg = NULL;
3407 } 3316 }
3408 if (errmsg == NULL) 3317 if (errmsg == NULL)
3409 reason = PyUnicode_FromString( 3318 reason = PyUnicode_FromString(
3410 "wcstombs() encountered an unencodable " 3319 "wcstombs() encountered an unencodable "
3411 "wide character"); 3320 "wide character");
3412 if (reason == NULL) 3321 if (reason == NULL)
3413 return NULL; 3322 return NULL;
3414 3323
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
3472 if ((strcmp(lower, "utf-8") == 0) || 3381 if ((strcmp(lower, "utf-8") == 0) ||
3473 (strcmp(lower, "utf8") == 0)) 3382 (strcmp(lower, "utf8") == 0))
3474 { 3383 {
3475 if (errors == NULL || strcmp(errors, "strict") == 0) 3384 if (errors == NULL || strcmp(errors, "strict") == 0)
3476 return _PyUnicode_AsUTF8String(unicode, NULL); 3385 return _PyUnicode_AsUTF8String(unicode, NULL);
3477 else 3386 else
3478 return _PyUnicode_AsUTF8String(unicode, errors); 3387 return _PyUnicode_AsUTF8String(unicode, errors);
3479 } 3388 }
3480 else if ((strcmp(lower, "latin-1") == 0) || 3389 else if ((strcmp(lower, "latin-1") == 0) ||
3481 (strcmp(lower, "latin1") == 0) || 3390 (strcmp(lower, "latin1") == 0) ||
3482 (strcmp(lower, "iso-8859-1") == 0)) 3391 (strcmp(lower, "iso-8859-1") == 0) ||
3392 (strcmp(lower, "iso8859-1") == 0))
3483 return _PyUnicode_AsLatin1String(unicode, errors); 3393 return _PyUnicode_AsLatin1String(unicode, errors);
3484 #ifdef HAVE_MBCS 3394 #ifdef HAVE_MBCS
3485 else if (strcmp(lower, "mbcs") == 0) 3395 else if (strcmp(lower, "mbcs") == 0)
3486 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3396 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3487 #endif 3397 #endif
3488 else if (strcmp(lower, "ascii") == 0) 3398 else if (strcmp(lower, "ascii") == 0)
3489 return _PyUnicode_AsASCIIString(unicode, errors); 3399 return _PyUnicode_AsASCIIString(unicode, errors);
3490 } 3400 }
3491 3401
3492 /* Encode via the codec registry */ 3402 /* Encode via the codec registry */
3493 v = PyCodec_Encode(unicode, encoding, errors); 3403 v = _PyCodec_EncodeText(unicode, encoding, errors);
3494 if (v == NULL) 3404 if (v == NULL)
3495 return NULL; 3405 return NULL;
3496 3406
3497 /* The normal path */ 3407 /* The normal path */
3498 if (PyBytes_Check(v)) 3408 if (PyBytes_Check(v))
3499 return v; 3409 return v;
3500 3410
3501 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3411 /* If the codec returns a buffer, raise a warning and convert to bytes */
3502 if (PyByteArray_Check(v)) { 3412 if (PyByteArray_Check(v)) {
3503 int error; 3413 int error;
3504 PyObject *b; 3414 PyObject *b;
3505 3415
3506 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3416 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3507 "encoder %s returned bytearray instead of bytes", 3417 "encoder %s returned bytearray instead of bytes; "
3418 "use codecs.encode() to encode to arbitrary types",
3508 encoding); 3419 encoding);
3509 if (error) { 3420 if (error) {
3510 Py_DECREF(v); 3421 Py_DECREF(v);
3511 return NULL; 3422 return NULL;
3512 } 3423 }
3513 3424
3514 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3425 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3515 Py_DECREF(v); 3426 Py_DECREF(v);
3516 return b; 3427 return b;
3517 } 3428 }
3518 3429
3519 PyErr_Format(PyExc_TypeError, 3430 PyErr_Format(PyExc_TypeError,
3520 "encoder did not return a bytes object (type=%.400s)", 3431 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3521 Py_TYPE(v)->tp_name); 3432 "use codecs.encode() to encode to arbitrary types",
3433 encoding,
3434 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3522 Py_DECREF(v); 3435 Py_DECREF(v);
3523 return NULL; 3436 return NULL;
3524 } 3437 }
3525 3438
3526 PyObject * 3439 PyObject *
3527 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3440 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3528 const char *encoding, 3441 const char *encoding,
3529 const char *errors) 3442 const char *errors)
3530 { 3443 {
3531 PyObject *v; 3444 PyObject *v;
3532 3445
3533 if (!PyUnicode_Check(unicode)) { 3446 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument(); 3447 PyErr_BadArgument();
3535 goto onError; 3448 goto onError;
3536 } 3449 }
3537 3450
3538 if (encoding == NULL) 3451 if (encoding == NULL)
3539 encoding = PyUnicode_GetDefaultEncoding(); 3452 encoding = PyUnicode_GetDefaultEncoding();
3540 3453
3541 /* Encode via the codec registry */ 3454 /* Encode via the codec registry */
3542 v = PyCodec_Encode(unicode, encoding, errors); 3455 v = PyCodec_Encode(unicode, encoding, errors);
3543 if (v == NULL) 3456 if (v == NULL)
3544 goto onError; 3457 goto onError;
3545 if (!PyUnicode_Check(v)) { 3458 if (!PyUnicode_Check(v)) {
3546 PyErr_Format(PyExc_TypeError, 3459 PyErr_Format(PyExc_TypeError,
3547 "encoder did not return an str object (type=%.400s)", 3460 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3548 Py_TYPE(v)->tp_name); 3461 "use codecs.encode() to encode to arbitrary types",
3462 encoding,
3463 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3549 Py_DECREF(v); 3464 Py_DECREF(v);
3550 goto onError; 3465 goto onError;
3551 } 3466 }
3552 return v; 3467 return v;
3553 3468
3554 onError: 3469 onError:
3555 return NULL; 3470 return NULL;
3556 } 3471 }
3557 3472
3558 static size_t 3473 static size_t
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
3601 PyObject *reason, *exc; 3516 PyObject *reason, *exc;
3602 3517
3603 if (locale_error_handler(errors, &surrogateescape) < 0) 3518 if (locale_error_handler(errors, &surrogateescape) < 0)
3604 return NULL; 3519 return NULL;
3605 3520
3606 if (str[len] != '\0' || len != strlen(str)) { 3521 if (str[len] != '\0' || len != strlen(str)) {
3607 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3522 PyErr_SetString(PyExc_TypeError, "embedded null character");
3608 return NULL; 3523 return NULL;
3609 } 3524 }
3610 3525
3611 if (surrogateescape) 3526 if (surrogateescape) {
3612 { 3527 /* "surrogateescape" error handler */
3613 wstr = _Py_char2wchar(str, &wlen); 3528 wstr = _Py_char2wchar(str, &wlen);
3614 if (wstr == NULL) { 3529 if (wstr == NULL) {
3615 if (wlen == (size_t)-1) 3530 if (wlen == (size_t)-1)
3616 PyErr_NoMemory(); 3531 PyErr_NoMemory();
3617 else 3532 else
3618 PyErr_SetFromErrno(PyExc_OSError); 3533 PyErr_SetFromErrno(PyExc_OSError);
3619 return NULL; 3534 return NULL;
3620 } 3535 }
3621 3536
3622 unicode = PyUnicode_FromWideChar(wstr, wlen); 3537 unicode = PyUnicode_FromWideChar(wstr, wlen);
3623 PyMem_Free(wstr); 3538 PyMem_RawFree(wstr);
3624 } 3539 }
3625 else { 3540 else {
3541 /* strict mode */
3626 #ifndef HAVE_BROKEN_MBSTOWCS 3542 #ifndef HAVE_BROKEN_MBSTOWCS
3627 wlen = mbstowcs(NULL, str, 0); 3543 wlen = mbstowcs(NULL, str, 0);
3628 #else 3544 #else
3629 wlen = len; 3545 wlen = len;
3630 #endif 3546 #endif
3631 if (wlen == (size_t)-1) 3547 if (wlen == (size_t)-1)
3632 goto decode_error; 3548 goto decode_error;
3633 if (wlen+1 <= smallbuf_len) { 3549 if (wlen+1 <= smallbuf_len) {
3634 wstr = smallbuf; 3550 wstr = smallbuf;
3635 } 3551 }
3636 else { 3552 else {
3637 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3553 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3638 return PyErr_NoMemory(); 3554 return PyErr_NoMemory();
3639 3555
3640 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3556 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3641 if (!wstr) 3557 if (!wstr)
3642 return PyErr_NoMemory(); 3558 return PyErr_NoMemory();
3643 } 3559 }
3644 3560
3645 /* This shouldn't fail now */
3646 wlen2 = mbstowcs(wstr, str, wlen+1); 3561 wlen2 = mbstowcs(wstr, str, wlen+1);
3647 if (wlen2 == (size_t)-1) { 3562 if (wlen2 == (size_t)-1) {
3648 if (wstr != smallbuf) 3563 if (wstr != smallbuf)
3649 PyMem_Free(wstr); 3564 PyMem_Free(wstr);
3650 goto decode_error; 3565 goto decode_error;
3651 } 3566 }
3652 #ifdef HAVE_BROKEN_MBSTOWCS 3567 #ifdef HAVE_BROKEN_MBSTOWCS
3653 assert(wlen2 == wlen); 3568 assert(wlen2 == wlen);
3654 #endif 3569 #endif
3655 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3570 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3656 if (wstr != smallbuf) 3571 if (wstr != smallbuf)
3657 PyMem_Free(wstr); 3572 PyMem_Free(wstr);
3658 } 3573 }
3659 return unicode; 3574 return unicode;
3660 3575
3661 decode_error: 3576 decode_error:
3662 errmsg = strerror(errno); 3577 errmsg = strerror(errno);
3663 assert(errmsg != NULL); 3578 assert(errmsg != NULL);
3664 3579
3665 error_pos = mbstowcs_errorpos(str, len); 3580 error_pos = mbstowcs_errorpos(str, len);
3666 if (errmsg != NULL) { 3581 if (errmsg != NULL) {
3667 size_t errlen; 3582 size_t errlen;
3668 wstr = _Py_char2wchar(errmsg, &errlen); 3583 wstr = _Py_char2wchar(errmsg, &errlen);
3669 if (wstr != NULL) { 3584 if (wstr != NULL) {
3670 reason = PyUnicode_FromWideChar(wstr, errlen); 3585 reason = PyUnicode_FromWideChar(wstr, errlen);
3671 PyMem_Free(wstr); 3586 PyMem_RawFree(wstr);
3672 } else 3587 } else
3673 errmsg = NULL; 3588 errmsg = NULL;
3674 } 3589 }
3675 if (errmsg == NULL) 3590 if (errmsg == NULL)
3676 reason = PyUnicode_FromString( 3591 reason = PyUnicode_FromString(
3677 "mbstowcs() encountered an invalid multibyte sequence"); 3592 "mbstowcs() encountered an invalid multibyte sequence");
3678 if (reason == NULL) 3593 if (reason == NULL)
3679 return NULL; 3594 return NULL;
3680 3595
3681 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3596 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
3729 "surrogateescape"); 3644 "surrogateescape");
3730 } 3645 }
3731 else { 3646 else {
3732 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3647 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3733 } 3648 }
3734 #endif 3649 #endif
3735 } 3650 }
3736 3651
3737 3652
3738 int 3653 int
3739 _PyUnicode_HasNULChars(PyObject* s) 3654 _PyUnicode_HasNULChars(PyObject* str)
3740 { 3655 {
3741 static PyObject *nul = NULL; 3656 Py_ssize_t pos;
3742 3657
3743 if (nul == NULL) 3658 if (PyUnicode_READY(str) == -1)
3744 nul = PyUnicode_FromStringAndSize("\0", 1);
3745 if (nul == NULL)
3746 return -1; 3659 return -1;
3747 return PyUnicode_Contains(s, nul); 3660 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3748 } 3661 PyUnicode_GET_LENGTH(str), '\0', 1);
3749 3662 if (pos == -1)
3663 return 0;
3664 else
3665 return 1;
3666 }
3750 3667
3751 int 3668 int
3752 PyUnicode_FSConverter(PyObject* arg, void* addr) 3669 PyUnicode_FSConverter(PyObject* arg, void* addr)
3753 { 3670 {
3754 PyObject *output = NULL; 3671 PyObject *output = NULL;
3755 Py_ssize_t size; 3672 Py_ssize_t size;
3756 void *data; 3673 void *data;
3757 if (arg == NULL) { 3674 if (arg == NULL) {
3758 Py_DECREF(*(PyObject**)addr); 3675 Py_DECREF(*(PyObject**)addr);
3759 return 1; 3676 return 1;
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
3844 if (PyUnicode_READY(unicode) == -1) 3761 if (PyUnicode_READY(unicode) == -1)
3845 return NULL; 3762 return NULL;
3846 3763
3847 if (PyUnicode_UTF8(unicode) == NULL) { 3764 if (PyUnicode_UTF8(unicode) == NULL) {
3848 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3765 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3849 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3766 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3850 if (bytes == NULL) 3767 if (bytes == NULL)
3851 return NULL; 3768 return NULL;
3852 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3769 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3853 if (_PyUnicode_UTF8(unicode) == NULL) { 3770 if (_PyUnicode_UTF8(unicode) == NULL) {
3771 PyErr_NoMemory();
3854 Py_DECREF(bytes); 3772 Py_DECREF(bytes);
3855 return NULL; 3773 return NULL;
3856 } 3774 }
3857 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3775 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3858 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3776 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3859 PyBytes_AS_STRING(bytes), 3777 PyBytes_AS_STRING(bytes),
3860 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3778 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3861 Py_DECREF(bytes); 3779 Py_DECREF(bytes);
3862 } 3780 }
3863 3781
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after
4012 return -1; 3930 return -1;
4013 } 3931 }
4014 if (PyUnicode_READY(unicode) == -1) 3932 if (PyUnicode_READY(unicode) == -1)
4015 return -1; 3933 return -1;
4016 return PyUnicode_GET_LENGTH(unicode); 3934 return PyUnicode_GET_LENGTH(unicode);
4017 } 3935 }
4018 3936
4019 Py_UCS4 3937 Py_UCS4
4020 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3938 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4021 { 3939 {
3940 void *data;
3941 int kind;
3942
4022 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3943 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4023 PyErr_BadArgument(); 3944 PyErr_BadArgument();
4024 return (Py_UCS4)-1; 3945 return (Py_UCS4)-1;
4025 } 3946 }
4026 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3947 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4027 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3948 PyErr_SetString(PyExc_IndexError, "string index out of range");
4028 return (Py_UCS4)-1; 3949 return (Py_UCS4)-1;
4029 } 3950 }
4030 return PyUnicode_READ_CHAR(unicode, index); 3951 data = PyUnicode_DATA(unicode);
3952 kind = PyUnicode_KIND(unicode);
3953 return PyUnicode_READ(kind, data, index);
4031 } 3954 }
4032 3955
4033 int 3956 int
4034 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3957 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4035 { 3958 {
4036 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3959 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4037 PyErr_BadArgument(); 3960 PyErr_BadArgument();
4038 return -1; 3961 return -1;
4039 } 3962 }
4040 assert(PyUnicode_IS_READY(unicode)); 3963 assert(PyUnicode_IS_READY(unicode));
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
4079 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4002 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4080 goto onError; 4003 goto onError;
4081 } 4004 }
4082 return; 4005 return;
4083 4006
4084 onError: 4007 onError:
4085 Py_DECREF(*exceptionObject); 4008 Py_DECREF(*exceptionObject);
4086 *exceptionObject = NULL; 4009 *exceptionObject = NULL;
4087 } 4010 }
4088 4011
4012 #ifdef HAVE_MBCS
4089 /* error handling callback helper: 4013 /* error handling callback helper:
4090 build arguments, call the callback and check the arguments, 4014 build arguments, call the callback and check the arguments,
4091 if no exception occurred, copy the replacement to the output 4015 if no exception occurred, copy the replacement to the output
4092 and adjust various state variables. 4016 and adjust various state variables.
4093 return 0 on success, -1 on error 4017 return 0 on success, -1 on error
4094 */ 4018 */
4095 4019
4096 static int 4020 static int
4097 unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4021 unicode_decode_call_errorhandler_wchar(
4098 const char *encoding, const char *reason, 4022 const char *errors, PyObject **errorHandler,
4099 const char **input, const char **inend, Py_ssize_t *startinpos, 4023 const char *encoding, const char *reason,
4100 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4024 const char **input, const char **inend, Py_ssize_t *startinpos,
4101 PyObject **output, Py_ssize_t *outpos) 4025 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4026 PyObject **output, Py_ssize_t *outpos)
4102 { 4027 {
4103 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4028 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4104 4029
4105 PyObject *restuple = NULL; 4030 PyObject *restuple = NULL;
4106 PyObject *repunicode = NULL; 4031 PyObject *repunicode = NULL;
4107 Py_ssize_t outsize; 4032 Py_ssize_t outsize;
4108 Py_ssize_t insize; 4033 Py_ssize_t insize;
4109 Py_ssize_t requiredsize; 4034 Py_ssize_t requiredsize;
4110 Py_ssize_t newpos; 4035 Py_ssize_t newpos;
4111 PyObject *inputobj = NULL; 4036 PyObject *inputobj = NULL;
4112 int res = -1; 4037 wchar_t *repwstr;
4113 4038 Py_ssize_t repwlen;
4114 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4039
4115 outsize = PyUnicode_GET_LENGTH(*output); 4040 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4116 else 4041 outsize = _PyUnicode_WSTR_LENGTH(*output);
4117 outsize = _PyUnicode_WSTR_LENGTH(*output);
4118 4042
4119 if (*errorHandler == NULL) { 4043 if (*errorHandler == NULL) {
4120 *errorHandler = PyCodec_LookupError(errors); 4044 *errorHandler = PyCodec_LookupError(errors);
4121 if (*errorHandler == NULL) 4045 if (*errorHandler == NULL)
4122 goto onError; 4046 goto onError;
4123 } 4047 }
4124 4048
4125 make_decode_exception(exceptionObject, 4049 make_decode_exception(exceptionObject,
4126 encoding, 4050 encoding,
4127 *input, *inend - *input, 4051 *input, *inend - *input,
4128 *startinpos, *endinpos, 4052 *startinpos, *endinpos,
4129 reason); 4053 reason);
4130 if (*exceptionObject == NULL) 4054 if (*exceptionObject == NULL)
4131 goto onError; 4055 goto onError;
4132 4056
4133 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4057 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4134 if (restuple == NULL) 4058 if (restuple == NULL)
4135 goto onError; 4059 goto onError;
4136 if (!PyTuple_Check(restuple)) { 4060 if (!PyTuple_Check(restuple)) {
4137 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4061 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4138 goto onError; 4062 goto onError;
4139 } 4063 }
4140 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4064 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4141 goto onError; 4065 goto onError;
4142 if (PyUnicode_READY(repunicode) == -1)
4143 goto onError;
4144 4066
4145 /* Copy back the bytes variables, which might have been modified by the 4067 /* Copy back the bytes variables, which might have been modified by the
4146 callback */ 4068 callback */
4147 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4069 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4148 if (!inputobj) 4070 if (!inputobj)
4149 goto onError; 4071 goto onError;
4150 if (!PyBytes_Check(inputobj)) { 4072 if (!PyBytes_Check(inputobj)) {
4151 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4073 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4152 } 4074 }
4153 *input = PyBytes_AS_STRING(inputobj); 4075 *input = PyBytes_AS_STRING(inputobj);
4154 insize = PyBytes_GET_SIZE(inputobj); 4076 insize = PyBytes_GET_SIZE(inputobj);
4155 *inend = *input + insize; 4077 *inend = *input + insize;
4156 /* we can DECREF safely, as the exception has another reference, 4078 /* we can DECREF safely, as the exception has another reference,
4157 so the object won't go away. */ 4079 so the object won't go away. */
4158 Py_DECREF(inputobj); 4080 Py_DECREF(inputobj);
4159 4081
4160 if (newpos<0) 4082 if (newpos<0)
4161 newpos = insize+newpos; 4083 newpos = insize+newpos;
4162 if (newpos<0 || newpos>insize) { 4084 if (newpos<0 || newpos>insize) {
4163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4085 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4164 goto onError; 4086 goto onError;
4165 } 4087 }
4166 4088
4167 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4089 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4168 /* need more space? (at least enough for what we 4090 if (repwstr == NULL)
4169 have+the replacement+the rest of the string (starting 4091 goto onError;
4170 at the new input position), so we won't have to check space 4092 /* need more space? (at least enough for what we
4171 when there are no errors in the rest of the string) */ 4093 have+the replacement+the rest of the string (starting
4172 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4094 at the new input position), so we won't have to check space
4173 requiredsize = *outpos + replen + insize-newpos; 4095 when there are no errors in the rest of the string) */
4174 if (requiredsize > outsize) { 4096 requiredsize = *outpos + repwlen + insize-newpos;
4175 if (requiredsize<2*outsize) 4097 if (requiredsize > outsize) {
4176 requiredsize = 2*outsize; 4098 if (requiredsize < 2*outsize)
4177 if (unicode_resize(output, requiredsize) < 0) 4099 requiredsize = 2*outsize;
4178 goto onError; 4100 if (unicode_resize(output, requiredsize) < 0)
4179 }
4180 if (unicode_widen(output, *outpos,
4181 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4182 goto onError; 4101 goto onError;
4183 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4102 }
4184 *outpos += replen; 4103 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4185 } 4104 *outpos += repwlen;
4186 else { 4105
4187 wchar_t *repwstr;
4188 Py_ssize_t repwlen;
4189 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4190 if (repwstr == NULL)
4191 goto onError;
4192 /* need more space? (at least enough for what we
4193 have+the replacement+the rest of the string (starting
4194 at the new input position), so we won't have to check space
4195 when there are no errors in the rest of the string) */
4196 requiredsize = *outpos + repwlen + insize-newpos;
4197 if (requiredsize > outsize) {
4198 if (requiredsize < 2*outsize)
4199 requiredsize = 2*outsize;
4200 if (unicode_resize(output, requiredsize) < 0)
4201 goto onError;
4202 }
4203 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4204 *outpos += repwlen;
4205 }
4206 *endinpos = newpos; 4106 *endinpos = newpos;
4207 *inptr = *input + newpos; 4107 *inptr = *input + newpos;
4208 4108
4209 /* we made it! */ 4109 /* we made it! */
4210 res = 0; 4110 Py_XDECREF(restuple);
4111 return 0;
4211 4112
4212 onError: 4113 onError:
4213 Py_XDECREF(restuple); 4114 Py_XDECREF(restuple);
4214 return res; 4115 return -1;
4116 }
4117 #endif /* HAVE_MBCS */
4118
4119 static int
4120 unicode_decode_call_errorhandler_writer(
4121 const char *errors, PyObject **errorHandler,
4122 const char *encoding, const char *reason,
4123 const char **input, const char **inend, Py_ssize_t *startinpos,
4124 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4125 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4126 {
4127 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4128
4129 PyObject *restuple = NULL;
4130 PyObject *repunicode = NULL;
4131 Py_ssize_t insize;
4132 Py_ssize_t newpos;
4133 Py_ssize_t replen;
4134 PyObject *inputobj = NULL;
4135
4136 if (*errorHandler == NULL) {
4137 *errorHandler = PyCodec_LookupError(errors);
4138 if (*errorHandler == NULL)
4139 goto onError;
4140 }
4141
4142 make_decode_exception(exceptionObject,
4143 encoding,
4144 *input, *inend - *input,
4145 *startinpos, *endinpos,
4146 reason);
4147 if (*exceptionObject == NULL)
4148 goto onError;
4149
4150 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4151 if (restuple == NULL)
4152 goto onError;
4153 if (!PyTuple_Check(restuple)) {
4154 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4155 goto onError;
4156 }
4157 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4158 goto onError;
4159
4160 /* Copy back the bytes variables, which might have been modified by the
4161 callback */
4162 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4163 if (!inputobj)
4164 goto onError;
4165 if (!PyBytes_Check(inputobj)) {
4166 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4167 }
4168 *input = PyBytes_AS_STRING(inputobj);
4169 insize = PyBytes_GET_SIZE(inputobj);
4170 *inend = *input + insize;
4171 /* we can DECREF safely, as the exception has another reference,
4172 so the object won't go away. */
4173 Py_DECREF(inputobj);
4174
4175 if (newpos<0)
4176 newpos = insize+newpos;
4177 if (newpos<0 || newpos>insize) {
4178 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4179 goto onError;
4180 }
4181
4182 if (PyUnicode_READY(repunicode) < 0)
4183 goto onError;
4184 replen = PyUnicode_GET_LENGTH(repunicode);
4185 writer->min_length += replen;
4186 if (replen > 1)
4187 writer->overallocate = 1;
4188 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4189 goto onError;
4190
4191 *endinpos = newpos;
4192 *inptr = *input + newpos;
4193
4194 /* we made it! */
4195 Py_XDECREF(restuple);
4196 return 0;
4197
4198 onError:
4199 Py_XDECREF(restuple);
4200 return -1;
4215 } 4201 }
4216 4202
4217 /* --- UTF-7 Codec -------------------------------------------------------- */ 4203 /* --- UTF-7 Codec -------------------------------------------------------- */
4218 4204
4219 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4205 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4220 4206
4221 /* Three simple macros defining base-64. */ 4207 /* Three simple macros defining base-64. */
4222 4208
4223 /* Is c a base-64 character? */ 4209 /* Is c a base-64 character? */
4224 4210
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
4312 4298
4313 PyObject * 4299 PyObject *
4314 PyUnicode_DecodeUTF7Stateful(const char *s, 4300 PyUnicode_DecodeUTF7Stateful(const char *s,
4315 Py_ssize_t size, 4301 Py_ssize_t size,
4316 const char *errors, 4302 const char *errors,
4317 Py_ssize_t *consumed) 4303 Py_ssize_t *consumed)
4318 { 4304 {
4319 const char *starts = s; 4305 const char *starts = s;
4320 Py_ssize_t startinpos; 4306 Py_ssize_t startinpos;
4321 Py_ssize_t endinpos; 4307 Py_ssize_t endinpos;
4322 Py_ssize_t outpos;
4323 const char *e; 4308 const char *e;
4324 PyObject *unicode; 4309 _PyUnicodeWriter writer;
4325 const char *errmsg = ""; 4310 const char *errmsg = "";
4326 int inShift = 0; 4311 int inShift = 0;
4327 Py_ssize_t shiftOutStart; 4312 Py_ssize_t shiftOutStart;
4328 unsigned int base64bits = 0; 4313 unsigned int base64bits = 0;
4329 unsigned long base64buffer = 0; 4314 unsigned long base64buffer = 0;
4330 Py_UCS4 surrogate = 0; 4315 Py_UCS4 surrogate = 0;
4331 PyObject *errorHandler = NULL; 4316 PyObject *errorHandler = NULL;
4332 PyObject *exc = NULL; 4317 PyObject *exc = NULL;
4333 4318
4334 /* Start off assuming it's all ASCII. Widen later as necessary. */
4335 unicode = PyUnicode_New(size, 127);
4336 if (!unicode)
4337 return NULL;
4338 if (size == 0) { 4319 if (size == 0) {
4339 if (consumed) 4320 if (consumed)
4340 *consumed = 0; 4321 *consumed = 0;
4341 return unicode; 4322 _Py_RETURN_UNICODE_EMPTY();
4342 } 4323 }
4343 4324
4344 shiftOutStart = outpos = 0; 4325 /* Start off assuming it's all ASCII. Widen later as necessary. */
4326 _PyUnicodeWriter_Init(&writer);
4327 writer.min_length = size;
4328
4329 shiftOutStart = 0;
4345 e = s + size; 4330 e = s + size;
4346 4331
4347 while (s < e) { 4332 while (s < e) {
4348 Py_UCS4 ch; 4333 Py_UCS4 ch;
4349 restart: 4334 restart:
4350 ch = (unsigned char) *s; 4335 ch = (unsigned char) *s;
4351 4336
4352 if (inShift) { /* in a base-64 section */ 4337 if (inShift) { /* in a base-64 section */
4353 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4338 if (IS_BASE64(ch)) { /* consume a base-64 character */
4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4339 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4355 base64bits += 6; 4340 base64bits += 6;
4356 s++; 4341 s++;
4357 if (base64bits >= 16) { 4342 if (base64bits >= 16) {
4358 /* we have enough bits for a UTF-16 value */ 4343 /* we have enough bits for a UTF-16 value */
4359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4344 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4360 base64bits -= 16; 4345 base64bits -= 16;
4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4346 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4362 assert(outCh <= 0xffff); 4347 assert(outCh <= 0xffff);
4363 if (surrogate) { 4348 if (surrogate) {
4364 /* expecting a second surrogate */ 4349 /* expecting a second surrogate */
4365 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4350 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4366 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4351 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4367 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4368 goto onError; 4353 goto onError;
4369 surrogate = 0; 4354 surrogate = 0;
4370 continue; 4355 continue;
4371 } 4356 }
4372 else { 4357 else {
4373 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4358 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4374 goto onError; 4359 goto onError;
4375 surrogate = 0; 4360 surrogate = 0;
4376 } 4361 }
4377 } 4362 }
4378 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4363 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4379 /* first surrogate */ 4364 /* first surrogate */
4380 surrogate = outCh; 4365 surrogate = outCh;
4381 } 4366 }
4382 else { 4367 else {
4383 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4368 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4384 goto onError; 4369 goto onError;
4385 } 4370 }
4386 } 4371 }
4387 } 4372 }
4388 else { /* now leaving a base-64 section */ 4373 else { /* now leaving a base-64 section */
4389 inShift = 0; 4374 inShift = 0;
4390 s++; 4375 s++;
4391 if (surrogate) { 4376 if (surrogate) {
4392 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4377 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4393 goto onError; 4378 goto onError;
4394 surrogate = 0; 4379 surrogate = 0;
4395 } 4380 }
4396 if (base64bits > 0) { /* left-over bits */ 4381 if (base64bits > 0) { /* left-over bits */
4397 if (base64bits >= 6) { 4382 if (base64bits >= 6) {
4398 /* We've seen at least one base-64 character */ 4383 /* We've seen at least one base-64 character */
4399 errmsg = "partial character in shift sequence"; 4384 errmsg = "partial character in shift sequence";
4400 goto utf7Error; 4385 goto utf7Error;
4401 } 4386 }
4402 else { 4387 else {
4403 /* Some bits remain; they should be zero */ 4388 /* Some bits remain; they should be zero */
4404 if (base64buffer != 0) { 4389 if (base64buffer != 0) {
4405 errmsg = "non-zero padding bits in shift sequence"; 4390 errmsg = "non-zero padding bits in shift sequence";
4406 goto utf7Error; 4391 goto utf7Error;
4407 } 4392 }
4408 } 4393 }
4409 } 4394 }
4410 if (ch != '-') { 4395 if (ch != '-') {
4411 /* '-' is absorbed; other terminating 4396 /* '-' is absorbed; other terminating
4412 characters are preserved */ 4397 characters are preserved */
4413 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4398 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4414 goto onError; 4399 goto onError;
4415 } 4400 }
4416 } 4401 }
4417 } 4402 }
4418 else if ( ch == '+' ) { 4403 else if ( ch == '+' ) {
4419 startinpos = s-starts; 4404 startinpos = s-starts;
4420 s++; /* consume '+' */ 4405 s++; /* consume '+' */
4421 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4406 if (s < e && *s == '-') { /* '+-' encodes '+' */
4422 s++; 4407 s++;
4423 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4408 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4424 goto onError; 4409 goto onError;
4425 } 4410 }
4426 else { /* begin base64-encoded section */ 4411 else { /* begin base64-encoded section */
4427 inShift = 1; 4412 inShift = 1;
4428 shiftOutStart = outpos; 4413 shiftOutStart = writer.pos;
4429 base64bits = 0; 4414 base64bits = 0;
4430 base64buffer = 0; 4415 base64buffer = 0;
4431 } 4416 }
4432 } 4417 }
4433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4434 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4419 s++;
4420 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4435 goto onError; 4421 goto onError;
4436 s++;
4437 } 4422 }
4438 else { 4423 else {
4439 startinpos = s-starts; 4424 startinpos = s-starts;
4440 s++; 4425 s++;
4441 errmsg = "unexpected special character"; 4426 errmsg = "unexpected special character";
4442 goto utf7Error; 4427 goto utf7Error;
4443 } 4428 }
4444 continue; 4429 continue;
4445 utf7Error: 4430 utf7Error:
4446 endinpos = s-starts; 4431 endinpos = s-starts;
4447 if (unicode_decode_call_errorhandler( 4432 if (unicode_decode_call_errorhandler_writer(
4448 errors, &errorHandler, 4433 errors, &errorHandler,
4449 "utf7", errmsg, 4434 "utf7", errmsg,
4450 &starts, &e, &startinpos, &endinpos, &exc, &s, 4435 &starts, &e, &startinpos, &endinpos, &exc, &s,
4451 &unicode, &outpos)) 4436 &writer))
4452 goto onError; 4437 goto onError;
4453 } 4438 }
4454 4439
4455 /* end of string */ 4440 /* end of string */
4456 4441
4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4458 /* if we're in an inconsistent state, that's an error */ 4443 /* if we're in an inconsistent state, that's an error */
4459 if (surrogate || 4444 if (surrogate ||
4460 (base64bits >= 6) || 4445 (base64bits >= 6) ||
4461 (base64bits > 0 && base64buffer != 0)) { 4446 (base64bits > 0 && base64buffer != 0)) {
4462 endinpos = size; 4447 endinpos = size;
4463 if (unicode_decode_call_errorhandler( 4448 if (unicode_decode_call_errorhandler_writer(
4464 errors, &errorHandler, 4449 errors, &errorHandler,
4465 "utf7", "unterminated shift sequence", 4450 "utf7", "unterminated shift sequence",
4466 &starts, &e, &startinpos, &endinpos, &exc, &s, 4451 &starts, &e, &startinpos, &endinpos, &exc, &s,
4467 &unicode, &outpos)) 4452 &writer))
4468 goto onError; 4453 goto onError;
4469 if (s < e) 4454 if (s < e)
4470 goto restart; 4455 goto restart;
4471 } 4456 }
4472 } 4457 }
4473 4458
4474 /* return state */ 4459 /* return state */
4475 if (consumed) { 4460 if (consumed) {
4476 if (inShift) { 4461 if (inShift) {
4477 *consumed = startinpos; 4462 *consumed = startinpos;
4478 if (outpos != shiftOutStart && 4463 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4479 PyUnicode_MAX_CHAR_VALUE(unicode) > 127) {
4480 PyObject *result = PyUnicode_FromKindAndData( 4464 PyObject *result = PyUnicode_FromKindAndData(
4481 PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4465 writer.kind, writer.data, shiftOutStart);
4482 shiftOutStart); 4466 Py_XDECREF(errorHandler);
4483 Py_DECREF(unicode); 4467 Py_XDECREF(exc);
4484 unicode = result; 4468 _PyUnicodeWriter_Dealloc(&writer);
4469 return result;
4485 } 4470 }
4486 outpos = shiftOutStart; /* back off output */ 4471 writer.pos = shiftOutStart; /* back off output */
4487 } 4472 }
4488 else { 4473 else {
4489 *consumed = s-starts; 4474 *consumed = s-starts;
4490 } 4475 }
4491 } 4476 }
4492 4477
4493 if (unicode_resize(&unicode, outpos) < 0)
4494 goto onError;
4495
4496 Py_XDECREF(errorHandler); 4478 Py_XDECREF(errorHandler);
4497 Py_XDECREF(exc); 4479 Py_XDECREF(exc);
4498 return unicode_result(unicode); 4480 return _PyUnicodeWriter_Finish(&writer);
4499 4481
4500 onError: 4482 onError:
4501 Py_XDECREF(errorHandler); 4483 Py_XDECREF(errorHandler);
4502 Py_XDECREF(exc); 4484 Py_XDECREF(exc);
4503 Py_DECREF(unicode); 4485 _PyUnicodeWriter_Dealloc(&writer);
4504 return NULL; 4486 return NULL;
4505 } 4487 }
4506 4488
4507 4489
4508 PyObject * 4490 PyObject *
4509 _PyUnicode_EncodeUTF7(PyObject *str, 4491 _PyUnicode_EncodeUTF7(PyObject *str,
4510 int base64SetO, 4492 int base64SetO,
4511 int base64WhiteSpace, 4493 int base64WhiteSpace,
4512 const char *errors) 4494 const char *errors)
4513 { 4495 {
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
4576 goto encode_char; 4558 goto encode_char;
4577 } 4559 }
4578 } 4560 }
4579 continue; 4561 continue;
4580 encode_char: 4562 encode_char:
4581 if (ch >= 0x10000) { 4563 if (ch >= 0x10000) {
4582 assert(ch <= MAX_UNICODE); 4564 assert(ch <= MAX_UNICODE);
4583 4565
4584 /* code first surrogate */ 4566 /* code first surrogate */
4585 base64bits += 16; 4567 base64bits += 16;
4586 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4568 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4587 while (base64bits >= 6) { 4569 while (base64bits >= 6) {
4588 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4570 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4589 base64bits -= 6; 4571 base64bits -= 6;
4590 } 4572 }
4591 /* prepare second surrogate */ 4573 /* prepare second surrogate */
4592 ch = Py_UNICODE_LOW_SURROGATE(ch); 4574 ch = Py_UNICODE_LOW_SURROGATE(ch);
4593 } 4575 }
4594 base64bits += 16; 4576 base64bits += 16;
4595 base64buffer = (base64buffer << 16) | ch; 4577 base64buffer = (base64buffer << 16) | ch;
4596 while (base64bits >= 6) { 4578 while (base64bits >= 6) {
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
4677 * long are only aligned at 2-byte boundaries. Therefore the assert() 4659 * long are only aligned at 2-byte boundaries. Therefore the assert()
4678 * won't work; also, tests have shown that skipping the "optimised 4660 * won't work; also, tests have shown that skipping the "optimised
4679 * version" will even speed up m68k. 4661 * version" will even speed up m68k.
4680 */ 4662 */
4681 #if !defined(__m68k__) 4663 #if !defined(__m68k__)
4682 #if SIZEOF_LONG <= SIZEOF_VOID_P 4664 #if SIZEOF_LONG <= SIZEOF_VOID_P
4683 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4665 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4684 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4666 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4685 /* Fast path, see in STRINGLIB(utf8_decode) for 4667 /* Fast path, see in STRINGLIB(utf8_decode) for
4686 an explanation. */ 4668 an explanation. */
4687 /* Help register allocation */ 4669 /* Help allocation */
4688 register const char *_p = p; 4670 const char *_p = p;
4689 register Py_UCS1 * q = dest; 4671 Py_UCS1 * q = dest;
4690 while (_p < aligned_end) { 4672 while (_p < aligned_end) {
4691 unsigned long value = *(const unsigned long *) _p; 4673 unsigned long value = *(const unsigned long *) _p;
4692 if (value & ASCII_CHAR_MASK) 4674 if (value & ASCII_CHAR_MASK)
4693 break; 4675 break;
4694 *((unsigned long *)q) = value; 4676 *((unsigned long *)q) = value;
4695 _p += SIZEOF_LONG; 4677 _p += SIZEOF_LONG;
4696 q += SIZEOF_LONG; 4678 q += SIZEOF_LONG;
4697 } 4679 }
4698 p = _p; 4680 p = _p;
4699 while (p < end) { 4681 while (p < end) {
4700 if ((unsigned char)*p & 0x80) 4682 if ((unsigned char)*p & 0x80)
4701 break; 4683 break;
4702 *q++ = *p++; 4684 *q++ = *p++;
4703 } 4685 }
4704 return p - start; 4686 return p - start;
4705 } 4687 }
4706 #endif 4688 #endif
4707 #endif 4689 #endif
4708 while (p < end) { 4690 while (p < end) {
4709 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4691 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4710 for an explanation. */ 4692 for an explanation. */
4711 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4693 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4712 /* Help register allocation */ 4694 /* Help allocation */
4713 register const char *_p = p; 4695 const char *_p = p;
4714 while (_p < aligned_end) { 4696 while (_p < aligned_end) {
4715 unsigned long value = *(unsigned long *) _p; 4697 unsigned long value = *(unsigned long *) _p;
4716 if (value & ASCII_CHAR_MASK) 4698 if (value & ASCII_CHAR_MASK)
4717 break; 4699 break;
4718 _p += SIZEOF_LONG; 4700 _p += SIZEOF_LONG;
4719 } 4701 }
4720 p = _p; 4702 p = _p;
4721 if (_p == end) 4703 if (_p == end)
4722 break; 4704 break;
4723 } 4705 }
4724 if ((unsigned char)*p & 0x80) 4706 if ((unsigned char)*p & 0x80)
4725 break; 4707 break;
4726 ++p; 4708 ++p;
4727 } 4709 }
4728 memcpy(dest, start, p - start); 4710 memcpy(dest, start, p - start);
4729 return p - start; 4711 return p - start;
4730 } 4712 }
4731 4713
4732 PyObject * 4714 PyObject *
4733 PyUnicode_DecodeUTF8Stateful(const char *s, 4715 PyUnicode_DecodeUTF8Stateful(const char *s,
4734 Py_ssize_t size, 4716 Py_ssize_t size,
4735 const char *errors, 4717 const char *errors,
4736 Py_ssize_t *consumed) 4718 Py_ssize_t *consumed)
4737 { 4719 {
4738 PyObject *unicode; 4720 _PyUnicodeWriter writer;
4739 const char *starts = s; 4721 const char *starts = s;
4740 const char *end = s + size; 4722 const char *end = s + size;
4741 Py_ssize_t outpos;
4742 4723
4743 Py_ssize_t startinpos; 4724 Py_ssize_t startinpos;
4744 Py_ssize_t endinpos; 4725 Py_ssize_t endinpos;
4745 const char *errmsg = ""; 4726 const char *errmsg = "";
4746 PyObject *errorHandler = NULL; 4727 PyObject *errorHandler = NULL;
4747 PyObject *exc = NULL; 4728 PyObject *exc = NULL;
4748 4729
4749 if (size == 0) { 4730 if (size == 0) {
4750 if (consumed) 4731 if (consumed)
4751 *consumed = 0; 4732 *consumed = 0;
4752 _Py_RETURN_UNICODE_EMPTY(); 4733 _Py_RETURN_UNICODE_EMPTY();
4753 } 4734 }
4754 4735
4755 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4756 if (size == 1 && (unsigned char)s[0] < 128) { 4737 if (size == 1 && (unsigned char)s[0] < 128) {
4757 if (consumed) 4738 if (consumed)
4758 *consumed = 1; 4739 *consumed = 1;
4759 return get_latin1_char((unsigned char)s[0]); 4740 return get_latin1_char((unsigned char)s[0]);
4760 } 4741 }
4761 4742
4762 unicode = PyUnicode_New(size, 127); 4743 _PyUnicodeWriter_Init(&writer);
4763 if (!unicode) 4744 writer.min_length = size;
4764 return NULL; 4745 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4765 4746 goto onError;
4766 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4747
4767 s += outpos; 4748 writer.pos = ascii_decode(s, end, writer.data);
4749 s += writer.pos;
4768 while (s < end) { 4750 while (s < end) {
4769 Py_UCS4 ch; 4751 Py_UCS4 ch;
4770 int kind = PyUnicode_KIND(unicode); 4752 int kind = writer.kind;
4771 if (kind == PyUnicode_1BYTE_KIND) { 4753 if (kind == PyUnicode_1BYTE_KIND) {
4772 if (PyUnicode_IS_ASCII(unicode)) 4754 if (PyUnicode_IS_ASCII(writer.buffer))
4773 ch = asciilib_utf8_decode(&s, end, 4755 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4774 PyUnicode_1BYTE_DATA(unicode), &outpos);
4775 else 4756 else
4776 ch = ucs1lib_utf8_decode(&s, end, 4757 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4777 PyUnicode_1BYTE_DATA(unicode), &outpos);
4778 } else if (kind == PyUnicode_2BYTE_KIND) { 4758 } else if (kind == PyUnicode_2BYTE_KIND) {
4779 ch = ucs2lib_utf8_decode(&s, end, 4759 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4780 PyUnicode_2BYTE_DATA(unicode), &outpos);
4781 } else { 4760 } else {
4782 assert(kind == PyUnicode_4BYTE_KIND); 4761 assert(kind == PyUnicode_4BYTE_KIND);
4783 ch = ucs4lib_utf8_decode(&s, end, 4762 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4784 PyUnicode_4BYTE_DATA(unicode), &outpos);
4785 } 4763 }
4786 4764
4787 switch (ch) { 4765 switch (ch) {
4788 case 0: 4766 case 0:
4789 if (s == end || consumed) 4767 if (s == end || consumed)
4790 goto End; 4768 goto End;
4791 errmsg = "unexpected end of data"; 4769 errmsg = "unexpected end of data";
4792 startinpos = s - starts; 4770 startinpos = s - starts;
4793 endinpos = end - starts; 4771 endinpos = end - starts;
4794 break; 4772 break;
4795 case 1: 4773 case 1:
4796 errmsg = "invalid start byte"; 4774 errmsg = "invalid start byte";
4797 startinpos = s - starts; 4775 startinpos = s - starts;
4798 endinpos = startinpos + 1; 4776 endinpos = startinpos + 1;
4799 break; 4777 break;
4800 case 2: 4778 case 2:
4801 case 3: 4779 case 3:
4802 case 4: 4780 case 4:
4803 errmsg = "invalid continuation byte"; 4781 errmsg = "invalid continuation byte";
4804 startinpos = s - starts; 4782 startinpos = s - starts;
4805 endinpos = startinpos + ch - 1; 4783 endinpos = startinpos + ch - 1;
4806 break; 4784 break;
4807 default: 4785 default:
4808 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4786 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4809 goto onError; 4787 goto onError;
4810 continue; 4788 continue;
4811 } 4789 }
4812 4790
4813 if (unicode_decode_call_errorhandler( 4791 if (unicode_decode_call_errorhandler_writer(
4814 errors, &errorHandler, 4792 errors, &errorHandler,
4815 "utf-8", errmsg, 4793 "utf-8", errmsg,
4816 &starts, &end, &startinpos, &endinpos, &exc, &s, 4794 &starts, &end, &startinpos, &endinpos, &exc, &s,
4817 &unicode, &outpos)) 4795 &writer))
4818 goto onError; 4796 goto onError;
4819 } 4797 }
4820 4798
4821 End: 4799 End:
4822 if (unicode_resize(&unicode, outpos) < 0)
4823 goto onError;
4824
4825 if (consumed) 4800 if (consumed)
4826 *consumed = s - starts; 4801 *consumed = s - starts;
4827 4802
4828 Py_XDECREF(errorHandler); 4803 Py_XDECREF(errorHandler);
4829 Py_XDECREF(exc); 4804 Py_XDECREF(exc);
4830 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4805 return _PyUnicodeWriter_Finish(&writer);
4831 return unicode;
4832 4806
4833 onError: 4807 onError:
4834 Py_XDECREF(errorHandler); 4808 Py_XDECREF(errorHandler);
4835 Py_XDECREF(exc); 4809 Py_XDECREF(exc);
4836 Py_XDECREF(unicode); 4810 _PyUnicodeWriter_Dealloc(&writer);
4837 return NULL; 4811 return NULL;
4838 } 4812 }
4839 4813
4840 #ifdef __APPLE__ 4814 #ifdef __APPLE__
4841 4815
4842 /* Simplified UTF-8 decoder using surrogateescape error handler, 4816 /* Simplified UTF-8 decoder using surrogateescape error handler,
4843 used to decode the command line arguments on Mac OS X. 4817 used to decode the command line arguments on Mac OS X.
4844 4818
4845 Return a pointer to a newly allocated wide character string (use 4819 Return a pointer to a newly allocated wide character string (use
4846 PyMem_Free() to free the memory), or NULL on memory allocation error. */ 4820 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4847 4821
4848 wchar_t* 4822 wchar_t*
4849 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4823 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4850 { 4824 {
4851 const char *e; 4825 const char *e;
4852 wchar_t *unicode; 4826 wchar_t *unicode;
4853 Py_ssize_t outpos; 4827 Py_ssize_t outpos;
4854 4828
4855 /* Note: size will always be longer than the resulting Unicode 4829 /* Note: size will always be longer than the resulting Unicode
4856 character count */ 4830 character count */
4857 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) 4831 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4858 return NULL; 4832 return NULL;
4859 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4833 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4860 if (!unicode) 4834 if (!unicode)
4861 return NULL; 4835 return NULL;
4862 4836
4863 /* Unpack UTF-8 encoded data */ 4837 /* Unpack UTF-8 encoded data */
4864 e = s + size; 4838 e = s + size;
4865 outpos = 0; 4839 outpos = 0;
4866 while (s < e) { 4840 while (s < e) {
4867 Py_UCS4 ch; 4841 Py_UCS4 ch;
4868 #if SIZEOF_WCHAR_T == 4 4842 #if SIZEOF_WCHAR_T == 4
4869 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4843 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
4972 PyObject * 4946 PyObject *
4973 PyUnicode_DecodeUTF32Stateful(const char *s, 4947 PyUnicode_DecodeUTF32Stateful(const char *s,
4974 Py_ssize_t size, 4948 Py_ssize_t size,
4975 const char *errors, 4949 const char *errors,
4976 int *byteorder, 4950 int *byteorder,
4977 Py_ssize_t *consumed) 4951 Py_ssize_t *consumed)
4978 { 4952 {
4979 const char *starts = s; 4953 const char *starts = s;
4980 Py_ssize_t startinpos; 4954 Py_ssize_t startinpos;
4981 Py_ssize_t endinpos; 4955 Py_ssize_t endinpos;
4982 Py_ssize_t outpos; 4956 _PyUnicodeWriter writer;
4983 PyObject *unicode;
4984 const unsigned char *q, *e; 4957 const unsigned char *q, *e;
4985 int bo = 0; /* assume native ordering by default */ 4958 int le, bo = 0; /* assume native ordering by default */
4959 const char *encoding;
4986 const char *errmsg = ""; 4960 const char *errmsg = "";
4987 /* Offsets from q for retrieving bytes in the right order. */
4988 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
4989 int iorder[] = {0, 1, 2, 3};
4990 #else
4991 int iorder[] = {3, 2, 1, 0};
4992 #endif
4993 PyObject *errorHandler = NULL; 4961 PyObject *errorHandler = NULL;
4994 PyObject *exc = NULL; 4962 PyObject *exc = NULL;
4995 4963
4996 q = (unsigned char *)s; 4964 q = (unsigned char *)s;
4997 e = q + size; 4965 e = q + size;
4998 4966
4999 if (byteorder) 4967 if (byteorder)
5000 bo = *byteorder; 4968 bo = *byteorder;
5001 4969
5002 /* Check for BOM marks (U+FEFF) in the input and adjust current 4970 /* Check for BOM marks (U+FEFF) in the input and adjust current
5003 byte order setting accordingly. In native mode, the leading BOM 4971 byte order setting accordingly. In native mode, the leading BOM
5004 mark is skipped, in all other modes, it is copied to the output 4972 mark is skipped, in all other modes, it is copied to the output
5005 stream as-is (giving a ZWNBSP character). */ 4973 stream as-is (giving a ZWNBSP character). */
5006 if (bo == 0) { 4974 if (bo == 0 && size >= 4) {
5007 if (size >= 4) { 4975 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5008 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4976 if (bom == 0x0000FEFF) {
5009 (q[iorder[1]] << 8) | q[iorder[0]]; 4977 bo = -1;
5010 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 4978 q += 4;
5011 if (bom == 0x0000FEFF) { 4979 }
4980 else if (bom == 0xFFFE0000) {
4981 bo = 1;
4982 q += 4;
4983 }
4984 if (byteorder)
4985 *byteorder = bo;
4986 }
4987
4988 if (q == e) {
4989 if (consumed)
4990 *consumed = size;
4991 _Py_RETURN_UNICODE_EMPTY();
4992 }
4993
4994 #ifdef WORDS_BIGENDIAN
4995 le = bo < 0;
4996 #else
4997 le = bo <= 0;
4998 #endif
4999 encoding = le ? "utf-32-le" : "utf-32-be";
5000
5001 _PyUnicodeWriter_Init(&writer);
5002 writer.min_length = (e - q + 3) / 4;
5003 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5004 goto onError;
5005
5006 while (1) {
5007 Py_UCS4 ch = 0;
5008 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5009
5010 if (e - q >= 4) {
5011 enum PyUnicode_Kind kind = writer.kind;
5012 void *data = writer.data;
5013 const unsigned char *last = e - 4;
5014 Py_ssize_t pos = writer.pos;
5015 if (le) {
5016 do {
5017 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5018 if (ch > maxch)
5019 break;
5020 if (kind != PyUnicode_1BYTE_KIND &&
5021 Py_UNICODE_IS_SURROGATE(ch))
5022 break;
5023 PyUnicode_WRITE(kind, data, pos++, ch);
5024 q += 4;
5025 } while (q <= last);
5026 }
5027 else {
5028 do {
5029 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5030 if (ch > maxch)
5031 break;
5032 if (kind != PyUnicode_1BYTE_KIND &&
5033 Py_UNICODE_IS_SURROGATE(ch))
5034 break;
5035 PyUnicode_WRITE(kind, data, pos++, ch);
5036 q += 4;
5037 } while (q <= last);
5038 }
5039 writer.pos = pos;
5040 }
5041
5042 if (Py_UNICODE_IS_SURROGATE(ch)) {
5043 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5044 startinpos = ((const char *)q) - starts;
5045 endinpos = startinpos + 4;
5046 }
5047 else if (ch <= maxch) {
5048 if (q == e || consumed)
5049 break;
5050 /* remaining bytes at the end? (size should be divisible by 4) */
5051 errmsg = "truncated data";
5052 startinpos = ((const char *)q) - starts;
5053 endinpos = ((const char *)e) - starts;
5054 }
5055 else {
5056 if (ch < 0x110000) {
5057 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5058 goto onError;
5012 q += 4; 5059 q += 4;
5013 bo = -1; 5060 continue;
5014 } 5061 }
5015 else if (bom == 0xFFFE0000) {
5016 q += 4;
5017 bo = 1;
5018 }
5019 #else
5020 if (bom == 0x0000FEFF) {
5021 q += 4;
5022 bo = 1;
5023 }
5024 else if (bom == 0xFFFE0000) {
5025 q += 4;
5026 bo = -1;
5027 }
5028 #endif
5029 }
5030 }
5031
5032 if (bo == -1) {
5033 /* force LE */
5034 iorder[0] = 0;
5035 iorder[1] = 1;
5036 iorder[2] = 2;
5037 iorder[3] = 3;
5038 }
5039 else if (bo == 1) {
5040 /* force BE */
5041 iorder[0] = 3;
5042 iorder[1] = 2;
5043 iorder[2] = 1;
5044 iorder[3] = 0;
5045 }
5046
5047 /* This might be one to much, because of a BOM */
5048 unicode = PyUnicode_New((size+3)/4, 127);
5049 if (!unicode)
5050 return NULL;
5051 if (size == 0)
5052 return unicode;
5053 outpos = 0;
5054
5055 while (q < e) {
5056 Py_UCS4 ch;
5057 /* remaining bytes at the end? (size should be divisible by 4) */
5058 if (e-q<4) {
5059 if (consumed)
5060 break;
5061 errmsg = "truncated data";
5062 startinpos = ((const char *)q)-starts;
5063 endinpos = ((const char *)e)-starts;
5064 goto utf32Error;
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
5067 }
5068 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5069 (q[iorder[1]] << 8) | q[iorder[0]];
5070
5071 if (ch >= 0x110000)
5072 {
5073 errmsg = "codepoint not in range(0x110000)"; 5062 errmsg = "codepoint not in range(0x110000)";
5074 startinpos = ((const char *)q)-starts; 5063 startinpos = ((const char *)q) - starts;
5075 endinpos = startinpos+4; 5064 endinpos = startinpos + 4;
5076 goto utf32Error; 5065 }
5077 } 5066
5078 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5067 /* The remaining input chars are ignored if the callback
5068 chooses to skip the input */
5069 if (unicode_decode_call_errorhandler_writer(
5070 errors, &errorHandler,
5071 encoding, errmsg,
5072 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5073 &writer))
5079 goto onError; 5074 goto onError;
5080 q += 4; 5075 }
5081 continue;
5082 utf32Error:
5083 if (unicode_decode_call_errorhandler(
5084 errors, &errorHandler,
5085 "utf32", errmsg,
5086 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5087 &unicode, &outpos))
5088 goto onError;
5089 }
5090
5091 if (byteorder)
5092 *byteorder = bo;
5093 5076
5094 if (consumed) 5077 if (consumed)
5095 *consumed = (const char *)q-starts; 5078 *consumed = (const char *)q-starts;
5096 5079
5097 /* Adjust length */
5098 if (unicode_resize(&unicode, outpos) < 0)
5099 goto onError;
5100
5101 Py_XDECREF(errorHandler); 5080 Py_XDECREF(errorHandler);
5102 Py_XDECREF(exc); 5081 Py_XDECREF(exc);
5103 return unicode_result(unicode); 5082 return _PyUnicodeWriter_Finish(&writer);
5104 5083
5105 onError: 5084 onError:
5106 Py_DECREF(unicode); 5085 _PyUnicodeWriter_Dealloc(&writer);
5107 Py_XDECREF(errorHandler); 5086 Py_XDECREF(errorHandler);
5108 Py_XDECREF(exc); 5087 Py_XDECREF(exc);
5109 return NULL; 5088 return NULL;
5110 } 5089 }
5111 5090
5112 PyObject * 5091 PyObject *
5113 _PyUnicode_EncodeUTF32(PyObject *str, 5092 _PyUnicode_EncodeUTF32(PyObject *str,
5114 const char *errors, 5093 const char *errors,
5115 int byteorder) 5094 int byteorder)
5116 { 5095 {
5117 int kind; 5096 int kind;
5118 void *data; 5097 void *data;
5119 Py_ssize_t len; 5098 Py_ssize_t len;
5120 PyObject *v; 5099 PyObject *v;
5121 unsigned char *p; 5100 unsigned char *p;
5122 Py_ssize_t nsize, i; 5101 Py_ssize_t nsize, i;
5123 /* Offsets from p for storing byte pairs in the right order. */ 5102 /* Offsets from p for storing byte pairs in the right order. */
5124 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 5103 #if PY_LITTLE_ENDIAN
5125 int iorder[] = {0, 1, 2, 3}; 5104 int iorder[] = {0, 1, 2, 3};
5126 #else 5105 #else
5127 int iorder[] = {3, 2, 1, 0}; 5106 int iorder[] = {3, 2, 1, 0};
5128 #endif 5107 #endif
5108 const char *encoding;
5109 PyObject *errorHandler = NULL;
5110 PyObject *exc = NULL;
5111 PyObject *rep = NULL;
5129 5112
5130 #define STORECHAR(CH) \ 5113 #define STORECHAR(CH) \
5131 do { \ 5114 do { \
5132 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5115 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5133 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5116 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5134 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5117 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5135 p[iorder[0]] = (CH) & 0xff; \ 5118 p[iorder[0]] = (CH) & 0xff; \
5136 p += 4; \ 5119 p += 4; \
5137 } while(0) 5120 } while(0)
5138 5121
(...skipping 11 matching lines...) Expand all
5150 if (nsize > PY_SSIZE_T_MAX / 4) 5133 if (nsize > PY_SSIZE_T_MAX / 4)
5151 return PyErr_NoMemory(); 5134 return PyErr_NoMemory();
5152 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5135 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5153 if (v == NULL) 5136 if (v == NULL)
5154 return NULL; 5137 return NULL;
5155 5138
5156 p = (unsigned char *)PyBytes_AS_STRING(v); 5139 p = (unsigned char *)PyBytes_AS_STRING(v);
5157 if (byteorder == 0) 5140 if (byteorder == 0)
5158 STORECHAR(0xFEFF); 5141 STORECHAR(0xFEFF);
5159 if (len == 0) 5142 if (len == 0)
5160 goto done; 5143 return v;
5161 5144
5162 if (byteorder == -1) { 5145 if (byteorder == -1) {
5163 /* force LE */ 5146 /* force LE */
5164 iorder[0] = 0; 5147 iorder[0] = 0;
5165 iorder[1] = 1; 5148 iorder[1] = 1;
5166 iorder[2] = 2; 5149 iorder[2] = 2;
5167 iorder[3] = 3; 5150 iorder[3] = 3;
5151 encoding = "utf-32-le";
5168 } 5152 }
5169 else if (byteorder == 1) { 5153 else if (byteorder == 1) {
5170 /* force BE */ 5154 /* force BE */
5171 iorder[0] = 3; 5155 iorder[0] = 3;
5172 iorder[1] = 2; 5156 iorder[1] = 2;
5173 iorder[2] = 1; 5157 iorder[2] = 1;
5174 iorder[3] = 0; 5158 iorder[3] = 0;
5175 } 5159 encoding = "utf-32-be";
5176 5160 }
5177 for (i = 0; i < len; i++) 5161 else
5178 STORECHAR(PyUnicode_READ(kind, data, i)); 5162 encoding = "utf-32";
5179 5163
5180 done: 5164 if (kind == PyUnicode_1BYTE_KIND) {
5165 for (i = 0; i < len; i++)
5166 STORECHAR(PyUnicode_READ(kind, data, i));
5167 return v;
5168 }
5169
5170 for (i = 0; i < len;) {
5171 Py_ssize_t repsize, moreunits;
5172 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5173 i++;
5174 assert(ch <= MAX_UNICODE);
5175 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5176 STORECHAR(ch);
5177 continue;
5178 }
5179
5180 rep = unicode_encode_call_errorhandler(
5181 errors, &errorHandler,
5182 encoding, "surrogates not allowed",
5183 str, &exc, i-1, i, &i);
5184
5185 if (!rep)
5186 goto error;
5187
5188 if (PyBytes_Check(rep)) {
5189 repsize = PyBytes_GET_SIZE(rep);
5190 if (repsize & 3) {
5191 raise_encode_exception(&exc, encoding,
5192 str, i - 1, i,
5193 "surrogates not allowed");
5194 goto error;
5195 }
5196 moreunits = repsize / 4;
5197 }
5198 else {
5199 assert(PyUnicode_Check(rep));
5200 if (PyUnicode_READY(rep) < 0)
5201 goto error;
5202 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5203 if (!PyUnicode_IS_ASCII(rep)) {
5204 raise_encode_exception(&exc, encoding,
5205 str, i - 1, i,
5206 "surrogates not allowed");
5207 goto error;
5208 }
5209 }
5210
5211 /* four bytes are reserved for each surrogate */
5212 if (moreunits > 1) {
5213 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5214 Py_ssize_t morebytes = 4 * (moreunits - 1);
5215 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5216 /* integer overflow */
5217 PyErr_NoMemory();
5218 goto error;
5219 }
5220 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5221 goto error;
5222 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5223 }
5224
5225 if (PyBytes_Check(rep)) {
5226 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5227 p += repsize;
5228 } else /* rep is unicode */ {
5229 const Py_UCS1 *repdata;
5230 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5231 repdata = PyUnicode_1BYTE_DATA(rep);
5232 while (repsize--) {
5233 Py_UCS4 ch = *repdata++;
5234 STORECHAR(ch);
5235 }
5236 }
5237
5238 Py_CLEAR(rep);
5239 }
5240
5241 /* Cut back to size actually needed. This is necessary for, for example,
5242 encoding of a string containing isolated surrogates and the 'ignore'
5243 handler is used. */
5244 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5245 if (nsize != PyBytes_GET_SIZE(v))
5246 _PyBytes_Resize(&v, nsize);
5247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
5181 return v; 5249 return v;
5250 error:
5251 Py_XDECREF(rep);
5252 Py_XDECREF(errorHandler);
5253 Py_XDECREF(exc);
5254 Py_XDECREF(v);
5255 return NULL;
5182 #undef STORECHAR 5256 #undef STORECHAR
5183 } 5257 }
5184 5258
5185 PyObject * 5259 PyObject *
5186 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5260 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5187 Py_ssize_t size, 5261 Py_ssize_t size,
5188 const char *errors, 5262 const char *errors,
5189 int byteorder) 5263 int byteorder)
5190 { 5264 {
5191 PyObject *result; 5265 PyObject *result;
(...skipping 25 matching lines...) Expand all
5217 PyObject * 5291 PyObject *
5218 PyUnicode_DecodeUTF16Stateful(const char *s, 5292 PyUnicode_DecodeUTF16Stateful(const char *s,
5219 Py_ssize_t size, 5293 Py_ssize_t size,
5220 const char *errors, 5294 const char *errors,
5221 int *byteorder, 5295 int *byteorder,
5222 Py_ssize_t *consumed) 5296 Py_ssize_t *consumed)
5223 { 5297 {
5224 const char *starts = s; 5298 const char *starts = s;
5225 Py_ssize_t startinpos; 5299 Py_ssize_t startinpos;
5226 Py_ssize_t endinpos; 5300 Py_ssize_t endinpos;
5227 Py_ssize_t outpos; 5301 _PyUnicodeWriter writer;
5228 PyObject *unicode;
5229 const unsigned char *q, *e; 5302 const unsigned char *q, *e;
5230 int bo = 0; /* assume native ordering by default */ 5303 int bo = 0; /* assume native ordering by default */
5231 int native_ordering; 5304 int native_ordering;
5232 const char *errmsg = ""; 5305 const char *errmsg = "";
5233 PyObject *errorHandler = NULL; 5306 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL; 5307 PyObject *exc = NULL;
5308 const char *encoding;
5235 5309
5236 q = (unsigned char *)s; 5310 q = (unsigned char *)s;
5237 e = q + size; 5311 e = q + size;
5238 5312
5239 if (byteorder) 5313 if (byteorder)
5240 bo = *byteorder; 5314 bo = *byteorder;
5241 5315
5242 /* Check for BOM marks (U+FEFF) in the input and adjust current 5316 /* Check for BOM marks (U+FEFF) in the input and adjust current
5243 byte order setting accordingly. In native mode, the leading BOM 5317 byte order setting accordingly. In native mode, the leading BOM
5244 mark is skipped, in all other modes, it is copied to the output 5318 mark is skipped, in all other modes, it is copied to the output
(...skipping 11 matching lines...) Expand all
5256 if (byteorder) 5330 if (byteorder)
5257 *byteorder = bo; 5331 *byteorder = bo;
5258 } 5332 }
5259 5333
5260 if (q == e) { 5334 if (q == e) {
5261 if (consumed) 5335 if (consumed)
5262 *consumed = size; 5336 *consumed = size;
5263 _Py_RETURN_UNICODE_EMPTY(); 5337 _Py_RETURN_UNICODE_EMPTY();
5264 } 5338 }
5265 5339
5266 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 5340 #if PY_LITTLE_ENDIAN
5267 native_ordering = bo <= 0; 5341 native_ordering = bo <= 0;
5342 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5268 #else 5343 #else
5269 native_ordering = bo >= 0; 5344 native_ordering = bo >= 0;
5345 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5270 #endif 5346 #endif
5271 5347
5272 /* Note: size will always be longer than the resulting Unicode 5348 /* Note: size will always be longer than the resulting Unicode
5273 character count */ 5349 character count */
5274 unicode = PyUnicode_New((e - q + 1) / 2, 127); 5350 _PyUnicodeWriter_Init(&writer);
5275 if (!unicode) 5351 writer.min_length = (e - q + 1) / 2;
5276 return NULL; 5352 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5277 5353 goto onError;
5278 outpos = 0; 5354
5279 while (1) { 5355 while (1) {
5280 Py_UCS4 ch = 0; 5356 Py_UCS4 ch = 0;
5281 if (e - q >= 2) { 5357 if (e - q >= 2) {
5282 int kind = PyUnicode_KIND(unicode); 5358 int kind = writer.kind;
5283 if (kind == PyUnicode_1BYTE_KIND) { 5359 if (kind == PyUnicode_1BYTE_KIND) {
5284 if (PyUnicode_IS_ASCII(unicode)) 5360 if (PyUnicode_IS_ASCII(writer.buffer))
5285 ch = asciilib_utf16_decode(&q, e, 5361 ch = asciilib_utf16_decode(&q, e,
5286 PyUnicode_1BYTE_DATA(unicode), &outpos, 5362 (Py_UCS1*)writer.data, &writer.pos,
5287 native_ordering); 5363 native_ordering);
5288 else 5364 else
5289 ch = ucs1lib_utf16_decode(&q, e, 5365 ch = ucs1lib_utf16_decode(&q, e,
5290 PyUnicode_1BYTE_DATA(unicode), &outpos, 5366 (Py_UCS1*)writer.data, &writer.pos,
5291 native_ordering); 5367 native_ordering);
5292 } else if (kind == PyUnicode_2BYTE_KIND) { 5368 } else if (kind == PyUnicode_2BYTE_KIND) {
5293 ch = ucs2lib_utf16_decode(&q, e, 5369 ch = ucs2lib_utf16_decode(&q, e,
5294 PyUnicode_2BYTE_DATA(unicode), &outpos, 5370 (Py_UCS2*)writer.data, &writer.pos,
5295 native_ordering); 5371 native_ordering);
5296 } else { 5372 } else {
5297 assert(kind == PyUnicode_4BYTE_KIND); 5373 assert(kind == PyUnicode_4BYTE_KIND);
5298 ch = ucs4lib_utf16_decode(&q, e, 5374 ch = ucs4lib_utf16_decode(&q, e,
5299 PyUnicode_4BYTE_DATA(unicode), &outpos, 5375 (Py_UCS4*)writer.data, &writer.pos,
5300 native_ordering); 5376 native_ordering);
5301 } 5377 }
5302 } 5378 }
5303 5379
5304 switch (ch) 5380 switch (ch)
5305 { 5381 {
5306 case 0: 5382 case 0:
5307 /* remaining byte at the end? (size should be even) */ 5383 /* remaining byte at the end? (size should be even) */
5308 if (q == e || consumed) 5384 if (q == e || consumed)
5309 goto End; 5385 goto End;
(...skipping 15 matching lines...) Expand all
5325 errmsg = "illegal encoding"; 5401 errmsg = "illegal encoding";
5326 startinpos = ((const char *)q) - 2 - starts; 5402 startinpos = ((const char *)q) - 2 - starts;
5327 endinpos = startinpos + 2; 5403 endinpos = startinpos + 2;
5328 break; 5404 break;
5329 case 3: 5405 case 3:
5330 errmsg = "illegal UTF-16 surrogate"; 5406 errmsg = "illegal UTF-16 surrogate";
5331 startinpos = ((const char *)q) - 4 - starts; 5407 startinpos = ((const char *)q) - 4 - starts;
5332 endinpos = startinpos + 2; 5408 endinpos = startinpos + 2;
5333 break; 5409 break;
5334 default: 5410 default:
5335 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5411 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5336 goto onError; 5412 goto onError;
5337 continue; 5413 continue;
5338 } 5414 }
5339 5415
5340 if (unicode_decode_call_errorhandler( 5416 if (unicode_decode_call_errorhandler_writer(
5341 errors, 5417 errors,
5342 &errorHandler, 5418 &errorHandler,
5343 "utf16", errmsg, 5419 encoding, errmsg,
5344 &starts, 5420 &starts,
5345 (const char **)&e, 5421 (const char **)&e,
5346 &startinpos, 5422 &startinpos,
5347 &endinpos, 5423 &endinpos,
5348 &exc, 5424 &exc,
5349 (const char **)&q, 5425 (const char **)&q,
5350 &unicode, 5426 &writer))
5351 &outpos))
5352 goto onError; 5427 goto onError;
5353 } 5428 }
5354 5429
5355 End: 5430 End:
5356 if (consumed) 5431 if (consumed)
5357 *consumed = (const char *)q-starts; 5432 *consumed = (const char *)q-starts;
5358 5433
5359 /* Adjust length */
5360 if (unicode_resize(&unicode, outpos) < 0)
5361 goto onError;
5362
5363 Py_XDECREF(errorHandler); 5434 Py_XDECREF(errorHandler);
5364 Py_XDECREF(exc); 5435 Py_XDECREF(exc);
5365 return unicode_result(unicode); 5436 return _PyUnicodeWriter_Finish(&writer);
5366 5437
5367 onError: 5438 onError:
5368 Py_DECREF(unicode); 5439 _PyUnicodeWriter_Dealloc(&writer);
5369 Py_XDECREF(errorHandler); 5440 Py_XDECREF(errorHandler);
5370 Py_XDECREF(exc); 5441 Py_XDECREF(exc);
5371 return NULL; 5442 return NULL;
5372 } 5443 }
5373 5444
5374 PyObject * 5445 PyObject *
5375 _PyUnicode_EncodeUTF16(PyObject *str, 5446 _PyUnicode_EncodeUTF16(PyObject *str,
5376 const char *errors, 5447 const char *errors,
5377 int byteorder) 5448 int byteorder)
5378 { 5449 {
5379 enum PyUnicode_Kind kind; 5450 enum PyUnicode_Kind kind;
5380 const void *data; 5451 const void *data;
5381 Py_ssize_t len; 5452 Py_ssize_t len;
5382 PyObject *v; 5453 PyObject *v;
5383 unsigned short *out; 5454 unsigned short *out;
5384 Py_ssize_t bytesize;
5385 Py_ssize_t pairs; 5455 Py_ssize_t pairs;
5386 #ifdef WORDS_BIGENDIAN 5456 #if PY_BIG_ENDIAN
5387 int native_ordering = byteorder >= 0; 5457 int native_ordering = byteorder >= 0;
5388 #else 5458 #else
5389 int native_ordering = byteorder <= 0; 5459 int native_ordering = byteorder <= 0;
5390 #endif 5460 #endif
5461 const char *encoding;
5462 Py_ssize_t nsize, pos;
5463 PyObject *errorHandler = NULL;
5464 PyObject *exc = NULL;
5465 PyObject *rep = NULL;
5391 5466
5392 if (!PyUnicode_Check(str)) { 5467 if (!PyUnicode_Check(str)) {
5393 PyErr_BadArgument(); 5468 PyErr_BadArgument();
5394 return NULL; 5469 return NULL;
5395 } 5470 }
5396 if (PyUnicode_READY(str) == -1) 5471 if (PyUnicode_READY(str) == -1)
5397 return NULL; 5472 return NULL;
5398 kind = PyUnicode_KIND(str); 5473 kind = PyUnicode_KIND(str);
5399 data = PyUnicode_DATA(str); 5474 data = PyUnicode_DATA(str);
5400 len = PyUnicode_GET_LENGTH(str); 5475 len = PyUnicode_GET_LENGTH(str);
5401 5476
5402 pairs = 0; 5477 pairs = 0;
5403 if (kind == PyUnicode_4BYTE_KIND) { 5478 if (kind == PyUnicode_4BYTE_KIND) {
5404 const Py_UCS4 *in = (const Py_UCS4 *)data; 5479 const Py_UCS4 *in = (const Py_UCS4 *)data;
5405 const Py_UCS4 *end = in + len; 5480 const Py_UCS4 *end = in + len;
5406 while (in < end) 5481 while (in < end)
5407 if (*in++ >= 0x10000) 5482 if (*in++ >= 0x10000)
5408 pairs++; 5483 pairs++;
5409 } 5484 }
5410 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5485 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5411 return PyErr_NoMemory(); 5486 return PyErr_NoMemory();
5412 bytesize = (len + pairs + (byteorder == 0)) * 2; 5487 nsize = len + pairs + (byteorder == 0);
5413 v = PyBytes_FromStringAndSize(NULL, bytesize); 5488 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5414 if (v == NULL) 5489 if (v == NULL)
5415 return NULL; 5490 return NULL;
5416 5491
5417 /* output buffer is 2-bytes aligned */ 5492 /* output buffer is 2-bytes aligned */
5418 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5493 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5419 out = (unsigned short *)PyBytes_AS_STRING(v); 5494 out = (unsigned short *)PyBytes_AS_STRING(v);
5420 if (byteorder == 0) 5495 if (byteorder == 0)
5421 *out++ = 0xFEFF; 5496 *out++ = 0xFEFF;
5422 if (len == 0) 5497 if (len == 0)
5423 goto done; 5498 goto done;
5424 5499
5425 switch (kind) { 5500 if (kind == PyUnicode_1BYTE_KIND) {
5426 case PyUnicode_1BYTE_KIND: { 5501 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5427 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5502 goto done;
5428 break; 5503 }
5429 } 5504
5430 case PyUnicode_2BYTE_KIND: { 5505 if (byteorder < 0)
5431 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5506 encoding = "utf-16-le";
5432 break; 5507 else if (byteorder > 0)
5433 } 5508 encoding = "utf-16-be";
5434 case PyUnicode_4BYTE_KIND: { 5509 else
5435 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5510 encoding = "utf-16";
5436 break; 5511
5437 } 5512 pos = 0;
5438 default: 5513 while (pos < len) {
5439 assert(0); 5514 Py_ssize_t repsize, moreunits;
5440 } 5515
5441 5516 if (kind == PyUnicode_2BYTE_KIND) {
5517 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5518 &out, native_ordering);
5519 }
5520 else {
5521 assert(kind == PyUnicode_4BYTE_KIND);
5522 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5523 &out, native_ordering);
5524 }
5525 if (pos == len)
5526 break;
5527
5528 rep = unicode_encode_call_errorhandler(
5529 errors, &errorHandler,
5530 encoding, "surrogates not allowed",
5531 str, &exc, pos, pos + 1, &pos);
5532 if (!rep)
5533 goto error;
5534
5535 if (PyBytes_Check(rep)) {
5536 repsize = PyBytes_GET_SIZE(rep);
5537 if (repsize & 1) {
5538 raise_encode_exception(&exc, encoding,
5539 str, pos - 1, pos,
5540 "surrogates not allowed");
5541 goto error;
5542 }
5543 moreunits = repsize / 2;
5544 }
5545 else {
5546 assert(PyUnicode_Check(rep));
5547 if (PyUnicode_READY(rep) < 0)
5548 goto error;
5549 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5550 if (!PyUnicode_IS_ASCII(rep)) {
5551 raise_encode_exception(&exc, encoding,
5552 str, pos - 1, pos,
5553 "surrogates not allowed");
5554 goto error;
5555 }
5556 }
5557
5558 /* two bytes are reserved for each surrogate */
5559 if (moreunits > 1) {
5560 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5561 Py_ssize_t morebytes = 2 * (moreunits - 1);
5562 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5563 /* integer overflow */
5564 PyErr_NoMemory();
5565 goto error;
5566 }
5567 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5568 goto error;
5569 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5570 }
5571
5572 if (PyBytes_Check(rep)) {
5573 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5574 out += moreunits;
5575 } else /* rep is unicode */ {
5576 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5577 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5578 &out, native_ordering);
5579 }
5580
5581 Py_CLEAR(rep);
5582 }
5583
5584 /* Cut back to size actually needed. This is necessary for, for example,
5585 encoding of a string containing isolated surrogates and the 'ignore' handler
5586 is used. */
5587 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5588 if (nsize != PyBytes_GET_SIZE(v))
5589 _PyBytes_Resize(&v, nsize);
5590 Py_XDECREF(errorHandler);
5591 Py_XDECREF(exc);
5442 done: 5592 done:
5443 return v; 5593 return v;
5594 error:
5595 Py_XDECREF(rep);
5596 Py_XDECREF(errorHandler);
5597 Py_XDECREF(exc);
5598 Py_XDECREF(v);
5599 return NULL;
5600 #undef STORECHAR
5444 } 5601 }
5445 5602
5446 PyObject * 5603 PyObject *
5447 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5604 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5448 Py_ssize_t size, 5605 Py_ssize_t size,
5449 const char *errors, 5606 const char *errors,
5450 int byteorder) 5607 int byteorder)
5451 { 5608 {
5452 PyObject *result; 5609 PyObject *result;
5453 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5610 PyObject *tmp = PyUnicode_FromUnicode(s, size);
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after
5524 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5681 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5525 5682
5526 PyObject * 5683 PyObject *
5527 PyUnicode_DecodeUnicodeEscape(const char *s, 5684 PyUnicode_DecodeUnicodeEscape(const char *s,
5528 Py_ssize_t size, 5685 Py_ssize_t size,
5529 const char *errors) 5686 const char *errors)
5530 { 5687 {
5531 const char *starts = s; 5688 const char *starts = s;
5532 Py_ssize_t startinpos; 5689 Py_ssize_t startinpos;
5533 Py_ssize_t endinpos; 5690 Py_ssize_t endinpos;
5534 PyObject *v; 5691 _PyUnicodeWriter writer;
5535 const char *end; 5692 const char *end;
5536 char* message; 5693 char* message;
5537 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5694 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5538 PyObject *errorHandler = NULL; 5695 PyObject *errorHandler = NULL;
5539 PyObject *exc = NULL; 5696 PyObject *exc = NULL;
5540 Py_ssize_t len; 5697 Py_ssize_t len;
5541 Py_ssize_t i;
5542 5698
5543 len = length_of_escaped_ascii_string(s, size); 5699 len = length_of_escaped_ascii_string(s, size);
5700 if (len == 0)
5701 _Py_RETURN_UNICODE_EMPTY();
5544 5702
5545 /* After length_of_escaped_ascii_string() there are two alternatives, 5703 /* After length_of_escaped_ascii_string() there are two alternatives,
5546 either the string is pure ASCII with named escapes like \n, etc. 5704 either the string is pure ASCII with named escapes like \n, etc.
5547 and we determined it's exact size (common case) 5705 and we determined it's exact size (common case)
5548 or it contains \x, \u, ... escape sequences. then we create a 5706 or it contains \x, \u, ... escape sequences. then we create a
5549 legacy wchar string and resize it at the end of this function. */ 5707 legacy wchar string and resize it at the end of this function. */
5550 if (len >= 0) { 5708 _PyUnicodeWriter_Init(&writer);
5551 v = PyUnicode_New(len, 127); 5709 if (len > 0) {
5552 if (!v) 5710 writer.min_length = len;
5553 goto onError;
5554 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5555 } 5711 }
5556 else { 5712 else {
5557 /* Escaped strings will always be longer than the resulting 5713 /* Escaped strings will always be longer than the resulting
5558 Unicode string, so we start with size here and then reduce the 5714 Unicode string, so we start with size here and then reduce the
5559 length after conversion to the true value. 5715 length after conversion to the true value.
5560 (but if the error callback returns a long replacement string 5716 (but if the error callback returns a long replacement string
5561 we'll have to allocate more space) */ 5717 we'll have to allocate more space) */
5562 v = PyUnicode_New(size, 127); 5718 writer.min_length = size;
5563 if (!v)
5564 goto onError;
5565 len = size;
5566 } 5719 }
5567 5720
5568 if (size == 0) 5721 if (size == 0)
5569 return v; 5722 return _PyUnicodeWriter_Finish(&writer);
5570 i = 0;
5571 end = s + size; 5723 end = s + size;
5572 5724
5573 while (s < end) { 5725 while (s < end) {
5574 unsigned char c; 5726 unsigned char c;
5575 Py_UCS4 x; 5727 Py_UCS4 x;
5576 int digits; 5728 int digits;
5577 5729
5578 /* The only case in which i == ascii_length is a backslash
5579 followed by a newline. */
5580 assert(i <= len);
5581
5582 /* Non-escape characters are interpreted as Unicode ordinals */ 5730 /* Non-escape characters are interpreted as Unicode ordinals */
5583 if (*s != '\\') { 5731 if (*s != '\\') {
5584 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5732 x = (unsigned char)*s;
5733 s++;
5734 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5585 goto onError; 5735 goto onError;
5586 continue; 5736 continue;
5587 } 5737 }
5588 5738
5589 startinpos = s-starts; 5739 startinpos = s-starts;
5590 /* \ - Escapes */ 5740 /* \ - Escapes */
5591 s++; 5741 s++;
5592 c = *s++; 5742 c = *s++;
5593 if (s > end) 5743 if (s > end)
5594 c = '\0'; /* Invalid after \ */ 5744 c = '\0'; /* Invalid after \ */
5595 5745
5596 /* The only case in which i == ascii_length is a backslash
5597 followed by a newline. */
5598 assert(i < len || (i == len && c == '\n'));
5599
5600 switch (c) { 5746 switch (c) {
5601 5747
5602 /* \x escapes */ 5748 /* \x escapes */
5603 #define WRITECHAR(ch) \ 5749 #define WRITECHAR(ch) \
5604 do { \ 5750 do { \
5605 if (unicode_putchar(&v, &i, ch) < 0) \ 5751 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
5606 goto onError; \ 5752 goto onError; \
5607 }while(0) 5753 } while(0)
5608 5754
5609 case '\n': break; 5755 case '\n': break;
5610 case '\\': WRITECHAR('\\'); break; 5756 case '\\': WRITECHAR('\\'); break;
5611 case '\'': WRITECHAR('\''); break; 5757 case '\'': WRITECHAR('\''); break;
5612 case '\"': WRITECHAR('\"'); break; 5758 case '\"': WRITECHAR('\"'); break;
5613 case 'b': WRITECHAR('\b'); break; 5759 case 'b': WRITECHAR('\b'); break;
5614 /* FF */ 5760 /* FF */
5615 case 'f': WRITECHAR('\014'); break; 5761 case 'f': WRITECHAR('\014'); break;
5616 case 't': WRITECHAR('\t'); break; 5762 case 't': WRITECHAR('\t'); break;
5617 case 'n': WRITECHAR('\n'); break; 5763 case 'n': WRITECHAR('\n'); break;
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after
5721 else { 5867 else {
5722 WRITECHAR('\\'); 5868 WRITECHAR('\\');
5723 WRITECHAR((unsigned char)s[-1]); 5869 WRITECHAR((unsigned char)s[-1]);
5724 } 5870 }
5725 break; 5871 break;
5726 } 5872 }
5727 continue; 5873 continue;
5728 5874
5729 error: 5875 error:
5730 endinpos = s-starts; 5876 endinpos = s-starts;
5731 if (unicode_decode_call_errorhandler( 5877 if (unicode_decode_call_errorhandler_writer(
5732 errors, &errorHandler, 5878 errors, &errorHandler,