Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(3023)

Delta Between Two Patch Sets: Objects/unicodeobject.c

Issue 11828: startswith and endswith don't accept None as slice index
Left Patch Set: Created 2 years, 1 month ago
Right Patch Set: Created 2 years, 1 month ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Objects/stringlib/find.h ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt). 5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 6
7 Major speed upgrades to the method implementations at the Reykjavik 7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 9
10 Copyright (c) Corporation for National Research Initiatives. 10 Copyright (c) Corporation for National Research Initiatives.
(...skipping 23 matching lines...) Expand all
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 -------------------------------------------------------------------- 38 --------------------------------------------------------------------
39 39
40 */ 40 */
41 41
42 #define PY_SSIZE_T_CLEAN 42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h" 43 #include "Python.h"
44 #include "bytes_methods.h"
44 45
45 #include "unicodeobject.h" 46 #include "unicodeobject.h"
46 #include "ucnhash.h" 47 #include "ucnhash.h"
47 48
48 #ifdef MS_WINDOWS 49 #ifdef MS_WINDOWS
49 #include <windows.h> 50 #include <windows.h>
50 #endif 51 #endif
51 52
52 /* Limit for the Unicode object free list */ 53 /* Limit for the Unicode object free list */
53 54
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
85 The globals are initialized by the _PyUnicode_Init() API and should 86 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API. 87 not be used before calling that API.
87 88
88 */ 89 */
89 90
90 91
91 #ifdef __cplusplus 92 #ifdef __cplusplus
92 extern "C" { 93 extern "C" {
93 #endif 94 #endif
94 95
96 /* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is to say that the actual reference
102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
103 */
104 static PyObject *interned;
105
95 /* Free list for Unicode objects */ 106 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list; 107 static PyUnicodeObject *free_list;
97 static int numfree; 108 static int numfree;
98 109
99 /* The empty Unicode object is shared to improve performance. */ 110 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty; 111 static PyUnicodeObject *unicode_empty;
101 112
102 /* Single character Unicode strings in the Latin-1 range are being 113 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */ 114 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256]; 115 static PyUnicodeObject *unicode_latin1[256];
105 116
106 /* Default encoding to use and assume when NULL is passed as encoding 117 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init(). 118 parameter; it is fixed to "utf-8". Always use the
108 119 PyUnicode_GetDefaultEncoding() API to access this global.
109 Always use the PyUnicode_SetDefaultEncoding() and 120
110 PyUnicode_GetDefaultEncoding() APIs to access this global. 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
111 122 hard coded default!
112 */ 123 */
113 static char unicode_default_encoding[100]; 124 static const char unicode_default_encoding[] = "utf-8";
114 125
115 /* Fast detection of the most frequent whitespace characters */ 126 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace[] = { 127 const unsigned char _Py_ascii_whitespace[] = {
117 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * CHARACTER TABULATION */ 129 /* case 0x0009: * HORIZONTAL TABULATION */
119 /* case 0x000A: * LINE FEED */ 130 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * LINE TABULATION */ 131 /* case 0x000B: * VERTICAL TABULATION */
121 /* case 0x000C: * FORM FEED */ 132 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */ 133 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0, 134 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */ 136 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */ 137 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */ 138 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */ 139 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1, 140 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */ 141 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0, 142 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0,
135 146
136 0, 0, 0, 0, 0, 0, 0, 0, 147 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0 154 0, 0, 0, 0, 0, 0, 0, 0
144 }; 155 };
145 156
157 static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
162 static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
146 /* Same for linebreaks */ 168 /* Same for linebreaks */
147 static unsigned char ascii_linebreak[] = { 169 static unsigned char ascii_linebreak[] = {
148 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */ 171 /* 0x000A, * LINE FEED */
150 /* 0x000B, * LINE TABULATION */
151 /* 0x000C, * FORM FEED */
152 /* 0x000D, * CARRIAGE RETURN */ 172 /* 0x000D, * CARRIAGE RETURN */
153 0, 0, 1, 1, 1, 1, 0, 0, 173 0, 0, 1, 0, 0, 1, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0,
155 /* 0x001C, * FILE SEPARATOR */ 175 /* 0x001C, * FILE SEPARATOR */
156 /* 0x001D, * GROUP SEPARATOR */ 176 /* 0x001D, * GROUP SEPARATOR */
157 /* 0x001E, * RECORD SEPARATOR */ 177 /* 0x001E, * RECORD SEPARATOR */
158 0, 0, 0, 0, 1, 1, 1, 0, 178 0, 0, 0, 0, 1, 1, 1, 0,
159 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0,
163 183
(...skipping 21 matching lines...) Expand all
185 } 205 }
186 206
187 /* --- Bloom Filters ----------------------------------------------------- */ 207 /* --- Bloom Filters ----------------------------------------------------- */
188 208
189 /* stuff to implement simple "bloom filters" for Unicode characters. 209 /* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5 210 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode character as the bit index. */ 211 bits from each unicode character as the bit index. */
192 212
193 /* the linebreak mask is set up by Unicode_Init below */ 213 /* the linebreak mask is set up by Unicode_Init below */
194 214
195 #if LONG_BIT >= 128
196 #define BLOOM_WIDTH 128
197 #elif LONG_BIT >= 64
198 #define BLOOM_WIDTH 64
199 #elif LONG_BIT >= 32
200 #define BLOOM_WIDTH 32
201 #else
202 #error "LONG_BIT is smaller than 32"
203 #endif
204
205 #define BLOOM_MASK unsigned long 215 #define BLOOM_MASK unsigned long
206 216
207 static BLOOM_MASK bloom_linebreak; 217 static BLOOM_MASK bloom_linebreak;
208 218
209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 219 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
210 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
211 220
212 #define BLOOM_LINEBREAK(ch) \ 221 #define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 222 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 223 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215 224
216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 225 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217 { 226 {
218 /* calculate simple bloom-style bitmask for a given unicode string */ 227 /* calculate simple bloom-style bitmask for a given unicode string */
219 228
220 BLOOM_MASK mask; 229 long mask;
221 Py_ssize_t i; 230 Py_ssize_t i;
222 231
223 mask = 0; 232 mask = 0;
224 for (i = 0; i < len; i++) 233 for (i = 0; i < len; i++)
225 BLOOM_ADD(mask, ptr[i]); 234 mask |= (1 << (ptr[i] & 0x1F));
226 235
227 return mask; 236 return mask;
228 } 237 }
229 238
230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 239 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231 { 240 {
232 Py_ssize_t i; 241 Py_ssize_t i;
233 242
234 for (i = 0; i < setlen; i++) 243 for (i = 0; i < setlen; i++)
235 if (set[i] == chr) 244 if (set[i] == chr)
(...skipping 19 matching lines...) Expand all
255 264
256 /* Resizing shared object (unicode_empty or single character 265 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize() 266 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */ 267 instead ! */
259 268
260 if (unicode == unicode_empty || 269 if (unicode == unicode_empty ||
261 (unicode->length == 1 && 270 (unicode->length == 1 &&
262 unicode->str[0] < 256U && 271 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) { 272 unicode_latin1[unicode->str[0]] == unicode)) {
264 PyErr_SetString(PyExc_SystemError, 273 PyErr_SetString(PyExc_SystemError,
265 "can't resize shared unicode objects"); 274 "can't resize shared str objects");
266 return -1; 275 return -1;
267 } 276 }
268 277
269 /* We allocate one more byte to make sure the string is Ux0000 terminated. 278 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's 279 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what 280 safe to look at str[length] (without making any assumptions about what
272 it contains). */ 281 it contains). */
273 282
274 oldstr = unicode->str; 283 oldstr = unicode->str;
275 unicode->str = PyObject_REALLOC(unicode->str, 284 unicode->str = PyObject_REALLOC(unicode->str,
(...skipping 10 matching lines...) Expand all
286 /* Reset the object caches */ 295 /* Reset the object caches */
287 if (unicode->defenc) { 296 if (unicode->defenc) {
288 Py_CLEAR(unicode->defenc); 297 Py_CLEAR(unicode->defenc);
289 } 298 }
290 unicode->hash = -1; 299 unicode->hash = -1;
291 300
292 return 0; 301 return 0;
293 } 302 }
294 303
295 /* We allocate one more byte to make sure the string is 304 /* We allocate one more byte to make sure the string is
296 Ux0000 terminated; some code relies on that. 305 Ux0000 terminated; some code (e.g. new_identifier)
306 relies on that.
297 307
298 XXX This allocator could further be enhanced by assuring that the 308 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1. 309 free list never reduces its size below 1.
300 310
301 */ 311 */
302 312
303 static 313 static
304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 314 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305 { 315 {
306 register PyUnicodeObject *unicode; 316 register PyUnicodeObject *unicode;
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
353 * the caller fails before initializing str -- unicode_resize() 363 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory 364 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode). 365 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in 366 * We don't want unicode_resize to read uninitialized memory in
357 * that case. 367 * that case.
358 */ 368 */
359 unicode->str[0] = 0; 369 unicode->str[0] = 0;
360 unicode->str[length] = 0; 370 unicode->str[length] = 0;
361 unicode->length = length; 371 unicode->length = length;
362 unicode->hash = -1; 372 unicode->hash = -1;
373 unicode->state = 0;
363 unicode->defenc = NULL; 374 unicode->defenc = NULL;
364 return unicode; 375 return unicode;
365 376
366 onError: 377 onError:
367 /* XXX UNREF/NEWREF interface should be more symmetrical */ 378 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL; 379 _Py_DEC_REFTOTAL;
369 _Py_ForgetReference((PyObject *)unicode); 380 _Py_ForgetReference((PyObject *)unicode);
370 PyObject_Del(unicode); 381 PyObject_Del(unicode);
371 return NULL; 382 return NULL;
372 } 383 }
373 384
374 static 385 static
375 void unicode_dealloc(register PyUnicodeObject *unicode) 386 void unicode_dealloc(register PyUnicodeObject *unicode)
376 { 387 {
388 switch (PyUnicode_CHECK_INTERNED(unicode)) {
389 case SSTATE_NOT_INTERNED:
390 break;
391
392 case SSTATE_INTERNED_MORTAL:
393 /* revive dead object temporarily for DelItem */
394 Py_REFCNT(unicode) = 3;
395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
396 Py_FatalError(
397 "deletion of interned string failed");
398 break;
399
400 case SSTATE_INTERNED_IMMORTAL:
401 Py_FatalError("Immortal interned string died.");
402
403 default:
404 Py_FatalError("Inconsistent interned string state.");
405 }
406
377 if (PyUnicode_CheckExact(unicode) && 407 if (PyUnicode_CheckExact(unicode) &&
378 numfree < PyUnicode_MAXFREELIST) { 408 numfree < PyUnicode_MAXFREELIST) {
379 /* Keep-Alive optimization */ 409 /* Keep-Alive optimization */
380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 410 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str); 411 PyObject_DEL(unicode->str);
382 unicode->str = NULL; 412 unicode->str = NULL;
383 unicode->length = 0; 413 unicode->length = 0;
384 } 414 }
385 if (unicode->defenc) { 415 if (unicode->defenc) {
386 Py_CLEAR(unicode->defenc); 416 Py_CLEAR(unicode->defenc);
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
551 581
552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 582 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size) 583 Py_ssize_t size)
554 { 584 {
555 PyUnicodeObject *unicode; 585 PyUnicodeObject *unicode;
556 register Py_ssize_t i; 586 register Py_ssize_t i;
557 Py_ssize_t alloc; 587 Py_ssize_t alloc;
558 const wchar_t *orig_w; 588 const wchar_t *orig_w;
559 589
560 if (w == NULL) { 590 if (w == NULL) {
591 if (size == 0)
592 return PyUnicode_FromStringAndSize(NULL, 0);
561 PyErr_BadInternalCall(); 593 PyErr_BadInternalCall();
562 return NULL; 594 return NULL;
595 }
596
597 if (size == -1) {
598 size = wcslen(w);
563 } 599 }
564 600
565 alloc = size; 601 alloc = size;
566 orig_w = w; 602 orig_w = w;
567 for (i = size; i > 0; i--) { 603 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF) 604 if (*w > 0xFFFF)
569 alloc++; 605 alloc++;
570 w++; 606 w++;
571 } 607 }
572 w = orig_w; 608 w = orig_w;
(...skipping 20 matching lines...) Expand all
593 } 629 }
594 630
595 #else 631 #else
596 632
597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 633 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size) 634 Py_ssize_t size)
599 { 635 {
600 PyUnicodeObject *unicode; 636 PyUnicodeObject *unicode;
601 637
602 if (w == NULL) { 638 if (w == NULL) {
639 if (size == 0)
640 return PyUnicode_FromStringAndSize(NULL, 0);
603 PyErr_BadInternalCall(); 641 PyErr_BadInternalCall();
604 return NULL; 642 return NULL;
643 }
644
645 if (size == -1) {
646 size = wcslen(w);
605 } 647 }
606 648
607 unicode = _PyUnicode_New(size); 649 unicode = _PyUnicode_New(size);
608 if (!unicode) 650 if (!unicode)
609 return NULL; 651 return NULL;
610 652
611 /* Copy the wchar_t data into the new object */ 653 /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T 654 #ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t)); 655 memcpy(unicode->str, w, size * sizeof(wchar_t));
614 #else 656 #else
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
678 720
679 #ifdef VA_LIST_IS_ARRAY 721 #ifdef VA_LIST_IS_ARRAY
680 Py_MEMCPY(count, vargs, sizeof(va_list)); 722 Py_MEMCPY(count, vargs, sizeof(va_list));
681 #else 723 #else
682 #ifdef __va_copy 724 #ifdef __va_copy
683 __va_copy(count, vargs); 725 __va_copy(count, vargs);
684 #else 726 #else
685 count = vargs; 727 count = vargs;
686 #endif 728 #endif
687 #endif 729 #endif
688 /* step 1: count the number of %S/%R/%s format specifications 730 /* step 1: count the number of %S/%R/%A/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these 731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
690 * objects once during step 3 and put the result in an array) */ 732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
733 * result in an array) */
691 for (f = format; *f; f++) { 734 for (f = format; *f; f++) {
692 if (*f == '%') { 735 if (*f == '%') {
693 if (*(f+1)=='%') 736 if (*(f+1)=='%')
694 continue; 737 continue;
695 if (*(f+1)=='S' || *(f+1)=='R') 738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
696 ++callcount; 739 ++callcount;
697 while (isdigit((unsigned)*f)) 740 while (ISDIGIT((unsigned)*f))
698 width = (width*10) + *f++ - '0'; 741 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f)) 742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
700 ; 743 ;
701 if (*f == 's') 744 if (*f == 's')
702 ++callcount; 745 ++callcount;
703 } 746 }
704 } 747 }
705 /* step 2: allocate memory for the results of 748 /* step 2: allocate memory for the results of
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
707 if (callcount) { 750 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) { 752 if (!callresults) {
710 PyErr_NoMemory(); 753 PyErr_NoMemory();
711 return NULL; 754 return NULL;
712 } 755 }
713 callresult = callresults; 756 callresult = callresults;
714 } 757 }
715 /* step 3: figure out how large a buffer we need */ 758 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) { 759 for (f = format; *f; f++) {
717 if (*f == '%') { 760 if (*f == '%') {
718 const char* p = f; 761 const char* p = f;
719 width = 0; 762 width = 0;
720 while (isdigit((unsigned)*f)) 763 while (ISDIGIT((unsigned)*f))
721 width = (width*10) + *f++ - '0'; 764 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f)) 765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
723 ; 766 ;
724 767
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve. 769 * they don't affect the amount of space we reserve.
727 */ 770 */
728 if ((*f == 'l' || *f == 'z') && 771 if ((*f == 'l' || *f == 'z') &&
729 (f[1] == 'd' || f[1] == 'u')) 772 (f[1] == 'd' || f[1] == 'u'))
730 ++f; 773 ++f;
731 774
732 switch (*f) { 775 switch (*f) {
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
799 PyObject *repr; 842 PyObject *repr;
800 assert(obj); 843 assert(obj);
801 repr = PyObject_Repr(obj); 844 repr = PyObject_Repr(obj);
802 if (!repr) 845 if (!repr)
803 goto fail; 846 goto fail;
804 n += PyUnicode_GET_SIZE(repr); 847 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */ 848 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr; 849 *callresult++ = repr;
807 break; 850 break;
808 } 851 }
852 case 'A':
853 {
854 PyObject *obj = va_arg(count, PyObject *);
855 PyObject *ascii;
856 assert(obj);
857 ascii = PyObject_ASCII(obj);
858 if (!ascii)
859 goto fail;
860 n += PyUnicode_GET_SIZE(ascii);
861 /* Remember the ascii result and switch to the next slot */
862 *callresult++ = ascii;
863 break;
864 }
809 case 'p': 865 case 'p':
810 (void) va_arg(count, int); 866 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation: 867 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff 868 * 0xffffffffffffffff
813 * so 19 characters is enough. 869 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for? 870 * XXX I count 18 -- what's the extra for?
815 */ 871 */
816 n += 19; 872 n += 19;
817 break; 873 break;
818 default: 874 default:
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
851 callresult = callresults; 907 callresult = callresults;
852 908
853 for (f = format; *f; f++) { 909 for (f = format; *f; f++) {
854 if (*f == '%') { 910 if (*f == '%') {
855 const char* p = f++; 911 const char* p = f++;
856 int longflag = 0; 912 int longflag = 0;
857 int size_tflag = 0; 913 int size_tflag = 0;
858 zeropad = (*f == '0'); 914 zeropad = (*f == '0');
859 /* parse the width.precision part */ 915 /* parse the width.precision part */
860 width = 0; 916 width = 0;
861 while (isdigit((unsigned)*f)) 917 while (ISDIGIT((unsigned)*f))
862 width = (width*10) + *f++ - '0'; 918 width = (width*10) + *f++ - '0';
863 precision = 0; 919 precision = 0;
864 if (*f == '.') { 920 if (*f == '.') {
865 f++; 921 f++;
866 while (isdigit((unsigned)*f)) 922 while (ISDIGIT((unsigned)*f))
867 precision = (precision*10) + *f++ - '0'; 923 precision = (precision*10) + *f++ - '0';
868 } 924 }
869 /* handle the long flag, but only for %ld and %lu. 925 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */ 926 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1; 928 longflag = 1;
873 ++f; 929 ++f;
874 } 930 }
875 /* handle the size_t flag. */ 931 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after
1054 if (size > PyUnicode_GET_SIZE(unicode)) 1110 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode); 1111 return PyUnicode_GET_SIZE(unicode);
1056 else 1112 else
1057 return size; 1113 return size;
1058 } 1114 }
1059 1115
1060 #endif 1116 #endif
1061 1117
1062 PyObject *PyUnicode_FromOrdinal(int ordinal) 1118 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 { 1119 {
1064 Py_UNICODE s[1]; 1120 Py_UNICODE s[2];
1065 1121
1066 #ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) { 1122 if (ordinal < 0 || ordinal > 0x10ffff) {
1068 PyErr_SetString(PyExc_ValueError, 1123 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) " 1124 "chr() arg not in range(0x110000)");
1070 "(wide Python build)"); 1125 return NULL;
1071 return NULL; 1126 }
1072 } 1127
1073 #else 1128 #ifndef Py_UNICODE_WIDE
1074 if (ordinal < 0 || ordinal > 0xffff) { 1129 if (ordinal > 0xffff) {
1075 PyErr_SetString(PyExc_ValueError, 1130 ordinal -= 0x10000;
1076 "unichr() arg not in range(0x10000) " 1131 s[0] = 0xD800 | (ordinal >> 10);
1077 "(narrow Python build)"); 1132 s[1] = 0xDC00 | (ordinal & 0x3FF);
1078 return NULL; 1133 return PyUnicode_FromUnicode(s, 2);
1079 } 1134 }
1080 #endif 1135 #endif
1081 1136
1082 s[0] = (Py_UNICODE)ordinal; 1137 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1); 1138 return PyUnicode_FromUnicode(s, 1);
1084 } 1139 }
1085 1140
1086 PyObject *PyUnicode_FromObject(register PyObject *obj) 1141 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 { 1142 {
1088 /* XXX Perhaps we should make this API an alias of 1143 /* XXX Perhaps we should make this API an alias of
1089 PyObject_Unicode() instead ?! */ 1144 PyObject_Str() instead ?! */
1090 if (PyUnicode_CheckExact(obj)) { 1145 if (PyUnicode_CheckExact(obj)) {
1091 Py_INCREF(obj); 1146 Py_INCREF(obj);
1092 return obj; 1147 return obj;
1093 } 1148 }
1094 if (PyUnicode_Check(obj)) { 1149 if (PyUnicode_Check(obj)) {
1095 /* For a Unicode subtype that's not a Unicode object, 1150 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */ 1151 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj)); 1153 PyUnicode_GET_SIZE(obj));
1099 } 1154 }
1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 1155 PyErr_Format(PyExc_TypeError,
1156 "Can't convert '%.100s' object to str implicitly",
1157 Py_TYPE(obj)->tp_name);
1158 return NULL;
1101 } 1159 }
1102 1160
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1161 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104 const char *encoding, 1162 const char *encoding,
1105 const char *errors) 1163 const char *errors)
1106 { 1164 {
1107 const char *s = NULL; 1165 Py_buffer buffer;
1108 Py_ssize_t len;
1109 PyObject *v; 1166 PyObject *v;
1110 1167
1111 if (obj == NULL) { 1168 if (obj == NULL) {
1112 PyErr_BadInternalCall(); 1169 PyErr_BadInternalCall();
1113 return NULL; 1170 return NULL;
1114 } 1171 }
1115 1172
1116 #if 0 1173 /* Decoding bytes objects is the most common case and should be fast */
1117 /* For b/w compatibility we also accept Unicode objects provided 1174 if (PyBytes_Check(obj)) {
1118 that no encodings is given and then redirect to 1175 if (PyBytes_GET_SIZE(obj) == 0) {
1119 PyObject_Unicode() which then applies the additional logic for 1176 Py_INCREF(unicode_empty);
1120 Unicode subclasses. 1177 v = (PyObject *) unicode_empty;
1121 1178 }
1122 NOTE: This API should really only be used for object which 1179 else {
1123 represent *encoded* Unicode ! 1180 v = PyUnicode_Decode(
1124 1181 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1125 */ 1182 encoding, errors);
1126 if (PyUnicode_Check(obj)) { 1183 }
1127 if (encoding) { 1184 return v;
1128 PyErr_SetString(PyExc_TypeError, 1185 }
1129 "decoding Unicode is not supported"); 1186
1130 return NULL;
1131 }
1132 return PyObject_Unicode(obj);
1133 }
1134 #else
1135 if (PyUnicode_Check(obj)) { 1187 if (PyUnicode_Check(obj)) {
1136 PyErr_SetString(PyExc_TypeError, 1188 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported"); 1189 "decoding str is not supported");
1138 return NULL; 1190 return NULL;
1139 } 1191 }
1140 #endif 1192
1141 1193 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1142 /* Coerce object */ 1194 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1143 if (PyString_Check(obj)) {
1144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
1146 }
1147 else if (PyByteArray_Check(obj)) {
1148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError, 1195 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported"); 1196 "coercing to str: need bytes, bytearray "
1151 return NULL; 1197 "or buffer-like object, %.80s found",
1152 } 1198 Py_TYPE(obj)->tp_name);
1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1199 return NULL;
1154 /* Overwrite the error message with something more useful in 1200 }
1155 case of a TypeError. */ 1201
1156 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1202 if (buffer.len == 0) {
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
1162 }
1163
1164 /* Convert to Unicode */
1165 if (len == 0) {
1166 Py_INCREF(unicode_empty); 1203 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty; 1204 v = (PyObject *) unicode_empty;
1168 } 1205 }
1169 else 1206 else
1170 v = PyUnicode_Decode(s, len, encoding, errors); 1207 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1171 1208
1209 PyBuffer_Release(&buffer);
1172 return v; 1210 return v;
1173
1174 onError:
1175 return NULL;
1176 } 1211 }
1177 1212
1178 PyObject *PyUnicode_Decode(const char *s, 1213 PyObject *PyUnicode_Decode(const char *s,
1179 Py_ssize_t size, 1214 Py_ssize_t size,
1180 const char *encoding, 1215 const char *encoding,
1181 const char *errors) 1216 const char *errors)
1182 { 1217 {
1183 PyObject *buffer = NULL, *unicode; 1218 PyObject *buffer = NULL, *unicode;
1219 Py_buffer info;
1220 char lower[20]; /* Enough for any encoding name we recognize */
1221 char *l;
1222 const char *e;
1184 1223
1185 if (encoding == NULL) 1224 if (encoding == NULL)
1186 encoding = PyUnicode_GetDefaultEncoding(); 1225 encoding = PyUnicode_GetDefaultEncoding();
1187 1226
1227 /* Convert encoding to lower case and replace '_' with '-' in order to
1228 catch e.g. UTF_8 */
1229 e = encoding;
1230 l = lower;
1231 while (*e && l < &lower[(sizeof lower) - 2]) {
1232 if (ISUPPER(*e)) {
1233 *l++ = TOLOWER(*e++);
1234 }
1235 else if (*e == '_') {
1236 *l++ = '-';
1237 e++;
1238 }
1239 else {
1240 *l++ = *e++;
1241 }
1242 }
1243 *l = '\0';
1244
1188 /* Shortcuts for common default encodings */ 1245 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0) 1246 if (strcmp(lower, "utf-8") == 0)
1190 return PyUnicode_DecodeUTF8(s, size, errors); 1247 return PyUnicode_DecodeUTF8(s, size, errors);
1191 else if (strcmp(encoding, "latin-1") == 0) 1248 else if ((strcmp(lower, "latin-1") == 0) ||
1249 (strcmp(lower, "iso-8859-1") == 0))
1192 return PyUnicode_DecodeLatin1(s, size, errors); 1250 return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0) 1252 else if (strcmp(lower, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors); 1253 return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif 1254 #endif
1197 else if (strcmp(encoding, "ascii") == 0) 1255 else if (strcmp(lower, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors); 1256 return PyUnicode_DecodeASCII(s, size, errors);
1257 else if (strcmp(lower, "utf-16") == 0)
1258 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1259 else if (strcmp(lower, "utf-32") == 0)
1260 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1199 1261
1200 /* Decode via the codec registry */ 1262 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size); 1263 buffer = NULL;
1264 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1265 goto onError;
1266 buffer = PyMemoryView_FromBuffer(&info);
1202 if (buffer == NULL) 1267 if (buffer == NULL)
1203 goto onError; 1268 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors); 1269 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL) 1270 if (unicode == NULL)
1206 goto onError; 1271 goto onError;
1207 if (!PyUnicode_Check(unicode)) { 1272 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError, 1273 PyErr_Format(PyExc_TypeError,
1209 "decoder did not return an unicode object (type=%.400s)", 1274 "decoder did not return a str object (type=%.400s)",
1210 Py_TYPE(unicode)->tp_name); 1275 Py_TYPE(unicode)->tp_name);
1211 Py_DECREF(unicode); 1276 Py_DECREF(unicode);
1212 goto onError; 1277 goto onError;
1213 } 1278 }
1214 Py_DECREF(buffer); 1279 Py_DECREF(buffer);
1215 return unicode; 1280 return unicode;
1216 1281
1217 onError: 1282 onError:
1218 Py_XDECREF(buffer); 1283 Py_XDECREF(buffer);
1219 return NULL; 1284 return NULL;
(...skipping 16 matching lines...) Expand all
1236 /* Decode via the codec registry */ 1301 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors); 1302 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL) 1303 if (v == NULL)
1239 goto onError; 1304 goto onError;
1240 return v; 1305 return v;
1241 1306
1242 onError: 1307 onError:
1243 return NULL; 1308 return NULL;
1244 } 1309 }
1245 1310
1311 PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1312 const char *encoding,
1313 const char *errors)
1314 {
1315 PyObject *v;
1316
1317 if (!PyUnicode_Check(unicode)) {
1318 PyErr_BadArgument();
1319 goto onError;
1320 }
1321
1322 if (encoding == NULL)
1323 encoding = PyUnicode_GetDefaultEncoding();
1324
1325 /* Decode via the codec registry */
1326 v = PyCodec_Decode(unicode, encoding, errors);
1327 if (v == NULL)
1328 goto onError;
1329 if (!PyUnicode_Check(v)) {
1330 PyErr_Format(PyExc_TypeError,
1331 "decoder did not return a str object (type=%.400s)",
1332 Py_TYPE(v)->tp_name);
1333 Py_DECREF(v);
1334 goto onError;
1335 }
1336 return v;
1337
1338 onError:
1339 return NULL;
1340 }
1341
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1342 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247 Py_ssize_t size, 1343 Py_ssize_t size,
1248 const char *encoding, 1344 const char *encoding,
1249 const char *errors) 1345 const char *errors)
1250 { 1346 {
1251 PyObject *v, *unicode; 1347 PyObject *v, *unicode;
1252 1348
1253 unicode = PyUnicode_FromUnicode(s, size); 1349 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL) 1350 if (unicode == NULL)
1255 return NULL; 1351 return NULL;
(...skipping 27 matching lines...) Expand all
1283 } 1379 }
1284 1380
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1381 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding, 1382 const char *encoding,
1287 const char *errors) 1383 const char *errors)
1288 { 1384 {
1289 PyObject *v; 1385 PyObject *v;
1290 1386
1291 if (!PyUnicode_Check(unicode)) { 1387 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument(); 1388 PyErr_BadArgument();
1293 goto onError; 1389 return NULL;
1294 } 1390 }
1295 1391
1296 if (encoding == NULL) 1392 if (encoding == NULL)
1297 encoding = PyUnicode_GetDefaultEncoding(); 1393 encoding = PyUnicode_GetDefaultEncoding();
1298 1394
1299 /* Shortcuts for common default encodings */ 1395 /* Shortcuts for common default encodings */
1300 if (errors == NULL) { 1396 if (errors == NULL) {
1301 if (strcmp(encoding, "utf-8") == 0) 1397 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode); 1398 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0) 1399 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode); 1400 return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1401 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306 else if (strcmp(encoding, "mbcs") == 0) 1402 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode); 1403 return PyUnicode_AsMBCSString(unicode);
1308 #endif 1404 #endif
1309 else if (strcmp(encoding, "ascii") == 0) 1405 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode); 1406 return PyUnicode_AsASCIIString(unicode);
1311 } 1407 /* During bootstrap, we may need to find the encodings
1408 package, to load the file system encoding, and require the
1409 file system encoding in order to load the encodings
1410 package.
1411
1412 Break out of this dependency by assuming that the path to
1413 the encodings module is ASCII-only. XXX could try wcstombs
1414 instead, if the file system encoding is the locale's
1415 encoding. */
1416 else if (Py_FileSystemDefaultEncoding &&
1417 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1418 !PyThreadState_GET()->interp->codecs_initialized)
1419 return PyUnicode_AsASCIIString(unicode);
1420 }
1421
1422 /* Encode via the codec registry */
1423 v = PyCodec_Encode(unicode, encoding, errors);
1424 if (v == NULL)
1425 return NULL;
1426
1427 /* The normal path */
1428 if (PyBytes_Check(v))
1429 return v;
1430
1431 /* If the codec returns a buffer, raise a warning and convert to bytes */
1432 if (PyByteArray_Check(v)) {
1433 char msg[100];
1434 PyObject *b;
1435 PyOS_snprintf(msg, sizeof(msg),
1436 "encoder %s returned buffer instead of bytes",
1437 encoding);
1438 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1439 Py_DECREF(v);
1440 return NULL;
1441 }
1442
1443 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1444 Py_DECREF(v);
1445 return b;
1446 }
1447
1448 PyErr_Format(PyExc_TypeError,
1449 "encoder did not return a bytes object (type=%.400s)",
1450 Py_TYPE(v)->tp_name);
1451 Py_DECREF(v);
1452 return NULL;
1453 }
1454
1455 PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458 {
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
1467 encoding = PyUnicode_GetDefaultEncoding();
1312 1468
1313 /* Encode via the codec registry */ 1469 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors); 1470 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL) 1471 if (v == NULL)
1316 goto onError; 1472 goto onError;
1317 if (!PyString_Check(v)) { 1473 if (!PyUnicode_Check(v)) {
1318 PyErr_Format(PyExc_TypeError, 1474 PyErr_Format(PyExc_TypeError,
1319 "encoder did not return a string object (type=%.400s)", 1475 "encoder did not return an str object (type=%.400s)",
1320 Py_TYPE(v)->tp_name); 1476 Py_TYPE(v)->tp_name);
1321 Py_DECREF(v); 1477 Py_DECREF(v);
1322 goto onError; 1478 goto onError;
1323 } 1479 }
1324 return v; 1480 return v;
1325 1481
1326 onError: 1482 onError:
1327 return NULL; 1483 return NULL;
1328 } 1484 }
1329 1485
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1486 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331 const char *errors) 1487 const char *errors)
1332 { 1488 {
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1489 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v) 1490 if (v)
1336 return v; 1491 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 1492 if (errors != NULL)
1338 if (v && errors == NULL) 1493 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1339 ((PyUnicodeObject *)unicode)->defenc = v; 1494 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1495 PyUnicode_GET_SIZE(unicode),
1496 NULL);
1497 if (!v)
1498 return NULL;
1499 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v; 1500 return v;
1501 }
1502
1503 PyObject*
1504 PyUnicode_DecodeFSDefault(const char *s) {
1505 Py_ssize_t size = (Py_ssize_t)strlen(s);
1506 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1507 }
1508
1509 PyObject*
1510 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1511 {
1512 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1513 can be undefined. If that is the case, decode using UTF-8. The following assumes
1514 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1515 bootstrapping process where the codecs aren't ready yet.
1516 */
1517 if (Py_FileSystemDefaultEncoding) {
1518 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1519 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1520 return PyUnicode_DecodeMBCS(s, size, "replace");
1521 }
1522 #elif defined(__APPLE__)
1523 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1524 return PyUnicode_DecodeUTF8(s, size, "replace");
1525 }
1526 #endif
1527 return PyUnicode_Decode(s, size,
1528 Py_FileSystemDefaultEncoding,
1529 "replace");
1530 }
1531 else {
1532 return PyUnicode_DecodeUTF8(s, size, "replace");
1533 }
1534 }
1535
1536 /* Convert the argument to a bytes object, according to the file
1537 system encoding */
1538
1539 int
1540 PyUnicode_FSConverter(PyObject* arg, void* addr)
1541 {
1542 PyObject *output = NULL;
1543 Py_ssize_t size;
1544 void *data;
1545 if (arg == NULL) {
1546 Py_DECREF(*(PyObject**)addr);
1547 return 1;
1548 }
1549 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1550 output = arg;
1551 Py_INCREF(output);
1552 }
1553 else {
1554 arg = PyUnicode_FromObject(arg);
1555 if (!arg)
1556 return 0;
1557 output = PyUnicode_AsEncodedObject(arg,
1558 Py_FileSystemDefaultEncoding,
1559 "surrogateescape");
1560 Py_DECREF(arg);
1561 if (!output)
1562 return 0;
1563 if (!PyBytes_Check(output)) {
1564 Py_DECREF(output);
1565 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1566 return 0;
1567 }
1568 }
1569 if (PyBytes_Check(output)) {
1570 size = PyBytes_GET_SIZE(output);
1571 data = PyBytes_AS_STRING(output);
1572 }
1573 else {
1574 size = PyByteArray_GET_SIZE(output);
1575 data = PyByteArray_AS_STRING(output);
1576 }
1577 if (size != strlen(data)) {
1578 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1579 Py_DECREF(output);
1580 return 0;
1581 }
1582 *(PyObject**)addr = output;
1583 return Py_CLEANUP_SUPPORTED;
1584 }
1585
1586
1587 char*
1588 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1589 {
1590 PyObject *bytes;
1591 if (!PyUnicode_Check(unicode)) {
1592 PyErr_BadArgument();
1593 return NULL;
1594 }
1595 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1596 if (bytes == NULL)
1597 return NULL;
1598 if (psize != NULL)
1599 *psize = PyBytes_GET_SIZE(bytes);
1600 return PyBytes_AS_STRING(bytes);
1601 }
1602
1603 char*
1604 _PyUnicode_AsString(PyObject *unicode)
1605 {
1606 return _PyUnicode_AsStringAndSize(unicode, NULL);
1341 } 1607 }
1342 1608
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1609 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 { 1610 {
1345 if (!PyUnicode_Check(unicode)) { 1611 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument(); 1612 PyErr_BadArgument();
1347 goto onError; 1613 goto onError;
1348 } 1614 }
1349 return PyUnicode_AS_UNICODE(unicode); 1615 return PyUnicode_AS_UNICODE(unicode);
1350 1616
(...skipping 13 matching lines...) Expand all
1364 return -1; 1630 return -1;
1365 } 1631 }
1366 1632
1367 const char *PyUnicode_GetDefaultEncoding(void) 1633 const char *PyUnicode_GetDefaultEncoding(void)
1368 { 1634 {
1369 return unicode_default_encoding; 1635 return unicode_default_encoding;
1370 } 1636 }
1371 1637
1372 int PyUnicode_SetDefaultEncoding(const char *encoding) 1638 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 { 1639 {
1374 PyObject *v; 1640 if (strcmp(encoding, unicode_default_encoding) != 0) {
1375 1641 PyErr_Format(PyExc_ValueError,
1376 /* Make sure the encoding is valid. As side effect, this also 1642 "Can only set default encoding to %s",
1377 loads the encoding into the codec registry cache. */ 1643 unicode_default_encoding);
1378 v = _PyCodec_Lookup(encoding); 1644 return -1;
1379 if (v == NULL) 1645 }
1380 goto onError;
1381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
1383 encoding,
1384 sizeof(unicode_default_encoding));
1385 return 0; 1646 return 0;
1386
1387 onError:
1388 return -1;
1389 } 1647 }
1390 1648
1391 /* error handling callback helper: 1649 /* error handling callback helper:
1392 build arguments, call the callback and check the arguments, 1650 build arguments, call the callback and check the arguments,
1393 if no exception occurred, copy the replacement to the output 1651 if no exception occurred, copy the replacement to the output
1394 and adjust various state variables. 1652 and adjust various state variables.
1395 return 0 on success, -1 on error 1653 return 0 on success, -1 on error
1396 */ 1654 */
1397 1655
1398 static 1656 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler , 1657 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler ,
1400 const char *encoding, const char *reason, 1658 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ss ize_t *startinpos, 1659 const char **input, const char **inend, Py_ ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionO bject, const char **inptr, 1660 Py_ssize_t *endinpos, PyObject **exceptionO bject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpo s, Py_UNICODE **outptr) 1661 PyUnicodeObject **output, Py_ssize_t *outpo s, Py_UNICODE **outptr)
1404 { 1662 {
1405 static char *argparse = "O!n;decoding error handler must return (unicode, in t) tuple"; 1663 static char *argparse = "O!n;decoding error handler must return (str, int) t uple";
1406 1664
1407 PyObject *restuple = NULL; 1665 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL; 1666 PyObject *repunicode = NULL;
1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1667 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1668 Py_ssize_t insize;
1410 Py_ssize_t requiredsize; 1669 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos; 1670 Py_ssize_t newpos;
1412 Py_UNICODE *repptr; 1671 Py_UNICODE *repptr;
1672 PyObject *inputobj = NULL;
1413 Py_ssize_t repsize; 1673 Py_ssize_t repsize;
1414 int res = -1; 1674 int res = -1;
1415 1675
1416 if (*errorHandler == NULL) { 1676 if (*errorHandler == NULL) {
1417 *errorHandler = PyCodec_LookupError(errors); 1677 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL) 1678 if (*errorHandler == NULL)
1419 goto onError; 1679 goto onError;
1420 } 1680 }
1421 1681
1422 if (*exceptionObject == NULL) { 1682 if (*exceptionObject == NULL) {
1423 *exceptionObject = PyUnicodeDecodeError_Create( 1683 *exceptionObject = PyUnicodeDecodeError_Create(
1424 encoding, input, insize, *startinpos, *endinpos, reason); 1684 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL) 1685 if (*exceptionObject == NULL)
1426 goto onError; 1686 goto onError;
1427 } 1687 }
1428 else { 1688 else {
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1689 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError; 1690 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1691 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError; 1692 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1693 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError; 1694 goto onError;
1435 } 1695 }
1436 1696
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L); 1697 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL L);
1438 if (restuple == NULL) 1698 if (restuple == NULL)
1439 goto onError; 1699 goto onError;
1440 if (!PyTuple_Check(restuple)) { 1700 if (!PyTuple_Check(restuple)) {
1441 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1701 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442 goto onError; 1702 goto onError;
1443 } 1703 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos)) 1704 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new pos))
1445 goto onError; 1705 goto onError;
1706
1707 /* Copy back the bytes variables, which might have been modified by the
1708 callback */
1709 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1710 if (!inputobj)
1711 goto onError;
1712 if (!PyBytes_Check(inputobj)) {
1713 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes" );
1714 }
1715 *input = PyBytes_AS_STRING(inputobj);
1716 insize = PyBytes_GET_SIZE(inputobj);
1717 *inend = *input + insize;
1718 /* we can DECREF safely, as the exception has another reference,
1719 so the object won't go away. */
1720 Py_DECREF(inputobj);
1721
1446 if (newpos<0) 1722 if (newpos<0)
1447 newpos = insize+newpos; 1723 newpos = insize+newpos;
1448 if (newpos<0 || newpos>insize) { 1724 if (newpos<0 || newpos>insize) {
1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b ounds", newpos); 1725 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b ounds", newpos);
1450 goto onError; 1726 goto onError;
1451 } 1727 }
1452 1728
1453 /* need more space? (at least enough for what we 1729 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting 1730 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space 1731 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */ 1732 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode); 1733 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode); 1734 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos; 1735 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) { 1736 if (requiredsize > outsize) {
1461 if (requiredsize<2*outsize) 1737 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize; 1738 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0) 1739 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError; 1740 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1741 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466 } 1742 }
1467 *endinpos = newpos; 1743 *endinpos = newpos;
1468 *inptr = input + newpos; 1744 *inptr = *input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize); 1745 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize; 1746 *outptr += repsize;
1471 *outpos += repsize; 1747 *outpos += repsize;
1748
1472 /* we made it! */ 1749 /* we made it! */
1473 res = 0; 1750 res = 0;
1474 1751
1475 onError: 1752 onError:
1476 Py_XDECREF(restuple); 1753 Py_XDECREF(restuple);
1477 return res; 1754 return res;
1478 } 1755 }
1479 1756
1480 /* --- UTF-7 Codec -------------------------------------------------------- */ 1757 /* --- UTF-7 Codec -------------------------------------------------------- */
1481 1758
1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 1759 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1483 1760
1484 /* Three simple macros defining base-64. */ 1761 /* Three simple macros defining base-64. */
1485 1762
1486 /* Is c a base-64 character? */ 1763 /* Is c a base-64 character? */
1487 1764
1488 #define IS_BASE64(c) \ 1765 #define IS_BASE64(c) \
1489 (isalnum(c) || (c) == '+' || (c) == '/') 1766 (((c) >= 'A' && (c) <= 'Z') || \
1767 ((c) >= 'a' && (c) <= 'z') || \
1768 ((c) >= '0' && (c) <= '9') || \
1769 (c) == '+' || (c) == '/')
1490 1770
1491 /* given that c is a base-64 character, what is its base-64 value? */ 1771 /* given that c is a base-64 character, what is its base-64 value? */
1492 1772
1493 #define FROM_BASE64(c) \ 1773 #define FROM_BASE64(c) \
1494 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1774 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1495 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1775 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1496 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1776 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1497 (c) == '+' ? 62 : 63) 1777 (c) == '+' ? 62 : 63)
1498 1778
1499 /* What is the base-64 character of the bottom 6 bits of n? */ 1779 /* What is the base-64 character of the bottom 6 bits of n? */
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
1597 if (consumed) 1877 if (consumed)
1598 *consumed = 0; 1878 *consumed = 0;
1599 return (PyObject *)unicode; 1879 return (PyObject *)unicode;
1600 } 1880 }
1601 1881
1602 p = unicode->str; 1882 p = unicode->str;
1603 shiftOutStart = p; 1883 shiftOutStart = p;
1604 e = s + size; 1884 e = s + size;
1605 1885
1606 while (s < e) { 1886 while (s < e) {
1607 Py_UNICODE ch = (unsigned char) *s; 1887 Py_UNICODE ch;
1888 restart:
1889 ch = (unsigned char) *s;
1608 1890
1609 if (inShift) { /* in a base-64 section */ 1891 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */ 1892 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 1893 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6; 1894 base64bits += 6;
1613 s++; 1895 s++;
1614 if (base64bits >= 16) { 1896 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */ 1897 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE) 1898 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16)); 1899 (base64buffer >> (base64bits-16));
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
1699 errmsg = "unexpected special character"; 1981 errmsg = "unexpected special character";
1700 goto utf7Error; 1982 goto utf7Error;
1701 } 1983 }
1702 continue; 1984 continue;
1703 utf7Error: 1985 utf7Error:
1704 outpos = p-PyUnicode_AS_UNICODE(unicode); 1986 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts; 1987 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler( 1988 if (unicode_decode_call_errorhandler(
1707 errors, &errorHandler, 1989 errors, &errorHandler,
1708 "utf7", errmsg, 1990 "utf7", errmsg,
1709 starts, size, &startinpos, &endinpos, &exc, &s, 1991 &starts, &e, &startinpos, &endinpos, &exc, &s,
1710 &unicode, &outpos, &p)) 1992 &unicode, &outpos, &p))
1711 goto onError; 1993 goto onError;
1712 } 1994 }
1713 1995
1714 /* end of string */ 1996 /* end of string */
1715 1997
1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 1998 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */ 1999 /* if we're in an inconsistent state, that's an error */
1718 if (surrogate || 2000 if (surrogate ||
1719 (base64bits >= 6) || 2001 (base64bits >= 6) ||
1720 (base64bits > 0 && base64buffer != 0)) { 2002 (base64bits > 0 && base64buffer != 0)) {
1721 outpos = p-PyUnicode_AS_UNICODE(unicode); 2003 outpos = p-PyUnicode_AS_UNICODE(unicode);
1722 endinpos = size; 2004 endinpos = size;
1723 if (unicode_decode_call_errorhandler( 2005 if (unicode_decode_call_errorhandler(
1724 errors, &errorHandler, 2006 errors, &errorHandler,
1725 "utf7", "unterminated shift sequence", 2007 "utf7", "unterminated shift sequence",
1726 starts, size, &startinpos, &endinpos, &exc, &s, 2008 &starts, &e, &startinpos, &endinpos, &exc, &s,
1727 &unicode, &outpos, &p)) 2009 &unicode, &outpos, &p))
1728 goto onError; 2010 goto onError;
2011 if (s < e)
2012 goto restart;
1729 } 2013 }
1730 } 2014 }
1731 2015
1732 /* return state */ 2016 /* return state */
1733 if (consumed) { 2017 if (consumed) {
1734 if (inShift) { 2018 if (inShift) {
1735 p = shiftOutStart; /* back off output */ 2019 p = shiftOutStart; /* back off output */
1736 *consumed = startinpos; 2020 *consumed = startinpos;
1737 } 2021 }
1738 else { 2022 else {
(...skipping 25 matching lines...) Expand all
1764 PyObject *v; 2048 PyObject *v;
1765 /* It might be possible to tighten this worst case */ 2049 /* It might be possible to tighten this worst case */
1766 Py_ssize_t allocated = 8 * size; 2050 Py_ssize_t allocated = 8 * size;
1767 int inShift = 0; 2051 int inShift = 0;
1768 Py_ssize_t i = 0; 2052 Py_ssize_t i = 0;
1769 unsigned int base64bits = 0; 2053 unsigned int base64bits = 0;
1770 unsigned long base64buffer = 0; 2054 unsigned long base64buffer = 0;
1771 char * out; 2055 char * out;
1772 char * start; 2056 char * start;
1773 2057
2058 if (size == 0)
2059 return PyBytes_FromStringAndSize(NULL, 0);
2060
1774 if (allocated / 8 != size) 2061 if (allocated / 8 != size)
1775 return PyErr_NoMemory(); 2062 return PyErr_NoMemory();
1776 2063
1777 if (size == 0) 2064 v = PyBytes_FromStringAndSize(NULL, allocated);
1778 return PyString_FromStringAndSize(NULL, 0);
1779
1780 v = PyString_FromStringAndSize(NULL, allocated);
1781 if (v == NULL) 2065 if (v == NULL)
1782 return NULL; 2066 return NULL;
1783 2067
1784 start = out = PyString_AS_STRING(v); 2068 start = out = PyBytes_AS_STRING(v);
1785 for (;i < size; ++i) { 2069 for (;i < size; ++i) {
1786 Py_UNICODE ch = s[i]; 2070 Py_UNICODE ch = s[i];
1787 2071
1788 if (inShift) { 2072 if (inShift) {
1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2073 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790 /* shifting out */ 2074 /* shifting out */
1791 if (base64bits) { /* output remaining bits */ 2075 if (base64bits) { /* output remaining bits */
1792 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2076 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793 base64buffer = 0; 2077 base64buffer = 0;
1794 base64bits = 0; 2078 base64bits = 0;
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
1838 base64buffer = (base64buffer << 16) | ch; 2122 base64buffer = (base64buffer << 16) | ch;
1839 while (base64bits >= 6) { 2123 while (base64bits >= 6) {
1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2124 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841 base64bits -= 6; 2125 base64bits -= 6;
1842 } 2126 }
1843 } 2127 }
1844 if (base64bits) 2128 if (base64bits)
1845 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2129 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846 if (inShift) 2130 if (inShift)
1847 *out++ = '-'; 2131 *out++ = '-';
1848 2132 if (_PyBytes_Resize(&v, out - start) < 0)
1849 if (_PyString_Resize(&v, out - start))
1850 return NULL; 2133 return NULL;
1851 return v; 2134 return v;
1852 } 2135 }
1853 2136
1854 #undef IS_BASE64 2137 #undef IS_BASE64
1855 #undef FROM_BASE64 2138 #undef FROM_BASE64
1856 #undef TO_BASE64 2139 #undef TO_BASE64
1857 #undef DECODE_DIRECT 2140 #undef DECODE_DIRECT
1858 #undef ENCODE_DIRECT 2141 #undef ENCODE_DIRECT
1859 2142
(...skipping 21 matching lines...) Expand all
1881 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2164 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
1882 }; 2165 };
1883 2166
1884 PyObject *PyUnicode_DecodeUTF8(const char *s, 2167 PyObject *PyUnicode_DecodeUTF8(const char *s,
1885 Py_ssize_t size, 2168 Py_ssize_t size,
1886 const char *errors) 2169 const char *errors)
1887 { 2170 {
1888 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1889 } 2172 }
1890 2173
2174 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
2175 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2176
2177 /* Mask to quickly check whether a C 'long' contains a
2178 non-ASCII, UTF8-encoded char. */
2179 #if (SIZEOF_LONG == 8)
2180 # define ASCII_CHAR_MASK 0x8080808080808080L
2181 #elif (SIZEOF_LONG == 4)
2182 # define ASCII_CHAR_MASK 0x80808080L
2183 #else
2184 # error C 'long' size should be either 4 or 8!
2185 #endif
2186
1891 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2187 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1892 Py_ssize_t size, 2188 Py_ssize_t size,
1893 const char *errors, 2189 const char *errors,
1894 Py_ssize_t *consumed) 2190 Py_ssize_t *consumed)
1895 { 2191 {
1896 const char *starts = s; 2192 const char *starts = s;
1897 int n; 2193 int n;
1898 int k; 2194 int k;
1899 Py_ssize_t startinpos; 2195 Py_ssize_t startinpos;
1900 Py_ssize_t endinpos; 2196 Py_ssize_t endinpos;
1901 Py_ssize_t outpos; 2197 Py_ssize_t outpos;
1902 const char *e; 2198 const char *e, *aligned_end;
1903 PyUnicodeObject *unicode; 2199 PyUnicodeObject *unicode;
1904 Py_UNICODE *p; 2200 Py_UNICODE *p;
1905 const char *errmsg = ""; 2201 const char *errmsg = "";
1906 PyObject *errorHandler = NULL; 2202 PyObject *errorHandler = NULL;
1907 PyObject *exc = NULL; 2203 PyObject *exc = NULL;
1908 2204
1909 /* Note: size will always be longer than the resulting Unicode 2205 /* Note: size will always be longer than the resulting Unicode
1910 character count */ 2206 character count */
1911 unicode = _PyUnicode_New(size); 2207 unicode = _PyUnicode_New(size);
1912 if (!unicode) 2208 if (!unicode)
1913 return NULL; 2209 return NULL;
1914 if (size == 0) { 2210 if (size == 0) {
1915 if (consumed) 2211 if (consumed)
1916 *consumed = 0; 2212 *consumed = 0;
1917 return (PyObject *)unicode; 2213 return (PyObject *)unicode;
1918 } 2214 }
1919 2215
1920 /* Unpack UTF-8 encoded data */ 2216 /* Unpack UTF-8 encoded data */
1921 p = unicode->str; 2217 p = unicode->str;
1922 e = s + size; 2218 e = s + size;
2219 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
1923 2220
1924 while (s < e) { 2221 while (s < e) {
1925 Py_UCS4 ch = (unsigned char)*s; 2222 Py_UCS4 ch = (unsigned char)*s;
2223
2224 if (ch < 0x80) {
2225 /* Fast path for runs of ASCII characters. Given that common UTF-8
2226 input will consist of an overwhelming majority of ASCII
2227 characters, we try to optimize for this case by checking
2228 as many characters as a C 'long' can contain.
2229 First, check if we can do an aligned read, as most CPUs have
2230 a penalty for unaligned reads.
2231 */
2232 if (!((size_t) s & LONG_PTR_MASK)) {
2233 /* Help register allocation */
2234 register const char *_s = s;
2235 register Py_UNICODE *_p = p;
2236 while (_s < aligned_end) {
2237 /* Read a whole long at a time (either 4 or 8 bytes),
2238 and do a fast unrolled copy if it only contains ASCII
2239 characters. */
2240 unsigned long data = *(unsigned long *) _s;
2241 if (data & ASCII_CHAR_MASK)
2242 break;
2243 _p[0] = (unsigned char) _s[0];
2244 _p[1] = (unsigned char) _s[1];
2245 _p[2] = (unsigned char) _s[2];
2246 _p[3] = (unsigned char) _s[3];
2247 #if (SIZEOF_LONG == 8)
2248 _p[4] = (unsigned char) _s[4];
2249 _p[5] = (unsigned char) _s[5];
2250 _p[6] = (unsigned char) _s[6];
2251 _p[7] = (unsigned char) _s[7];
2252 #endif
2253 _s += SIZEOF_LONG;
2254 _p += SIZEOF_LONG;
2255 }
2256 s = _s;
2257 p = _p;
2258 if (s == e)
2259 break;
2260 ch = (unsigned char)*s;
2261 }
2262 }
1926 2263
1927 if (ch < 0x80) { 2264 if (ch < 0x80) {
1928 *p++ = (Py_UNICODE)ch; 2265 *p++ = (Py_UNICODE)ch;
1929 s++; 2266 s++;
1930 continue; 2267 continue;
1931 } 2268 }
1932 2269
1933 n = utf8_code_length[ch]; 2270 n = utf8_code_length[ch];
1934 2271
1935 if (s + n > e) { 2272 if (s + n > e) {
(...skipping 29 matching lines...) Expand all
1965 startinpos = s-starts; 2302 startinpos = s-starts;
1966 endinpos = startinpos + 1; 2303 endinpos = startinpos + 1;
1967 goto utf8Error; 2304 goto utf8Error;
1968 } 2305 }
1969 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2306 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1970 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2307 assert ((ch > 0x007F) && (ch <= 0x07FF));
1971 *p++ = (Py_UNICODE)ch; 2308 *p++ = (Py_UNICODE)ch;
1972 break; 2309 break;
1973 2310
1974 case 3: 2311 case 3:
1975 /* XXX: surrogates shouldn't be valid UTF-8! 2312 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
1976 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2313 will result in surrogates in range d800-dfff. Surrogates are
1977 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 2314 not valid UTF-8 so they are rejected.
1978 Uncomment the 2 lines below to make them invalid, 2315 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ 2316 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
1980 if ((s[1] & 0xc0) != 0x80 || 2317 if ((s[1] & 0xc0) != 0x80 ||
1981 (s[2] & 0xc0) != 0x80 || 2318 (s[2] & 0xc0) != 0x80 ||
1982 ((unsigned char)s[0] == 0xE0 && 2319 ((unsigned char)s[0] == 0xE0 &&
1983 (unsigned char)s[1] < 0xA0)/* || 2320 (unsigned char)s[1] < 0xA0) ||
1984 ((unsigned char)s[0] == 0xED && 2321 ((unsigned char)s[0] == 0xED &&
1985 (unsigned char)s[1] > 0x9F)*/) { 2322 (unsigned char)s[1] > 0x9F)) {
1986 errmsg = "invalid continuation byte"; 2323 errmsg = "invalid continuation byte";
1987 startinpos = s-starts; 2324 startinpos = s-starts;
1988 endinpos = startinpos + 1; 2325 endinpos = startinpos + 1;
1989 2326
1990 /* if s[1] first two bits are 1 and 0, then the invalid 2327 /* if s[1] first two bits are 1 and 0, then the invalid
1991 continuation byte is s[2], so increment endinpos by 1, 2328 continuation byte is s[2], so increment endinpos by 1,
1992 if not, s[1] is invalid and endinpos doesn't need to 2329 if not, s[1] is invalid and endinpos doesn't need to
1993 be incremented. */ 2330 be incremented. */
1994 if ((s[1] & 0xC0) == 0x80) 2331 if ((s[1] & 0xC0) == 0x80)
1995 endinpos++; 2332 endinpos++;
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
2039 break; 2376 break;
2040 } 2377 }
2041 s += n; 2378 s += n;
2042 continue; 2379 continue;
2043 2380
2044 utf8Error: 2381 utf8Error:
2045 outpos = p-PyUnicode_AS_UNICODE(unicode); 2382 outpos = p-PyUnicode_AS_UNICODE(unicode);
2046 if (unicode_decode_call_errorhandler( 2383 if (unicode_decode_call_errorhandler(
2047 errors, &errorHandler, 2384 errors, &errorHandler,
2048 "utf8", errmsg, 2385 "utf8", errmsg,
2049 starts, size, &startinpos, &endinpos, &exc, &s, 2386 &starts, &e, &startinpos, &endinpos, &exc, &s,
2050 &unicode, &outpos, &p)) 2387 &unicode, &outpos, &p))
2051 goto onError; 2388 goto onError;
2389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2052 } 2390 }
2053 if (consumed) 2391 if (consumed)
2054 *consumed = s-starts; 2392 *consumed = s-starts;
2055 2393
2056 /* Adjust length */ 2394 /* Adjust length */
2057 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2058 goto onError; 2396 goto onError;
2059 2397
2060 Py_XDECREF(errorHandler); 2398 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc); 2399 Py_XDECREF(exc);
2062 return (PyObject *)unicode; 2400 return (PyObject *)unicode;
2063 2401
2064 onError: 2402 onError:
2065 Py_XDECREF(errorHandler); 2403 Py_XDECREF(errorHandler);
2066 Py_XDECREF(exc); 2404 Py_XDECREF(exc);
2067 Py_DECREF(unicode); 2405 Py_DECREF(unicode);
2068 return NULL; 2406 return NULL;
2069 } 2407 }
2408
2409 #undef ASCII_CHAR_MASK
2410
2070 2411
2071 /* Allocation strategy: if the string is short, convert into a stack buffer 2412 /* Allocation strategy: if the string is short, convert into a stack buffer
2072 and allocate exactly as much space needed at the end. Else allocate the 2413 and allocate exactly as much space needed at the end. Else allocate the
2073 maximum possible needed (4 result bytes per Unicode character), and return 2414 maximum possible needed (4 result bytes per Unicode character), and return
2074 the excess memory at the end. 2415 the excess memory at the end.
2075 */ 2416 */
2076 PyObject * 2417 PyObject *
2077 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2418 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2078 Py_ssize_t size, 2419 Py_ssize_t size,
2079 const char *errors) 2420 const char *errors)
2080 { 2421 {
2081 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2422 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2082 2423
2083 Py_ssize_t i; /* index into s of next input byte */ 2424 Py_ssize_t i; /* index into s of next input byte */
2084 PyObject *v; /* result string object */ 2425 PyObject *result; /* result string object */
2085 char *p; /* next free byte in output buffer */ 2426 char *p; /* next free byte in output buffer */
2086 Py_ssize_t nallocated; /* number of result bytes allocated */ 2427 Py_ssize_t nallocated; /* number of result bytes allocated */
2087 Py_ssize_t nneeded; /* number of result bytes needed */ 2428 Py_ssize_t nneeded; /* number of result bytes needed */
2088 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2429 char stackbuf[MAX_SHORT_UNICHARS * 4];
2430 PyObject *errorHandler = NULL;
2431 PyObject *exc = NULL;
2089 2432
2090 assert(s != NULL); 2433 assert(s != NULL);
2091 assert(size >= 0); 2434 assert(size >= 0);
2092 2435
2093 if (size <= MAX_SHORT_UNICHARS) { 2436 if (size <= MAX_SHORT_UNICHARS) {
2094 /* Write into the stack buffer; nallocated can't overflow. 2437 /* Write into the stack buffer; nallocated can't overflow.
2095 * At the end, we'll allocate exactly as much heap space as it 2438 * At the end, we'll allocate exactly as much heap space as it
2096 * turns out we need. 2439 * turns out we need.
2097 */ 2440 */
2098 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2441 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2099 v = NULL; /* will allocate after we're done */ 2442 result = NULL; /* will allocate after we're done */
2100 p = stackbuf; 2443 p = stackbuf;
2101 } 2444 }
2102 else { 2445 else {
2103 /* Overallocate on the heap, and give the excess back at the end. */ 2446 /* Overallocate on the heap, and give the excess back at the end. */
2104 nallocated = size * 4; 2447 nallocated = size * 4;
2105 if (nallocated / 4 != size) /* overflow! */ 2448 if (nallocated / 4 != size) /* overflow! */
2106 return PyErr_NoMemory(); 2449 return PyErr_NoMemory();
2107 v = PyString_FromStringAndSize(NULL, nallocated); 2450 result = PyBytes_FromStringAndSize(NULL, nallocated);
2108 if (v == NULL) 2451 if (result == NULL)
2109 return NULL; 2452 return NULL;
2110 p = PyString_AS_STRING(v); 2453 p = PyBytes_AS_STRING(result);
2111 } 2454 }
2112 2455
2113 for (i = 0; i < size;) { 2456 for (i = 0; i < size;) {
2114 Py_UCS4 ch = s[i++]; 2457 Py_UCS4 ch = s[i++];
2115 2458
2116 if (ch < 0x80) 2459 if (ch < 0x80)
2117 /* Encode ASCII */ 2460 /* Encode ASCII */
2118 *p++ = (char) ch; 2461 *p++ = (char) ch;
2119 2462
2120 else if (ch < 0x0800) { 2463 else if (ch < 0x0800) {
2121 /* Encode Latin-1 */ 2464 /* Encode Latin-1 */
2122 *p++ = (char)(0xc0 | (ch >> 6)); 2465 *p++ = (char)(0xc0 | (ch >> 6));
2123 *p++ = (char)(0x80 | (ch & 0x3f)); 2466 *p++ = (char)(0x80 | (ch & 0x3f));
2124 } 2467 } else if (0xD800 <= ch && ch <= 0xDFFF) {
2125 else { 2468 #ifndef Py_UNICODE_WIDE
2126 /* Encode UCS2 Unicode ordinals */ 2469 /* Special case: check for high and low surrogate */
2127 if (ch < 0x10000) { 2470 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2128 /* Special case: check for high surrogate */ 2471 Py_UCS4 ch2 = s[i];
2129 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2472 /* Combine the two surrogates to form a UCS4 value */
2130 Py_UCS4 ch2 = s[i]; 2473 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2131 /* Check for low surrogate and combine the two to 2474 i++;
2132 form a UCS4 value */ 2475
2133 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2476 /* Encode UCS4 Unicode ordinals */
2134 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2477 *p++ = (char)(0xf0 | (ch >> 18));
2135 i++; 2478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2136 goto encodeUCS4;
2137 }
2138 /* Fall through: handles isolated high surrogates */
2139 }
2140 *p++ = (char)(0xe0 | (ch >> 12));
2141 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2142 *p++ = (char)(0x80 | (ch & 0x3f)); 2480 *p++ = (char)(0x80 | (ch & 0x3f));
2143 continue; 2481 } else {
2144 } 2482 #endif
2145 encodeUCS4: 2483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 Py_ssize_t repsize, k;
2486 rep = unicode_encode_call_errorhandler
2487 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2488 s, size, &exc, i-1, i, &newpos);
2489 if (!rep)
2490 goto error;
2491
2492 if (PyBytes_Check(rep))
2493 repsize = PyBytes_GET_SIZE(rep);
2494 else
2495 repsize = PyUnicode_GET_SIZE(rep);
2496
2497 if (repsize > 4) {
2498 Py_ssize_t offset;
2499
2500 if (result == NULL)
2501 offset = p - stackbuf;
2502 else
2503 offset = p - PyBytes_AS_STRING(result);
2504
2505 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2506 /* integer overflow */
2507 PyErr_NoMemory();
2508 goto error;
2509 }
2510 nallocated += repsize - 4;
2511 if (result != NULL) {
2512 if (_PyBytes_Resize(&result, nallocated) < 0)
2513 goto error;
2514 } else {
2515 result = PyBytes_FromStringAndSize(NULL, nallocated);
2516 if (result == NULL)
2517 goto error;
2518 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2519 }
2520 p = PyBytes_AS_STRING(result) + offset;
2521 }
2522
2523 if (PyBytes_Check(rep)) {
2524 char *prep = PyBytes_AS_STRING(rep);
2525 for(k = repsize; k > 0; k--)
2526 *p++ = *prep++;
2527 } else /* rep is unicode */ {
2528 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2529 Py_UNICODE c;
2530
2531 for(k=0; k<repsize; k++) {
2532 c = prep[k];
2533 if (0x80 <= c) {
2534 raise_encode_exception(&exc, "utf-8", s, size,
2535 i-1, i, "surrogates not allowed");
2536 goto error;
2537 }
2538 *p++ = (char)prep[k];
2539 }
2540 }
2541 Py_DECREF(rep);
2542 #ifndef Py_UNICODE_WIDE
2543 }
2544 #endif
2545 } else if (ch < 0x10000) {
2546 *p++ = (char)(0xe0 | (ch >> 12));
2547 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2548 *p++ = (char)(0x80 | (ch & 0x3f));
2549 } else /* ch >= 0x10000 */ {
2146 /* Encode UCS4 Unicode ordinals */ 2550 /* Encode UCS4 Unicode ordinals */
2147 *p++ = (char)(0xf0 | (ch >> 18)); 2551 *p++ = (char)(0xf0 | (ch >> 18));
2148 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2552 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2149 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2553 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150 *p++ = (char)(0x80 | (ch & 0x3f)); 2554 *p++ = (char)(0x80 | (ch & 0x3f));
2151 } 2555 }
2152 } 2556 }
2153 2557
2154 if (v == NULL) { 2558 if (result == NULL) {
2155 /* This was stack allocated. */ 2559 /* This was stack allocated. */
2156 nneeded = p - stackbuf; 2560 nneeded = p - stackbuf;
2157 assert(nneeded <= nallocated); 2561 assert(nneeded <= nallocated);
2158 v = PyString_FromStringAndSize(stackbuf, nneeded); 2562 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2159 } 2563 }
2160 else { 2564 else {
2161 /* Cut back to size actually needed. */ 2565 /* Cut back to size actually needed. */
2162 nneeded = p - PyString_AS_STRING(v); 2566 nneeded = p - PyBytes_AS_STRING(result);
2163 assert(nneeded <= nallocated); 2567 assert(nneeded <= nallocated);
2164 if (_PyString_Resize(&v, nneeded)) 2568 _PyBytes_Resize(&result, nneeded);
2165 return NULL; 2569 }
2166 } 2570 Py_XDECREF(errorHandler);
2167 return v; 2571 Py_XDECREF(exc);
2572 return result;
2573 error:
2574 Py_XDECREF(errorHandler);
2575 Py_XDECREF(exc);
2576 Py_XDECREF(result);
2577 return NULL;
2168 2578
2169 #undef MAX_SHORT_UNICHARS 2579 #undef MAX_SHORT_UNICHARS
2170 } 2580 }
2171 2581
2172 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2582 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2173 { 2583 {
2174 if (!PyUnicode_Check(unicode)) { 2584 if (!PyUnicode_Check(unicode)) {
2175 PyErr_BadArgument(); 2585 PyErr_BadArgument();
2176 return NULL; 2586 return NULL;
2177 } 2587 }
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after
2323 else 2733 else
2324 #endif 2734 #endif
2325 *p++ = ch; 2735 *p++ = ch;
2326 q += 4; 2736 q += 4;
2327 continue; 2737 continue;
2328 utf32Error: 2738 utf32Error:
2329 outpos = p-PyUnicode_AS_UNICODE(unicode); 2739 outpos = p-PyUnicode_AS_UNICODE(unicode);
2330 if (unicode_decode_call_errorhandler( 2740 if (unicode_decode_call_errorhandler(
2331 errors, &errorHandler, 2741 errors, &errorHandler,
2332 "utf32", errmsg, 2742 "utf32", errmsg,
2333 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2743 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2334 &unicode, &outpos, &p)) 2744 &unicode, &outpos, &p))
2335 goto onError; 2745 goto onError;
2336 } 2746 }
2337 2747
2338 if (byteorder) 2748 if (byteorder)
2339 *byteorder = bo; 2749 *byteorder = bo;
2340 2750
2341 if (consumed) 2751 if (consumed)
2342 *consumed = (const char *)q-starts; 2752 *consumed = (const char *)q-starts;
2343 2753
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
2391 #ifndef Py_UNICODE_WIDE 2801 #ifndef Py_UNICODE_WIDE
2392 for (i = pairs = 0; i < size-1; i++) 2802 for (i = pairs = 0; i < size-1; i++)
2393 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2803 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2394 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2804 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2395 pairs++; 2805 pairs++;
2396 #endif 2806 #endif
2397 nsize = (size - pairs + (byteorder == 0)); 2807 nsize = (size - pairs + (byteorder == 0));
2398 bytesize = nsize * 4; 2808 bytesize = nsize * 4;
2399 if (bytesize / 4 != nsize) 2809 if (bytesize / 4 != nsize)
2400 return PyErr_NoMemory(); 2810 return PyErr_NoMemory();
2401 v = PyString_FromStringAndSize(NULL, bytesize); 2811 v = PyBytes_FromStringAndSize(NULL, bytesize);
2402 if (v == NULL) 2812 if (v == NULL)
2403 return NULL; 2813 return NULL;
2404 2814
2405 p = (unsigned char *)PyString_AS_STRING(v); 2815 p = (unsigned char *)PyBytes_AS_STRING(v);
2406 if (byteorder == 0) 2816 if (byteorder == 0)
2407 STORECHAR(0xFEFF); 2817 STORECHAR(0xFEFF);
2408 if (size == 0) 2818 if (size == 0)
2409 return v; 2819 goto done;
2410 2820
2411 if (byteorder == -1) { 2821 if (byteorder == -1) {
2412 /* force LE */ 2822 /* force LE */
2413 iorder[0] = 0; 2823 iorder[0] = 0;
2414 iorder[1] = 1; 2824 iorder[1] = 1;
2415 iorder[2] = 2; 2825 iorder[2] = 2;
2416 iorder[3] = 3; 2826 iorder[3] = 3;
2417 } 2827 }
2418 else if (byteorder == 1) { 2828 else if (byteorder == 1) {
2419 /* force BE */ 2829 /* force BE */
(...skipping 10 matching lines...) Expand all
2430 Py_UCS4 ch2 = *s; 2840 Py_UCS4 ch2 = *s;
2431 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2841 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2432 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2842 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2433 s++; 2843 s++;
2434 size--; 2844 size--;
2435 } 2845 }
2436 } 2846 }
2437 #endif 2847 #endif
2438 STORECHAR(ch); 2848 STORECHAR(ch);
2439 } 2849 }
2850
2851 done:
2440 return v; 2852 return v;
2441 #undef STORECHAR 2853 #undef STORECHAR
2442 } 2854 }
2443 2855
2444 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2856 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2445 { 2857 {
2446 if (!PyUnicode_Check(unicode)) { 2858 if (!PyUnicode_Check(unicode)) {
2447 PyErr_BadArgument(); 2859 PyErr_BadArgument();
2448 return NULL; 2860 return NULL;
2449 } 2861 }
2450 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2862 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2451 PyUnicode_GET_SIZE(unicode), 2863 PyUnicode_GET_SIZE(unicode),
2452 NULL, 2864 NULL,
2453 0); 2865 0);
2454 } 2866 }
2455 2867
2456 /* --- UTF-16 Codec ------------------------------------------------------- */ 2868 /* --- UTF-16 Codec ------------------------------------------------------- */
2457 2869
2458 PyObject * 2870 PyObject *
2459 PyUnicode_DecodeUTF16(const char *s, 2871 PyUnicode_DecodeUTF16(const char *s,
2460 Py_ssize_t size, 2872 Py_ssize_t size,
2461 const char *errors, 2873 const char *errors,
2462 int *byteorder) 2874 int *byteorder)
2463 { 2875 {
2464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2876 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2465 } 2877 }
2878
2879 /* Two masks for fast checking of whether a C 'long' may contain
2880 UTF16-encoded surrogate characters. This is an efficient heuristic,
2881 assuming that non-surrogate characters with a code point >= 0x8000 are
2882 rare in most input.
2883 FAST_CHAR_MASK is used when the input is in native byte ordering,
2884 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
2885 */
2886 #if (SIZEOF_LONG == 8)
2887 # define FAST_CHAR_MASK 0x8000800080008000L
2888 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2889 #elif (SIZEOF_LONG == 4)
2890 # define FAST_CHAR_MASK 0x80008000L
2891 # define SWAPPED_FAST_CHAR_MASK 0x00800080L
2892 #else
2893 # error C 'long' size should be either 4 or 8!
2894 #endif
2466 2895
2467 PyObject * 2896 PyObject *
2468 PyUnicode_DecodeUTF16Stateful(const char *s, 2897 PyUnicode_DecodeUTF16Stateful(const char *s,
2469 Py_ssize_t size, 2898 Py_ssize_t size,
2470 const char *errors, 2899 const char *errors,
2471 int *byteorder, 2900 int *byteorder,
2472 Py_ssize_t *consumed) 2901 Py_ssize_t *consumed)
2473 { 2902 {
2474 const char *starts = s; 2903 const char *starts = s;
2475 Py_ssize_t startinpos; 2904 Py_ssize_t startinpos;
2476 Py_ssize_t endinpos; 2905 Py_ssize_t endinpos;
2477 Py_ssize_t outpos; 2906 Py_ssize_t outpos;
2478 PyUnicodeObject *unicode; 2907 PyUnicodeObject *unicode;
2479 Py_UNICODE *p; 2908 Py_UNICODE *p;
2480 const unsigned char *q, *e; 2909 const unsigned char *q, *e, *aligned_end;
2481 int bo = 0; /* assume native ordering by default */ 2910 int bo = 0; /* assume native ordering by default */
2911 int native_ordering = 0;
2482 const char *errmsg = ""; 2912 const char *errmsg = "";
2483 /* Offsets from q for retrieving byte pairs in the right order. */ 2913 /* Offsets from q for retrieving byte pairs in the right order. */
2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2914 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485 int ihi = 1, ilo = 0; 2915 int ihi = 1, ilo = 0;
2486 #else 2916 #else
2487 int ihi = 0, ilo = 1; 2917 int ihi = 0, ilo = 1;
2488 #endif 2918 #endif
2489 PyObject *errorHandler = NULL; 2919 PyObject *errorHandler = NULL;
2490 PyObject *exc = NULL; 2920 PyObject *exc = NULL;
2491 2921
2492 /* Note: size will always be longer than the resulting Unicode 2922 /* Note: size will always be longer than the resulting Unicode
2493 character count */ 2923 character count */
2494 unicode = _PyUnicode_New(size); 2924 unicode = _PyUnicode_New(size);
2495 if (!unicode) 2925 if (!unicode)
2496 return NULL; 2926 return NULL;
2497 if (size == 0) 2927 if (size == 0)
2498 return (PyObject *)unicode; 2928 return (PyObject *)unicode;
2499 2929
2500 /* Unpack UTF-16 encoded data */ 2930 /* Unpack UTF-16 encoded data */
2501 p = unicode->str; 2931 p = unicode->str;
2502 q = (unsigned char *)s; 2932 q = (unsigned char *)s;
2503 e = q + size; 2933 e = q + size - 1;
2504 2934
2505 if (byteorder) 2935 if (byteorder)
2506 bo = *byteorder; 2936 bo = *byteorder;
2507 2937
2508 /* Check for BOM marks (U+FEFF) in the input and adjust current 2938 /* Check for BOM marks (U+FEFF) in the input and adjust current
2509 byte order setting accordingly. In native mode, the leading BOM 2939 byte order setting accordingly. In native mode, the leading BOM
2510 mark is skipped, in all other modes, it is copied to the output 2940 mark is skipped, in all other modes, it is copied to the output
2511 stream as-is (giving a ZWNBSP character). */ 2941 stream as-is (giving a ZWNBSP character). */
2512 if (bo == 0) { 2942 if (bo == 0) {
2513 if (size >= 2) { 2943 if (size >= 2) {
(...skipping 23 matching lines...) Expand all
2537 if (bo == -1) { 2967 if (bo == -1) {
2538 /* force LE */ 2968 /* force LE */
2539 ihi = 1; 2969 ihi = 1;
2540 ilo = 0; 2970 ilo = 0;
2541 } 2971 }
2542 else if (bo == 1) { 2972 else if (bo == 1) {
2543 /* force BE */ 2973 /* force BE */
2544 ihi = 0; 2974 ihi = 0;
2545 ilo = 1; 2975 ilo = 1;
2546 } 2976 }
2547 2977 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2978 native_ordering = ilo < ihi;
2979 #else
2980 native_ordering = ilo > ihi;
2981 #endif
2982
2983 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
2548 while (q < e) { 2984 while (q < e) {
2549 Py_UNICODE ch; 2985 Py_UNICODE ch;
2550 /* remaining bytes at the end? (size should be even) */ 2986 /* First check for possible aligned read of a C 'long'. Unaligned
2551 if (e-q<2) { 2987 reads are more expensive, better to defer to another iteration. */
2552 if (consumed) 2988 if (!((size_t) q & LONG_PTR_MASK)) {
2989 /* Fast path for runs of non-surrogate chars. */
2990 register const unsigned char *_q = q;
2991 Py_UNICODE *_p = p;
2992 if (native_ordering) {
2993 /* Native ordering is simple: as long as the input cannot
2994 possibly contain a surrogate char, do an unrolled copy
2995 of several 16-bit code points to the target object.
2996 The non-surrogate check is done on several input bytes
2997 at a time (as many as a C 'long' can contain). */
2998 while (_q < aligned_end) {
2999 unsigned long data = * (unsigned long *) _q;
3000 if (data & FAST_CHAR_MASK)
3001 break;
3002 _p[0] = ((unsigned short *) _q)[0];
3003 _p[1] = ((unsigned short *) _q)[1];
3004 #if (SIZEOF_LONG == 8)
3005 _p[2] = ((unsigned short *) _q)[2];
3006 _p[3] = ((unsigned short *) _q)[3];
3007 #endif
3008 _q += SIZEOF_LONG;
3009 _p += SIZEOF_LONG / 2;
3010 }
3011 }
3012 else {
3013 /* Byteswapped ordering is similar, but we must decompose
3014 the copy bytewise, and take care of zero'ing out the
3015 upper bytes if the target object is in 32-bit units
3016 (that is, in UCS-4 builds). */
3017 while (_q < aligned_end) {
3018 unsigned long data = * (unsigned long *) _q;
3019 if (data & SWAPPED_FAST_CHAR_MASK)
3020 break;
3021 /* Zero upper bytes in UCS-4 builds */
3022 #if (Py_UNICODE_SIZE > 2)
3023 _p[0] = 0;
3024 _p[1] = 0;
3025 #if (SIZEOF_LONG == 8)
3026 _p[2] = 0;
3027 _p[3] = 0;
3028 #endif
3029 #endif
3030 /* Issue #4916; UCS-4 builds on big endian machines must
3031 fill the two last bytes of each 4-byte unit. */
3032 #if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3033 # define OFF 2
3034 #else
3035 # define OFF 0
3036 #endif
3037 ((unsigned char *) _p)[OFF + 1] = _q[0];
3038 ((unsigned char *) _p)[OFF + 0] = _q[1];
3039 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3040 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3041 #if (SIZEOF_LONG == 8)
3042 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3043 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3044 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3045 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3046 #endif
3047 #undef OFF
3048 _q += SIZEOF_LONG;
3049 _p += SIZEOF_LONG / 2;
3050 }
3051 }
3052 p = _p;
3053 q = _q;
3054 if (q >= e)
2553 break; 3055 break;
2554 errmsg = "truncated data";
2555 startinpos = ((const char *)q)-starts;
2556 endinpos = ((const char *)e)-starts;
2557 goto utf16Error;
2558 /* The remaining input chars are ignored if the callback
2559 chooses to skip the input */
2560 } 3056 }
2561 ch = (q[ihi] << 8) | q[ilo]; 3057 ch = (q[ihi] << 8) | q[ilo];
2562 3058
2563 q += 2; 3059 q += 2;
2564 3060
2565 if (ch < 0xD800 || ch > 0xDFFF) { 3061 if (ch < 0xD800 || ch > 0xDFFF) {
2566 *p++ = ch; 3062 *p++ = ch;
2567 continue; 3063 continue;
2568 } 3064 }
2569 3065
2570 /* UTF-16 code pair: */ 3066 /* UTF-16 code pair: */
2571 if (q >= e) { 3067 if (q > e) {
2572 errmsg = "unexpected end of data"; 3068 errmsg = "unexpected end of data";
2573 startinpos = (((const char *)q)-2)-starts; 3069 startinpos = (((const char *)q) - 2) - starts;
2574 endinpos = ((const char *)e)-starts; 3070 endinpos = ((const char *)e) + 1 - starts;
2575 goto utf16Error; 3071 goto utf16Error;
2576 } 3072 }
2577 if (0xD800 <= ch && ch <= 0xDBFF) { 3073 if (0xD800 <= ch && ch <= 0xDBFF) {
2578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3074 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2579 q += 2; 3075 q += 2;
2580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3076 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2581 #ifndef Py_UNICODE_WIDE 3077 #ifndef Py_UNICODE_WIDE
2582 *p++ = ch; 3078 *p++ = ch;
2583 *p++ = ch2; 3079 *p++ = ch2;
2584 #else 3080 #else
2585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3081 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2586 #endif 3082 #endif
2587 continue; 3083 continue;
2588 } 3084 }
2589 else { 3085 else {
2590 errmsg = "illegal UTF-16 surrogate"; 3086 errmsg = "illegal UTF-16 surrogate";
2591 startinpos = (((const char *)q)-4)-starts; 3087 startinpos = (((const char *)q)-4)-starts;
2592 endinpos = startinpos+2; 3088 endinpos = startinpos+2;
2593 goto utf16Error; 3089 goto utf16Error;
2594 } 3090 }
2595 3091
2596 } 3092 }
2597 errmsg = "illegal encoding"; 3093 errmsg = "illegal encoding";
2598 startinpos = (((const char *)q)-2)-starts; 3094 startinpos = (((const char *)q)-2)-starts;
2599 endinpos = startinpos+2; 3095 endinpos = startinpos+2;
2600 /* Fall through to report the error */ 3096 /* Fall through to report the error */
2601 3097
2602 utf16Error: 3098 utf16Error:
2603 outpos = p-PyUnicode_AS_UNICODE(unicode); 3099 outpos = p - PyUnicode_AS_UNICODE(unicode);
2604 if (unicode_decode_call_errorhandler( 3100 if (unicode_decode_call_errorhandler(
2605 errors, &errorHandler, 3101 errors,
3102 &errorHandler,
2606 "utf16", errmsg, 3103 "utf16", errmsg,
2607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 3104 &starts,
2608 &unicode, &outpos, &p)) 3105 (const char **)&e,
3106 &startinpos,
3107 &endinpos,
3108 &exc,
3109 (const char **)&q,
3110 &unicode,
3111 &outpos,
3112 &p))
2609 goto onError; 3113 goto onError;
3114 }
3115 /* remaining byte at the end? (size should be even) */
3116 if (e == q) {
3117 if (!consumed) {
3118 errmsg = "truncated data";
3119 startinpos = ((const char *)q) - starts;
3120 endinpos = ((const char *)e) + 1 - starts;
3121 outpos = p - PyUnicode_AS_UNICODE(unicode);
3122 if (unicode_decode_call_errorhandler(
3123 errors,
3124 &errorHandler,
3125 "utf16", errmsg,
3126 &starts,
3127 (const char **)&e,
3128 &startinpos,
3129 &endinpos,
3130 &exc,
3131 (const char **)&q,
3132 &unicode,
3133 &outpos,
3134 &p))
3135 goto onError;
3136 /* The remaining input chars are ignored if the callback
3137 chooses to skip the input */
3138 }
2610 } 3139 }
2611 3140
2612 if (byteorder) 3141 if (byteorder)
2613 *byteorder = bo; 3142 *byteorder = bo;
2614 3143
2615 if (consumed) 3144 if (consumed)
2616 *consumed = (const char *)q-starts; 3145 *consumed = (const char *)q-starts;
2617 3146
2618 /* Adjust length */ 3147 /* Adjust length */
2619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2620 goto onError; 3149 goto onError;
2621 3150
2622 Py_XDECREF(errorHandler); 3151 Py_XDECREF(errorHandler);
2623 Py_XDECREF(exc); 3152 Py_XDECREF(exc);
2624 return (PyObject *)unicode; 3153 return (PyObject *)unicode;
2625 3154
2626 onError: 3155 onError:
2627 Py_DECREF(unicode); 3156 Py_DECREF(unicode);
2628 Py_XDECREF(errorHandler); 3157 Py_XDECREF(errorHandler);
2629 Py_XDECREF(exc); 3158 Py_XDECREF(exc);
2630 return NULL; 3159 return NULL;
2631 } 3160 }
3161
3162 #undef FAST_CHAR_MASK
3163 #undef SWAPPED_FAST_CHAR_MASK
2632 3164
2633 PyObject * 3165 PyObject *
2634 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3166 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2635 Py_ssize_t size, 3167 Py_ssize_t size,
2636 const char *errors, 3168 const char *errors,
2637 int byteorder) 3169 int byteorder)
2638 { 3170 {
2639 PyObject *v; 3171 PyObject *v;
2640 unsigned char *p; 3172 unsigned char *p;
2641 Py_ssize_t nsize, bytesize; 3173 Py_ssize_t nsize, bytesize;
(...skipping 22 matching lines...) Expand all
2664 pairs++; 3196 pairs++;
2665 #endif 3197 #endif
2666 /* 2 * (size + pairs + (byteorder == 0)) */ 3198 /* 2 * (size + pairs + (byteorder == 0)) */
2667 if (size > PY_SSIZE_T_MAX || 3199 if (size > PY_SSIZE_T_MAX ||
2668 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3200 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2669 return PyErr_NoMemory(); 3201 return PyErr_NoMemory();
2670 nsize = size + pairs + (byteorder == 0); 3202 nsize = size + pairs + (byteorder == 0);
2671 bytesize = nsize * 2; 3203 bytesize = nsize * 2;
2672 if (bytesize / 2 != nsize) 3204 if (bytesize / 2 != nsize)
2673 return PyErr_NoMemory(); 3205 return PyErr_NoMemory();
2674 v = PyString_FromStringAndSize(NULL, bytesize); 3206 v = PyBytes_FromStringAndSize(NULL, bytesize);
2675 if (v == NULL) 3207 if (v == NULL)
2676 return NULL; 3208 return NULL;
2677 3209
2678 p = (unsigned char *)PyString_AS_STRING(v); 3210 p = (unsigned char *)PyBytes_AS_STRING(v);
2679 if (byteorder == 0) 3211 if (byteorder == 0)
2680 STORECHAR(0xFEFF); 3212 STORECHAR(0xFEFF);
2681 if (size == 0) 3213 if (size == 0)
2682 return v; 3214 goto done;
2683 3215
2684 if (byteorder == -1) { 3216 if (byteorder == -1) {
2685 /* force LE */ 3217 /* force LE */
2686 ihi = 1; 3218 ihi = 1;
2687 ilo = 0; 3219 ilo = 0;
2688 } 3220 }
2689 else if (byteorder == 1) { 3221 else if (byteorder == 1) {
2690 /* force BE */ 3222 /* force BE */
2691 ihi = 0; 3223 ihi = 0;
2692 ilo = 1; 3224 ilo = 1;
2693 } 3225 }
2694 3226
2695 while (size-- > 0) { 3227 while (size-- > 0) {
2696 Py_UNICODE ch = *s++; 3228 Py_UNICODE ch = *s++;
2697 Py_UNICODE ch2 = 0; 3229 Py_UNICODE ch2 = 0;
2698 #ifdef Py_UNICODE_WIDE 3230 #ifdef Py_UNICODE_WIDE
2699 if (ch >= 0x10000) { 3231 if (ch >= 0x10000) {
2700 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3232 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2701 ch = 0xD800 | ((ch-0x10000) >> 10); 3233 ch = 0xD800 | ((ch-0x10000) >> 10);
2702 } 3234 }
2703 #endif 3235 #endif
2704 STORECHAR(ch); 3236 STORECHAR(ch);
2705 if (ch2) 3237 if (ch2)
2706 STORECHAR(ch2); 3238 STORECHAR(ch2);
2707 } 3239 }
3240
3241 done:
2708 return v; 3242 return v;
2709 #undef STORECHAR 3243 #undef STORECHAR
2710 } 3244 }
2711 3245
2712 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3246 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2713 { 3247 {
2714 if (!PyUnicode_Check(unicode)) { 3248 if (!PyUnicode_Check(unicode)) {
2715 PyErr_BadArgument(); 3249 PyErr_BadArgument();
2716 return NULL; 3250 return NULL;
2717 } 3251 }
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
2818 digits = 8; 3352 digits = 8;
2819 message = "truncated \\UXXXXXXXX escape"; 3353 message = "truncated \\UXXXXXXXX escape";
2820 hexescape: 3354 hexescape:
2821 chr = 0; 3355 chr = 0;
2822 outpos = p-PyUnicode_AS_UNICODE(v); 3356 outpos = p-PyUnicode_AS_UNICODE(v);
2823 if (s+digits>end) { 3357 if (s+digits>end) {
2824 endinpos = size; 3358 endinpos = size;
2825 if (unicode_decode_call_errorhandler( 3359 if (unicode_decode_call_errorhandler(
2826 errors, &errorHandler, 3360 errors, &errorHandler,
2827 "unicodeescape", "end of string in escape sequence", 3361 "unicodeescape", "end of string in escape sequence",
2828 starts, size, &startinpos, &endinpos, &exc, &s, 3362 &starts, &end, &startinpos, &endinpos, &exc, &s,
2829 &v, &outpos, &p)) 3363 &v, &outpos, &p))
2830 goto onError; 3364 goto onError;
2831 goto nextByte; 3365 goto nextByte;
2832 } 3366 }
2833 for (i = 0; i < digits; ++i) { 3367 for (i = 0; i < digits; ++i) {
2834 c = (unsigned char) s[i]; 3368 c = (unsigned char) s[i];
2835 if (!isxdigit(c)) { 3369 if (!ISXDIGIT(c)) {
2836 endinpos = (s+i+1)-starts; 3370 endinpos = (s+i+1)-starts;
2837 if (unicode_decode_call_errorhandler( 3371 if (unicode_decode_call_errorhandler(
2838 errors, &errorHandler, 3372 errors, &errorHandler,
2839 "unicodeescape", message, 3373 "unicodeescape", message,
2840 starts, size, &startinpos, &endinpos, &exc, &s, 3374 &starts, &end, &startinpos, &endinpos, &exc, &s,
2841 &v, &outpos, &p)) 3375 &v, &outpos, &p))
2842 goto onError; 3376 goto onError;
2843 goto nextByte; 3377 goto nextByte;
2844 } 3378 }
2845 chr = (chr<<4) & ~0xF; 3379 chr = (chr<<4) & ~0xF;
2846 if (c >= '0' && c <= '9') 3380 if (c >= '0' && c <= '9')
2847 chr += c - '0'; 3381 chr += c - '0';
2848 else if (c >= 'a' && c <= 'f') 3382 else if (c >= 'a' && c <= 'f')
2849 chr += 10 + c - 'a'; 3383 chr += 10 + c - 'a';
2850 else 3384 else
(...skipping 18 matching lines...) Expand all
2869 chr -= 0x10000L; 3403 chr -= 0x10000L;
2870 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3404 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2871 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3405 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2872 #endif 3406 #endif
2873 } else { 3407 } else {
2874 endinpos = s-starts; 3408 endinpos = s-starts;
2875 outpos = p-PyUnicode_AS_UNICODE(v); 3409 outpos = p-PyUnicode_AS_UNICODE(v);
2876 if (unicode_decode_call_errorhandler( 3410 if (unicode_decode_call_errorhandler(
2877 errors, &errorHandler, 3411 errors, &errorHandler,
2878 "unicodeescape", "illegal Unicode character", 3412 "unicodeescape", "illegal Unicode character",
2879 starts, size, &startinpos, &endinpos, &exc, &s, 3413 &starts, &end, &startinpos, &endinpos, &exc, &s,
2880 &v, &outpos, &p)) 3414 &v, &outpos, &p))
2881 goto onError; 3415 goto onError;
2882 } 3416 }
2883 break; 3417 break;
2884 3418
2885 /* \N{name} */ 3419 /* \N{name} */
2886 case 'N': 3420 case 'N':
2887 message = "malformed \\N character escape"; 3421 message = "malformed \\N character escape";
2888 if (ucnhash_CAPI == NULL) { 3422 if (ucnhash_CAPI == NULL) {
2889 /* load the unicode data module */ 3423 /* load the unicode data module */
(...skipping 12 matching lines...) Expand all
2902 s++; 3436 s++;
2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3437 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904 goto store; 3438 goto store;
2905 } 3439 }
2906 } 3440 }
2907 endinpos = s-starts; 3441 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v); 3442 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler( 3443 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler, 3444 errors, &errorHandler,
2911 "unicodeescape", message, 3445 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s, 3446 &starts, &end, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p)) 3447 &v, &outpos, &p))
2914 goto onError; 3448 goto onError;
2915 break; 3449 break;
2916 3450
2917 default: 3451 default:
2918 if (s > end) { 3452 if (s > end) {
2919 message = "\\ at end of string"; 3453 message = "\\ at end of string";
2920 s--; 3454 s--;
2921 endinpos = s-starts; 3455 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v); 3456 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler( 3457 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler, 3458 errors, &errorHandler,
2925 "unicodeescape", message, 3459 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s, 3460 &starts, &end, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p)) 3461 &v, &outpos, &p))
2928 goto onError; 3462 goto onError;
2929 } 3463 }
2930 else { 3464 else {
2931 *p++ = '\\'; 3465 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1]; 3466 *p++ = (unsigned char)s[-1];
2933 } 3467 }
2934 break; 3468 break;
2935 } 3469 }
2936 nextByte: 3470 nextByte:
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
2974 3508
2975 while (size-- > 0) { 3509 while (size-- > 0) {
2976 if (*s == ch) 3510 if (*s == ch)
2977 return s; 3511 return s;
2978 s++; 3512 s++;
2979 } 3513 }
2980 3514
2981 return NULL; 3515 return NULL;
2982 } 3516 }
2983 3517
2984 static 3518 static const char *hexdigits = "0123456789abcdef";
2985 PyObject *unicodeescape_string(const Py_UNICODE *s, 3519
2986 Py_ssize_t size, 3520 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2987 int quotes) 3521 Py_ssize_t size)
2988 { 3522 {
2989 PyObject *repr; 3523 PyObject *repr;
2990 char *p; 3524 char *p;
2991 3525
2992 static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE 3526 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10; 3527 const Py_ssize_t expandsize = 10;
2995 #else 3528 #else
2996 const Py_ssize_t expandsize = 6; 3529 const Py_ssize_t expandsize = 6;
2997 #endif 3530 #endif
2998 3531
2999 /* XXX(nnorwitz): rather than over-allocating, it would be 3532 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the 3533 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size. 3534 first N-chars of the string and allocate based on that size.
3002 */ 3535 */
3003 /* Initial allocation is based on the longest-possible unichr 3536 /* Initial allocation is based on the longest-possible unichr
3004 escape. 3537 escape.
3005 3538
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3539 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In 3540 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr 3541 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow 3542 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape. 3543 (UTF-16) builds it's not the longest unichr escape.
3011 3544
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3545 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr 3546 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape. 3547 escape.
3015 */ 3548 */
3016 3549
3550 if (size == 0)
3551 return PyBytes_FromStringAndSize(NULL, 0);
3552
3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3553 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018 return PyErr_NoMemory(); 3554 return PyErr_NoMemory();
3019 3555
3020 repr = PyString_FromStringAndSize(NULL, 3556 repr = PyBytes_FromStringAndSize(NULL,
3021 2 3557 2
3022 + expandsize*size 3558 + expandsize*size
3023 + 1); 3559 + 1);
3024 if (repr == NULL) 3560 if (repr == NULL)
3025 return NULL; 3561 return NULL;
3026 3562
3027 p = PyString_AS_STRING(repr); 3563 p = PyBytes_AS_STRING(repr);
3028 3564
3029 if (quotes) {
3030 *p++ = 'u';
3031 *p++ = (findchar(s, size, '\'') &&
3032 !findchar(s, size, '"')) ? '"' : '\'';
3033 }
3034 while (size-- > 0) { 3565 while (size-- > 0) {
3035 Py_UNICODE ch = *s++; 3566 Py_UNICODE ch = *s++;
3036 3567
3037 /* Escape quotes and backslashes */ 3568 /* Escape backslashes */
3038 if ((quotes && 3569 if (ch == '\\') {
3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040 *p++ = '\\'; 3570 *p++ = '\\';
3041 *p++ = (char) ch; 3571 *p++ = (char) ch;
3042 continue; 3572 continue;
3043 } 3573 }
3044 3574
3045 #ifdef Py_UNICODE_WIDE 3575 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */ 3576 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) { 3577 else if (ch >= 0x10000) {
3048 *p++ = '\\'; 3578 *p++ = '\\';
3049 *p++ = 'U'; 3579 *p++ = 'U';
3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 3580 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 3581 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 3582 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 3583 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 3584 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 3585 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 3586 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3057 *p++ = hexdigit[ch & 0x0000000F]; 3587 *p++ = hexdigits[ch & 0x0000000F];
3058 continue; 3588 continue;
3059 } 3589 }
3060 #else 3590 #else
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) { 3592 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2; 3593 Py_UNICODE ch2;
3064 Py_UCS4 ucs; 3594 Py_UCS4 ucs;
3065 3595
3066 ch2 = *s++; 3596 ch2 = *s++;
3067 size--; 3597 size--;
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\'; 3600 *p++ = '\\';
3071 *p++ = 'U'; 3601 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 3602 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 3603 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 3604 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 3605 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 3606 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 3607 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 3608 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F]; 3609 *p++ = hexdigits[ucs & 0x0000000F];
3080 continue; 3610 continue;
3081 } 3611 }
3082 /* Fall through: isolated surrogates are copied as-is */ 3612 /* Fall through: isolated surrogates are copied as-is */
3083 s--; 3613 s--;
3084 size++; 3614 size++;
3085 } 3615 }
3086 #endif 3616 #endif
3087 3617
3088 /* Map 16-bit characters to '\uxxxx' */ 3618 /* Map 16-bit characters to '\uxxxx' */
3089 if (ch >= 256) { 3619 if (ch >= 256) {
3090 *p++ = '\\'; 3620 *p++ = '\\';
3091 *p++ = 'u'; 3621 *p++ = 'u';
3092 *p++ = hexdigit[(ch >> 12) & 0x000F]; 3622 *p++ = hexdigits[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F]; 3623 *p++ = hexdigits[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3624 *p++ = hexdigits[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F]; 3625 *p++ = hexdigits[ch & 0x000F];
3096 } 3626 }
3097 3627
3098 /* Map special whitespace to '\t', \n', '\r' */ 3628 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') { 3629 else if (ch == '\t') {
3100 *p++ = '\\'; 3630 *p++ = '\\';
3101 *p++ = 't'; 3631 *p++ = 't';
3102 } 3632 }
3103 else if (ch == '\n') { 3633 else if (ch == '\n') {
3104 *p++ = '\\'; 3634 *p++ = '\\';
3105 *p++ = 'n'; 3635 *p++ = 'n';
3106 } 3636 }
3107 else if (ch == '\r') { 3637 else if (ch == '\r') {
3108 *p++ = '\\'; 3638 *p++ = '\\';
3109 *p++ = 'r'; 3639 *p++ = 'r';
3110 } 3640 }
3111 3641
3112 /* Map non-printable US ASCII to '\xhh' */ 3642 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch < ' ' || ch >= 0x7F) { 3643 else if (ch < ' ' || ch >= 0x7F) {
3114 *p++ = '\\'; 3644 *p++ = '\\';
3115 *p++ = 'x'; 3645 *p++ = 'x';
3116 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3646 *p++ = hexdigits[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F]; 3647 *p++ = hexdigits[ch & 0x000F];
3118 } 3648 }
3119 3649
3120 /* Copy everything else as-is */ 3650 /* Copy everything else as-is */
3121 else 3651 else
3122 *p++ = (char) ch; 3652 *p++ = (char) ch;
3123 } 3653 }
3124 if (quotes) 3654
3125 *p++ = PyString_AS_STRING(repr)[1]; 3655 assert(p - PyBytes_AS_STRING(repr) > 0);
3126 3656 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3127 *p = '\0';
3128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129 return NULL; 3657 return NULL;
3130 return repr; 3658 return repr;
3131 } 3659 }
3132 3660
3133 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3134 Py_ssize_t size)
3135 {
3136 return unicodeescape_string(s, size, 0);
3137 }
3138
3139 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3661 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140 { 3662 {
3663 PyObject *s;
3141 if (!PyUnicode_Check(unicode)) { 3664 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument(); 3665 PyErr_BadArgument();
3143 return NULL; 3666 return NULL;
3144 } 3667 }
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3146 PyUnicode_GET_SIZE(unicode)); 3669 PyUnicode_GET_SIZE(unicode));
3670 return s;
3147 } 3671 }
3148 3672
3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 3673 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150 3674
3151 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3675 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3152 Py_ssize_t size, 3676 Py_ssize_t size,
3153 const char *errors) 3677 const char *errors)
3154 { 3678 {
3155 const char *starts = s; 3679 const char *starts = s;
3156 Py_ssize_t startinpos; 3680 Py_ssize_t startinpos;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
3201 continue; 3725 continue;
3202 } 3726 }
3203 p--; 3727 p--;
3204 count = *s=='u' ? 4 : 8; 3728 count = *s=='u' ? 4 : 8;
3205 s++; 3729 s++;
3206 3730
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3731 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos = p-PyUnicode_AS_UNICODE(v); 3732 outpos = p-PyUnicode_AS_UNICODE(v);
3209 for (x = 0, i = 0; i < count; ++i, ++s) { 3733 for (x = 0, i = 0; i < count; ++i, ++s) {
3210 c = (unsigned char)*s; 3734 c = (unsigned char)*s;
3211 if (!isxdigit(c)) { 3735 if (!ISXDIGIT(c)) {
3212 endinpos = s-starts; 3736 endinpos = s-starts;
3213 if (unicode_decode_call_errorhandler( 3737 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler, 3738 errors, &errorHandler,
3215 "rawunicodeescape", "truncated \\uXXXX", 3739 "rawunicodeescape", "truncated \\uXXXX",
3216 starts, size, &startinpos, &endinpos, &exc, &s, 3740 &starts, &end, &startinpos, &endinpos, &exc, &s,
3217 &v, &outpos, &p)) 3741 &v, &outpos, &p))
3218 goto onError; 3742 goto onError;
3219 goto nextByte; 3743 goto nextByte;
3220 } 3744 }
3221 x = (x<<4) & ~0xF; 3745 x = (x<<4) & ~0xF;
3222 if (c >= '0' && c <= '9') 3746 if (c >= '0' && c <= '9')
3223 x += c - '0'; 3747 x += c - '0';
3224 else if (c >= 'a' && c <= 'f') 3748 else if (c >= 'a' && c <= 'f')
3225 x += 10 + c - 'a'; 3749 x += 10 + c - 'a';
3226 else 3750 else
(...skipping 11 matching lines...) Expand all
3238 x -= 0x10000L; 3762 x -= 0x10000L;
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3763 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3764 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3241 #endif 3765 #endif
3242 } else { 3766 } else {
3243 endinpos = s-starts; 3767 endinpos = s-starts;
3244 outpos = p-PyUnicode_AS_UNICODE(v); 3768 outpos = p-PyUnicode_AS_UNICODE(v);
3245 if (unicode_decode_call_errorhandler( 3769 if (unicode_decode_call_errorhandler(
3246 errors, &errorHandler, 3770 errors, &errorHandler,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3771 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3248 starts, size, &startinpos, &endinpos, &exc, &s, 3772 &starts, &end, &startinpos, &endinpos, &exc, &s,
3249 &v, &outpos, &p)) 3773 &v, &outpos, &p))
3250 goto onError; 3774 goto onError;
3251 } 3775 }
3252 nextByte: 3776 nextByte:
3253 ; 3777 ;
3254 } 3778 }
3255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3256 goto onError; 3780 goto onError;
3257 Py_XDECREF(errorHandler); 3781 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc); 3782 Py_XDECREF(exc);
3259 return (PyObject *)v; 3783 return (PyObject *)v;
3260 3784
3261 onError: 3785 onError:
3262 Py_XDECREF(v); 3786 Py_XDECREF(v);
3263 Py_XDECREF(errorHandler); 3787 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc); 3788 Py_XDECREF(exc);
3265 return NULL; 3789 return NULL;
3266 } 3790 }
3267 3791
3268 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3792 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3269 Py_ssize_t size) 3793 Py_ssize_t size)
3270 { 3794 {
3271 PyObject *repr; 3795 PyObject *repr;
3272 char *p; 3796 char *p;
3273 char *q; 3797 char *q;
3274 3798
3275 static const char *hexdigit = "0123456789abcdef";
3276 #ifdef Py_UNICODE_WIDE 3799 #ifdef Py_UNICODE_WIDE
3277 const Py_ssize_t expandsize = 10; 3800 const Py_ssize_t expandsize = 10;
3278 #else 3801 #else
3279 const Py_ssize_t expandsize = 6; 3802 const Py_ssize_t expandsize = 6;
3280 #endif 3803 #endif
3281 3804
3282 if (size > PY_SSIZE_T_MAX / expandsize) 3805 if (size > PY_SSIZE_T_MAX / expandsize)
3283 return PyErr_NoMemory(); 3806 return PyErr_NoMemory();
3284 3807
3285 repr = PyString_FromStringAndSize(NULL, expandsize * size); 3808 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3286 if (repr == NULL) 3809 if (repr == NULL)
3287 return NULL; 3810 return NULL;
3288 if (size == 0) 3811 if (size == 0)
3289 return repr; 3812 return repr;
3290 3813
3291 p = q = PyString_AS_STRING(repr); 3814 p = q = PyBytes_AS_STRING(repr);
3292 while (size-- > 0) { 3815 while (size-- > 0) {
3293 Py_UNICODE ch = *s++; 3816 Py_UNICODE ch = *s++;
3294 #ifdef Py_UNICODE_WIDE 3817 #ifdef Py_UNICODE_WIDE
3295 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3818 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch >= 0x10000) { 3819 if (ch >= 0x10000) {
3297 *p++ = '\\'; 3820 *p++ = '\\';
3298 *p++ = 'U'; 3821 *p++ = 'U';
3299 *p++ = hexdigit[(ch >> 28) & 0xf]; 3822 *p++ = hexdigits[(ch >> 28) & 0xf];
3300 *p++ = hexdigit[(ch >> 24) & 0xf]; 3823 *p++ = hexdigits[(ch >> 24) & 0xf];
3301 *p++ = hexdigit[(ch >> 20) & 0xf]; 3824 *p++ = hexdigits[(ch >> 20) & 0xf];
3302 *p++ = hexdigit[(ch >> 16) & 0xf]; 3825 *p++ = hexdigits[(ch >> 16) & 0xf];
3303 *p++ = hexdigit[(ch >> 12) & 0xf]; 3826 *p++ = hexdigits[(ch >> 12) & 0xf];
3304 *p++ = hexdigit[(ch >> 8) & 0xf]; 3827 *p++ = hexdigits[(ch >> 8) & 0xf];
3305 *p++ = hexdigit[(ch >> 4) & 0xf]; 3828 *p++ = hexdigits[(ch >> 4) & 0xf];
3306 *p++ = hexdigit[ch & 15]; 3829 *p++ = hexdigits[ch & 15];
3307 } 3830 }
3308 else 3831 else
3309 #else 3832 #else
3310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3833 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch >= 0xD800 && ch < 0xDC00) { 3834 if (ch >= 0xD800 && ch < 0xDC00) {
3312 Py_UNICODE ch2; 3835 Py_UNICODE ch2;
3313 Py_UCS4 ucs; 3836 Py_UCS4 ucs;
3314 3837
3315 ch2 = *s++; 3838 ch2 = *s++;
3316 size--; 3839 size--;
3317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3840 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3841 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319 *p++ = '\\'; 3842 *p++ = '\\';
3320 *p++ = 'U'; 3843 *p++ = 'U';
3321 *p++ = hexdigit[(ucs >> 28) & 0xf]; 3844 *p++ = hexdigits[(ucs >> 28) & 0xf];
3322 *p++ = hexdigit[(ucs >> 24) & 0xf]; 3845 *p++ = hexdigits[(ucs >> 24) & 0xf];
3323 *p++ = hexdigit[(ucs >> 20) & 0xf]; 3846 *p++ = hexdigits[(ucs >> 20) & 0xf];
3324 *p++ = hexdigit[(ucs >> 16) & 0xf]; 3847 *p++ = hexdigits[(ucs >> 16) & 0xf];
3325 *p++ = hexdigit[(ucs >> 12) & 0xf]; 3848 *p++ = hexdigits[(ucs >> 12) & 0xf];
3326 *p++ = hexdigit[(ucs >> 8) & 0xf]; 3849 *p++ = hexdigits[(ucs >> 8) & 0xf];
3327 *p++ = hexdigit[(ucs >> 4) & 0xf]; 3850 *p++ = hexdigits[(ucs >> 4) & 0xf];
3328 *p++ = hexdigit[ucs & 0xf]; 3851 *p++ = hexdigits[ucs & 0xf];
3329 continue; 3852 continue;
3330 } 3853 }
3331 /* Fall through: isolated surrogates are copied as-is */ 3854 /* Fall through: isolated surrogates are copied as-is */
3332 s--; 3855 s--;
3333 size++; 3856 size++;
3334 } 3857 }
3335 #endif 3858 #endif
3336 /* Map 16-bit characters to '\uxxxx' */ 3859 /* Map 16-bit characters to '\uxxxx' */
3337 if (ch >= 256) { 3860 if (ch >= 256) {
3338 *p++ = '\\'; 3861 *p++ = '\\';
3339 *p++ = 'u'; 3862 *p++ = 'u';
3340 *p++ = hexdigit[(ch >> 12) & 0xf]; 3863 *p++ = hexdigits[(ch >> 12) & 0xf];
3341 *p++ = hexdigit[(ch >> 8) & 0xf]; 3864 *p++ = hexdigits[(ch >> 8) & 0xf];
3342 *p++ = hexdigit[(ch >> 4) & 0xf]; 3865 *p++ = hexdigits[(ch >> 4) & 0xf];
3343 *p++ = hexdigit[ch & 15]; 3866 *p++ = hexdigits[ch & 15];
3344 } 3867 }
3345 /* Copy everything else as-is */ 3868 /* Copy everything else as-is */
3346 else 3869 else
3347 *p++ = (char) ch; 3870 *p++ = (char) ch;
3348 } 3871 }
3349 *p = '\0'; 3872 size = p - q;
3350 if (_PyString_Resize(&repr, p - q)) 3873
3874 assert(size > 0);
3875 if (_PyBytes_Resize(&repr, size) < 0)
3351 return NULL; 3876 return NULL;
3352 return repr; 3877 return repr;
3353 } 3878 }
3354 3879
3355 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3880 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356 { 3881 {
3882 PyObject *s;
3357 if (!PyUnicode_Check(unicode)) { 3883 if (!PyUnicode_Check(unicode)) {
3358 PyErr_BadArgument(); 3884 PyErr_BadArgument();
3359 return NULL; 3885 return NULL;
3360 } 3886 }
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3887 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3362 PyUnicode_GET_SIZE(unicode)); 3888 PyUnicode_GET_SIZE(unicode));
3889
3890 return s;
3363 } 3891 }
3364 3892
3365 /* --- Unicode Internal Codec ------------------------------------------- */ 3893 /* --- Unicode Internal Codec ------------------------------------------- */
3366 3894
3367 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3895 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3368 Py_ssize_t size, 3896 Py_ssize_t size,
3369 const char *errors) 3897 const char *errors)
3370 { 3898 {
3371 const char *starts = s; 3899 const char *starts = s;
3372 Py_ssize_t startinpos; 3900 Py_ssize_t startinpos;
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
3409 reason = "truncated input"; 3937 reason = "truncated input";
3410 } 3938 }
3411 else { 3939 else {
3412 endinpos = s - starts + Py_UNICODE_SIZE; 3940 endinpos = s - starts + Py_UNICODE_SIZE;
3413 reason = "illegal code point (> 0x10FFFF)"; 3941 reason = "illegal code point (> 0x10FFFF)";
3414 } 3942 }
3415 outpos = p - PyUnicode_AS_UNICODE(v); 3943 outpos = p - PyUnicode_AS_UNICODE(v);
3416 if (unicode_decode_call_errorhandler( 3944 if (unicode_decode_call_errorhandler(
3417 errors, &errorHandler, 3945 errors, &errorHandler,
3418 "unicode_internal", reason, 3946 "unicode_internal", reason,
3419 starts, size, &startinpos, &endinpos, &exc, &s, 3947 &starts, &end, &startinpos, &endinpos, &exc, &s,
3420 &v, &outpos, &p)) { 3948 &v, &outpos, &p)) {
3421 goto onError; 3949 goto onError;
3422 } 3950 }
3423 } 3951 }
3424 else { 3952 else {
3425 p++; 3953 p++;
3426 s += Py_UNICODE_SIZE; 3954 s += Py_UNICODE_SIZE;
3427 } 3955 }
3428 } 3956 }
3429 3957
(...skipping 11 matching lines...) Expand all
3441 } 3969 }
3442 3970
3443 /* --- Latin-1 Codec ------------------------------------------------------ */ 3971 /* --- Latin-1 Codec ------------------------------------------------------ */
3444 3972
3445 PyObject *PyUnicode_DecodeLatin1(const char *s, 3973 PyObject *PyUnicode_DecodeLatin1(const char *s,
3446 Py_ssize_t size, 3974 Py_ssize_t size,
3447 const char *errors) 3975 const char *errors)
3448 { 3976 {
3449 PyUnicodeObject *v; 3977 PyUnicodeObject *v;
3450 Py_UNICODE *p; 3978 Py_UNICODE *p;
3979 const char *e, *unrolled_end;
3451 3980
3452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3453 if (size == 1) { 3982 if (size == 1) {
3454 Py_UNICODE r = *(unsigned char*)s; 3983 Py_UNICODE r = *(unsigned char*)s;
3455 return PyUnicode_FromUnicode(&r, 1); 3984 return PyUnicode_FromUnicode(&r, 1);
3456 } 3985 }
3457 3986
3458 v = _PyUnicode_New(size); 3987 v = _PyUnicode_New(size);
3459 if (v == NULL) 3988 if (v == NULL)
3460 goto onError; 3989 goto onError;
3461 if (size == 0) 3990 if (size == 0)
3462 return (PyObject *)v; 3991 return (PyObject *)v;
3463 p = PyUnicode_AS_UNICODE(v); 3992 p = PyUnicode_AS_UNICODE(v);
3464 while (size-- > 0) 3993 e = s + size;
3465 *p++ = (unsigned char)*s++; 3994 /* Unrolling the copy makes it much faster by reducing the looping
3995 overhead. This is similar to what many memcpy() implementations do. */
3996 unrolled_end = e - 4;
3997 while (s < unrolled_end) {
3998 p[0] = (unsigned char) s[0];
3999 p[1] = (unsigned char) s[1];
4000 p[2] = (unsigned char) s[2];
4001 p[3] = (unsigned char) s[3];
4002 s += 4;
4003 p += 4;
4004 }
4005 while (s < e)
4006 *p++ = (unsigned char) *s++;
3466 return (PyObject *)v; 4007 return (PyObject *)v;
3467 4008
3468 onError: 4009 onError:
3469 Py_XDECREF(v); 4010 Py_XDECREF(v);
3470 return NULL; 4011 return NULL;
3471 } 4012 }
3472 4013
3473 /* create or adjust a UnicodeEncodeError */ 4014 /* create or adjust a UnicodeEncodeError */
3474 static void make_encode_exception(PyObject **exceptionObject, 4015 static void make_encode_exception(PyObject **exceptionObject,
3475 const char *encoding, 4016 const char *encoding,
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
3512 build arguments, call the callback and check the arguments, 4053 build arguments, call the callback and check the arguments,
3513 put the result into newpos and return the replacement string, which 4054 put the result into newpos and return the replacement string, which
3514 has to be freed by the caller */ 4055 has to be freed by the caller */
3515 static PyObject *unicode_encode_call_errorhandler(const char *errors, 4056 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3516 PyObject **errorHandler, 4057 PyObject **errorHandler,
3517 const char *encoding, const char *reason, 4058 const char *encoding, const char *reason,
3518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4059 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519 Py_ssize_t startpos, Py_ssize_t endpos, 4060 Py_ssize_t startpos, Py_ssize_t endpos,
3520 Py_ssize_t *newpos) 4061 Py_ssize_t *newpos)
3521 { 4062 {
3522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 4063 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
3523 4064
3524 PyObject *restuple; 4065 PyObject *restuple;
3525 PyObject *resunicode; 4066 PyObject *resunicode;
3526 4067
3527 if (*errorHandler == NULL) { 4068 if (*errorHandler == NULL) {
3528 *errorHandler = PyCodec_LookupError(errors); 4069 *errorHandler = PyCodec_LookupError(errors);
3529 if (*errorHandler == NULL) 4070 if (*errorHandler == NULL)
3530 return NULL; 4071 return NULL;
3531 } 4072 }
3532 4073
3533 make_encode_exception(exceptionObject, 4074 make_encode_exception(exceptionObject,
3534 encoding, unicode, size, startpos, endpos, reason); 4075 encoding, unicode, size, startpos, endpos, reason);
3535 if (*exceptionObject == NULL) 4076 if (*exceptionObject == NULL)
3536 return NULL; 4077 return NULL;
3537 4078
3538 restuple = PyObject_CallFunctionObjArgs( 4079 restuple = PyObject_CallFunctionObjArgs(
3539 *errorHandler, *exceptionObject, NULL); 4080 *errorHandler, *exceptionObject, NULL);
3540 if (restuple == NULL) 4081 if (restuple == NULL)
3541 return NULL; 4082 return NULL;
3542 if (!PyTuple_Check(restuple)) { 4083 if (!PyTuple_Check(restuple)) {
3543 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4084 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3544 Py_DECREF(restuple); 4085 Py_DECREF(restuple);
3545 return NULL; 4086 return NULL;
3546 } 4087 }
3547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4088 if (!PyArg_ParseTuple(restuple, argparse,
3548 &resunicode, newpos)) { 4089 &resunicode, newpos)) {
4090 Py_DECREF(restuple);
4091 return NULL;
4092 }
4093 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4094 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3549 Py_DECREF(restuple); 4095 Py_DECREF(restuple);
3550 return NULL; 4096 return NULL;
3551 } 4097 }
3552 if (*newpos<0) 4098 if (*newpos<0)
3553 *newpos = size+*newpos; 4099 *newpos = size+*newpos;
3554 if (*newpos<0 || *newpos>size) { 4100 if (*newpos<0 || *newpos>size) {
3555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556 Py_DECREF(restuple); 4102 Py_DECREF(restuple);
3557 return NULL; 4103 return NULL;
3558 } 4104 }
(...skipping 10 matching lines...) Expand all
3569 /* output object */ 4115 /* output object */
3570 PyObject *res; 4116 PyObject *res;
3571 /* pointers to the beginning and end+1 of input */ 4117 /* pointers to the beginning and end+1 of input */
3572 const Py_UNICODE *startp = p; 4118 const Py_UNICODE *startp = p;
3573 const Py_UNICODE *endp = p + size; 4119 const Py_UNICODE *endp = p + size;
3574 /* pointer to the beginning of the unencodable characters */ 4120 /* pointer to the beginning of the unencodable characters */
3575 /* const Py_UNICODE *badp = NULL; */ 4121 /* const Py_UNICODE *badp = NULL; */
3576 /* pointer into the output */ 4122 /* pointer into the output */
3577 char *str; 4123 char *str;
3578 /* current output position */ 4124 /* current output position */
3579 Py_ssize_t respos = 0;
3580 Py_ssize_t ressize; 4125 Py_ssize_t ressize;
3581 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4126 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4127 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3583 PyObject *errorHandler = NULL; 4128 PyObject *errorHandler = NULL;
3584 PyObject *exc = NULL; 4129 PyObject *exc = NULL;
3585 /* the following variable is used for caching string comparisons 4130 /* the following variable is used for caching string comparisons
3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587 int known_errorHandler = -1; 4132 int known_errorHandler = -1;
3588 4133
3589 /* allocate enough for a simple encoding without 4134 /* allocate enough for a simple encoding without
3590 replacements, if we need more, we'll resize */ 4135 replacements, if we need more, we'll resize */
3591 res = PyString_FromStringAndSize(NULL, size); 4136 if (size == 0)
4137 return PyBytes_FromStringAndSize(NULL, 0);
4138 res = PyBytes_FromStringAndSize(NULL, size);
3592 if (res == NULL) 4139 if (res == NULL)
3593 goto onError; 4140 return NULL;
3594 if (size == 0) 4141 str = PyBytes_AS_STRING(res);
3595 return res;
3596 str = PyString_AS_STRING(res);
3597 ressize = size; 4142 ressize = size;
3598 4143
3599 while (p<endp) { 4144 while (p<endp) {
3600 Py_UNICODE c = *p; 4145 Py_UNICODE c = *p;
3601 4146
3602 /* can we encode this? */ 4147 /* can we encode this? */
3603 if (c<limit) { 4148 if (c<limit) {
3604 /* no overflow check, because we know that the space is enough */ 4149 /* no overflow check, because we know that the space is enough */
3605 *str++ = (char)c; 4150 *str++ = (char)c;
3606 ++p; 4151 ++p;
(...skipping 29 matching lines...) Expand all
3636 case 1: /* strict */ 4181 case 1: /* strict */
3637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4182 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638 goto onError; 4183 goto onError;
3639 case 2: /* replace */ 4184 case 2: /* replace */
3640 while (collstart++<collend) 4185 while (collstart++<collend)
3641 *str++ = '?'; /* fall through */ 4186 *str++ = '?'; /* fall through */
3642 case 3: /* ignore */ 4187 case 3: /* ignore */
3643 p = collend; 4188 p = collend;
3644 break; 4189 break;
3645 case 4: /* xmlcharrefreplace */ 4190 case 4: /* xmlcharrefreplace */
3646 respos = str-PyString_AS_STRING(res); 4191 respos = str - PyBytes_AS_STRING(res);
3647 /* determine replacement size (temporarily (mis)uses p) */ 4192 /* determine replacement size (temporarily (mis)uses p) */
3648 for (p = collstart, repsize = 0; p < collend; ++p) { 4193 for (p = collstart, repsize = 0; p < collend; ++p) {
3649 if (*p<10) 4194 if (*p<10)
3650 repsize += 2+1+1; 4195 repsize += 2+1+1;
3651 else if (*p<100) 4196 else if (*p<100)
3652 repsize += 2+2+1; 4197 repsize += 2+2+1;
3653 else if (*p<1000) 4198 else if (*p<1000)
3654 repsize += 2+3+1; 4199 repsize += 2+3+1;
3655 else if (*p<10000) 4200 else if (*p<10000)
3656 repsize += 2+4+1; 4201 repsize += 2+4+1;
3657 #ifndef Py_UNICODE_WIDE 4202 #ifndef Py_UNICODE_WIDE
3658 else 4203 else
3659 repsize += 2+5+1; 4204 repsize += 2+5+1;
3660 #else 4205 #else
3661 else if (*p<100000) 4206 else if (*p<100000)
3662 repsize += 2+5+1; 4207 repsize += 2+5+1;
3663 else if (*p<1000000) 4208 else if (*p<1000000)
3664 repsize += 2+6+1; 4209 repsize += 2+6+1;
3665 else 4210 else
3666 repsize += 2+7+1; 4211 repsize += 2+7+1;
3667 #endif 4212 #endif
3668 } 4213 }
3669 requiredsize = respos+repsize+(endp-collend); 4214 requiredsize = respos+repsize+(endp-collend);
3670 if (requiredsize > ressize) { 4215 if (requiredsize > ressize) {
3671 if (requiredsize<2*ressize) 4216 if (requiredsize<2*ressize)
3672 requiredsize = 2*ressize; 4217 requiredsize = 2*ressize;
3673 if (_PyString_Resize(&res, requiredsize)) 4218 if (_PyBytes_Resize(&res, requiredsize))
3674 goto onError; 4219 goto onError;
3675 str = PyString_AS_STRING(res) + respos; 4220 str = PyBytes_AS_STRING(res) + respos;
3676 ressize = requiredsize; 4221 ressize = requiredsize;
3677 } 4222 }
3678 /* generate replacement (temporarily (mis)uses p) */ 4223 /* generate replacement (temporarily (mis)uses p) */
3679 for (p = collstart; p < collend; ++p) { 4224 for (p = collstart; p < collend; ++p) {
3680 str += sprintf(str, "&#%d;", (int)*p); 4225 str += sprintf(str, "&#%d;", (int)*p);
3681 } 4226 }
3682 p = collend; 4227 p = collend;
3683 break; 4228 break;
3684 default: 4229 default:
3685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4230 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686 encoding, reason, startp, size, &exc, 4231 encoding, reason, startp, size, &exc,
3687 collstart-startp, collend-startp, &newpos); 4232 collstart-startp, collend-startp, &newpos);
3688 if (repunicode == NULL) 4233 if (repunicode == NULL)
3689 goto onError; 4234 goto onError;
3690 /* need more space? (at least enough for what we have+the 4235 if (PyBytes_Check(repunicode)) {
3691 replacement+the rest of the string, so we won't have to 4236 /* Directly copy bytes result to output. */
3692 check space for encodable characters) */ 4237 repsize = PyBytes_Size(repunicode);
3693 respos = str-PyString_AS_STRING(res); 4238 if (repsize > 1) {
4239 /* Make room for all additional bytes. */
4240 respos = str - PyBytes_AS_STRING(res);
4241 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4242 Py_DECREF(repunicode);
4243 goto onError;
4244 }
4245 str = PyBytes_AS_STRING(res) + respos;
4246 ressize += repsize-1;
4247 }
4248 memcpy(str, PyBytes_AsString(repunicode), repsize);
4249 str += repsize;
4250 p = startp + newpos;
4251 Py_DECREF(repunicode);
4252 break;
4253 }
4254 /* need more space? (at least enough for what we
4255 have+the replacement+the rest of the string, so
4256 we won't have to check space for encodable characters) */
4257 respos = str - PyBytes_AS_STRING(res);
3694 repsize = PyUnicode_GET_SIZE(repunicode); 4258 repsize = PyUnicode_GET_SIZE(repunicode);
3695 requiredsize = respos+repsize+(endp-collend); 4259 requiredsize = respos+repsize+(endp-collend);
3696 if (requiredsize > ressize) { 4260 if (requiredsize > ressize) {
3697 if (requiredsize<2*ressize) 4261 if (requiredsize<2*ressize)
3698 requiredsize = 2*ressize; 4262 requiredsize = 2*ressize;
3699 if (_PyString_Resize(&res, requiredsize)) { 4263 if (_PyBytes_Resize(&res, requiredsize)) {
3700 Py_DECREF(repunicode); 4264 Py_DECREF(repunicode);
3701 goto onError; 4265 goto onError;
3702 } 4266 }
3703 str = PyString_AS_STRING(res) + respos; 4267 str = PyBytes_AS_STRING(res) + respos;
3704 ressize = requiredsize; 4268 ressize = requiredsize;
3705 } 4269 }
3706 /* check if there is anything unencodable in the replacement 4270 /* check if there is anything unencodable in the replacement
3707 and copy it to the output */ 4271 and copy it to the output */
3708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709 c = *uni2; 4273 c = *uni2;
3710 if (c >= limit) { 4274 if (c >= limit) {
3711 raise_encode_exception(&exc, encoding, startp, size, 4275 raise_encode_exception(&exc, encoding, startp, size,
3712 unicodepos, unicodepos+1, reason); 4276 unicodepos, unicodepos+1, reason);
3713 Py_DECREF(repunicode); 4277 Py_DECREF(repunicode);
3714 goto onError; 4278 goto onError;
3715 } 4279 }
3716 *str = (char)c; 4280 *str = (char)c;
3717 } 4281 }
3718 p = startp + newpos; 4282 p = startp + newpos;
3719 Py_DECREF(repunicode); 4283 Py_DECREF(repunicode);
3720 } 4284 }
3721 } 4285 }
3722 } 4286 }
3723 /* Resize if we allocated to much */ 4287 /* Resize if we allocated to much */
3724 respos = str-PyString_AS_STRING(res); 4288 size = str - PyBytes_AS_STRING(res);
3725 if (respos<ressize) 4289 if (size < ressize) { /* If this falls res will be NULL */
3726 /* If this falls res will be NULL */ 4290 assert(size >= 0);
3727 _PyString_Resize(&res, respos); 4291 if (_PyBytes_Resize(&res, size) < 0)
4292 goto onError;
4293 }
4294
3728 Py_XDECREF(errorHandler); 4295 Py_XDECREF(errorHandler);
3729 Py_XDECREF(exc); 4296 Py_XDECREF(exc);
3730 return res; 4297 return res;
3731 4298
3732 onError: 4299 onError:
3733 Py_XDECREF(res); 4300 Py_XDECREF(res);
3734 Py_XDECREF(errorHandler); 4301 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc); 4302 Py_XDECREF(exc);
3736 return NULL; 4303 return NULL;
3737 } 4304 }
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
3789 *p++ = c; 4356 *p++ = c;
3790 ++s; 4357 ++s;
3791 } 4358 }
3792 else { 4359 else {
3793 startinpos = s-starts; 4360 startinpos = s-starts;
3794 endinpos = startinpos + 1; 4361 endinpos = startinpos + 1;
3795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4362 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796 if (unicode_decode_call_errorhandler( 4363 if (unicode_decode_call_errorhandler(
3797 errors, &errorHandler, 4364 errors, &errorHandler,
3798 "ascii", "ordinal not in range(128)", 4365 "ascii", "ordinal not in range(128)",
3799 starts, size, &startinpos, &endinpos, &exc, &s, 4366 &starts, &e, &startinpos, &endinpos, &exc, &s,
3800 &v, &outpos, &p)) 4367 &v, &outpos, &p))
3801 goto onError; 4368 goto onError;
3802 } 4369 }
3803 } 4370 }
3804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 4371 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4372 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806 goto onError; 4373 goto onError;
3807 Py_XDECREF(errorHandler); 4374 Py_XDECREF(errorHandler);
3808 Py_XDECREF(exc); 4375 Py_XDECREF(exc);
3809 return (PyObject *)v; 4376 return (PyObject *)v;
3810 4377
3811 onError: 4378 onError:
3812 Py_XDECREF(v); 4379 Py_XDECREF(v);
3813 Py_XDECREF(errorHandler); 4380 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc); 4381 Py_XDECREF(exc);
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after
3972 if (size > 0) { 4539 if (size > 0) {
3973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 4540 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974 if (mbcssize == 0) { 4541 if (mbcssize == 0) {
3975 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4542 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976 return -1; 4543 return -1;
3977 } 4544 }
3978 } 4545 }
3979 4546
3980 if (*repr == NULL) { 4547 if (*repr == NULL) {
3981 /* Create string object */ 4548 /* Create string object */
3982 *repr = PyString_FromStringAndSize(NULL, mbcssize); 4549 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
3983 if (*repr == NULL) 4550 if (*repr == NULL)
3984 return -1; 4551 return -1;
3985 } 4552 }
3986 else { 4553 else {
3987 /* Extend string object */ 4554 /* Extend string object */
3988 n = PyString_Size(*repr); 4555 n = PyBytes_Size(*repr);
3989 if (_PyString_Resize(repr, n + mbcssize) < 0) 4556 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
3990 return -1; 4557 return -1;
3991 } 4558 }
3992 4559
3993 /* Do the conversion */ 4560 /* Do the conversion */
3994 if (size > 0) { 4561 if (size > 0) {
3995 char *s = PyString_AS_STRING(*repr) + n; 4562 char *s = PyBytes_AS_STRING(*repr) + n;
3996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 4563 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4564 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998 return -1; 4565 return -1;
3999 } 4566 }
4000 } 4567 }
4001 4568
4002 return 0; 4569 return 0;
4003 } 4570 }
4004 4571
4005 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4572 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
4090 x = mapstring[ch]; 4657 x = mapstring[ch];
4091 4658
4092 if (x == 0xfffe) { 4659 if (x == 0xfffe) {
4093 /* undefined mapping */ 4660 /* undefined mapping */
4094 outpos = p-PyUnicode_AS_UNICODE(v); 4661 outpos = p-PyUnicode_AS_UNICODE(v);
4095 startinpos = s-starts; 4662 startinpos = s-starts;
4096 endinpos = startinpos+1; 4663 endinpos = startinpos+1;
4097 if (unicode_decode_call_errorhandler( 4664 if (unicode_decode_call_errorhandler(
4098 errors, &errorHandler, 4665 errors, &errorHandler,
4099 "charmap", "character maps to <undefined>", 4666 "charmap", "character maps to <undefined>",
4100 starts, size, &startinpos, &endinpos, &exc, &s, 4667 &starts, &e, &startinpos, &endinpos, &exc, &s,
4101 &v, &outpos, &p)) { 4668 &v, &outpos, &p)) {
4102 goto onError; 4669 goto onError;
4103 } 4670 }
4104 continue; 4671 continue;
4105 } 4672 }
4106 *p++ = x; 4673 *p++ = x;
4107 ++s; 4674 ++s;
4108 } 4675 }
4109 } 4676 }
4110 else { 4677 else {
4111 while (s < e) { 4678 while (s < e) {
4112 unsigned char ch = *s; 4679 unsigned char ch = *s;
4113 PyObject *w, *x; 4680 PyObject *w, *x;
4114 4681
4115 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4682 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4116 w = PyInt_FromLong((long)ch); 4683 w = PyLong_FromLong((long)ch);
4117 if (w == NULL) 4684 if (w == NULL)
4118 goto onError; 4685 goto onError;
4119 x = PyObject_GetItem(mapping, w); 4686 x = PyObject_GetItem(mapping, w);
4120 Py_DECREF(w); 4687 Py_DECREF(w);
4121 if (x == NULL) { 4688 if (x == NULL) {
4122 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4123 /* No mapping found means: mapping is undefined. */ 4690 /* No mapping found means: mapping is undefined. */
4124 PyErr_Clear(); 4691 PyErr_Clear();
4125 x = Py_None; 4692 x = Py_None;
4126 Py_INCREF(x); 4693 Py_INCREF(x);
4127 } else 4694 } else
4128 goto onError; 4695 goto onError;
4129 } 4696 }
4130 4697
4131 /* Apply mapping */ 4698 /* Apply mapping */
4132 if (PyInt_Check(x)) { 4699 if (PyLong_Check(x)) {
4133 long value = PyInt_AS_LONG(x); 4700 long value = PyLong_AS_LONG(x);
4134 if (value < 0 || value > 65535) { 4701 if (value < 0 || value > 65535) {
4135 PyErr_SetString(PyExc_TypeError, 4702 PyErr_SetString(PyExc_TypeError,
4136 "character mapping must be in range(65536)"); 4703 "character mapping must be in range(65536)");
4137 Py_DECREF(x); 4704 Py_DECREF(x);
4138 goto onError; 4705 goto onError;
4139 } 4706 }
4140 *p++ = (Py_UNICODE)value; 4707 *p++ = (Py_UNICODE)value;
4141 } 4708 }
4142 else if (x == Py_None) { 4709 else if (x == Py_None) {
4143 /* undefined mapping */ 4710 /* undefined mapping */
4144 outpos = p-PyUnicode_AS_UNICODE(v); 4711 outpos = p-PyUnicode_AS_UNICODE(v);
4145 startinpos = s-starts; 4712 startinpos = s-starts;
4146 endinpos = startinpos+1; 4713 endinpos = startinpos+1;
4147 if (unicode_decode_call_errorhandler( 4714 if (unicode_decode_call_errorhandler(
4148 errors, &errorHandler, 4715 errors, &errorHandler,
4149 "charmap", "character maps to <undefined>", 4716 "charmap", "character maps to <undefined>",
4150 starts, size, &startinpos, &endinpos, &exc, &s, 4717 &starts, &e, &startinpos, &endinpos, &exc, &s,
4151 &v, &outpos, &p)) { 4718 &v, &outpos, &p)) {
4152 Py_DECREF(x); 4719 Py_DECREF(x);
4153 goto onError; 4720 goto onError;
4154 } 4721 }
4155 Py_DECREF(x); 4722 Py_DECREF(x);
4156 continue; 4723 continue;
4157 } 4724 }
4158 else if (PyUnicode_Check(x)) { 4725 else if (PyUnicode_Check(x)) {
4159 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4726 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4160 4727
(...skipping 21 matching lines...) Expand all
4182 PyUnicode_AS_UNICODE(x), 4749 PyUnicode_AS_UNICODE(x),
4183 targetsize); 4750 targetsize);
4184 p += targetsize; 4751 p += targetsize;
4185 extrachars -= targetsize; 4752 extrachars -= targetsize;
4186 } 4753 }
4187 /* 1-0 mapping: skip the character */ 4754 /* 1-0 mapping: skip the character */
4188 } 4755 }
4189 else { 4756 else {
4190 /* wrong return value */ 4757 /* wrong return value */
4191 PyErr_SetString(PyExc_TypeError, 4758 PyErr_SetString(PyExc_TypeError,
4192 "character mapping must return integer, None or unicode"); 4759 "character mapping must return integer, None or str");
4193 Py_DECREF(x); 4760 Py_DECREF(x);
4194 goto onError; 4761 goto onError;
4195 } 4762 }
4196 Py_DECREF(x); 4763 Py_DECREF(x);
4197 ++s; 4764 ++s;
4198 } 4765 }
4199 } 4766 }
4200 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4767 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4201 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4768 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4202 goto onError; 4769 goto onError;
(...skipping 14 matching lines...) Expand all
4217 PyObject_HEAD 4784 PyObject_HEAD
4218 unsigned char level1[32]; 4785 unsigned char level1[32];
4219 int count2, count3; 4786 int count2, count3;
4220 unsigned char level23[1]; 4787 unsigned char level23[1];
4221 }; 4788 };
4222 4789
4223 static PyObject* 4790 static PyObject*
4224 encoding_map_size(PyObject *obj, PyObject* args) 4791 encoding_map_size(PyObject *obj, PyObject* args)
4225 { 4792 {
4226 struct encoding_map *map = (struct encoding_map*)obj; 4793 struct encoding_map *map = (struct encoding_map*)obj;
4227 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4794 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4228 128*map->count3); 4795 128*map->count3);
4229 } 4796 }
4230 4797
4231 static PyMethodDef encoding_map_methods[] = { 4798 static PyMethodDef encoding_map_methods[] = {
4232 {"size", encoding_map_size, METH_NOARGS, 4799 {"size", encoding_map_size, METH_NOARGS,
4233 PyDoc_STR("Return the size (in bytes) of this object") }, 4800 PyDoc_STR("Return the size (in bytes) of this object") },
4234 { 0 } 4801 { 0 }
4235 }; 4802 };
4236 4803
4237 static void 4804 static void
4238 encoding_map_dealloc(PyObject* o) 4805 encoding_map_dealloc(PyObject* o)
4239 { 4806 {
4240 PyObject_FREE(o); 4807 PyObject_FREE(o);
4241 } 4808 }
4242 4809
4243 static PyTypeObject EncodingMapType = { 4810 static PyTypeObject EncodingMapType = {
4244 PyVarObject_HEAD_INIT(NULL, 0) 4811 PyVarObject_HEAD_INIT(NULL, 0)
4245 "EncodingMap", /*tp_name*/ 4812 "EncodingMap", /*tp_name*/
4246 sizeof(struct encoding_map), /*tp_basicsize*/ 4813 sizeof(struct encoding_map), /*tp_basicsize*/
4247 0, /*tp_itemsize*/ 4814 0, /*tp_itemsize*/
4248 /* methods */ 4815 /* methods */
4249 encoding_map_dealloc, /*tp_dealloc*/ 4816 encoding_map_dealloc, /*tp_dealloc*/
4250 0, /*tp_print*/ 4817 0, /*tp_print*/
4251 0, /*tp_getattr*/ 4818 0, /*tp_getattr*/
4252 0, /*tp_setattr*/ 4819 0, /*tp_setattr*/
4253 0, /*tp_compare*/ 4820 0, /*tp_reserved*/
4254 0, /*tp_repr*/ 4821 0, /*tp_repr*/
4255 0, /*tp_as_number*/ 4822 0, /*tp_as_number*/
4256 0, /*tp_as_sequence*/ 4823 0, /*tp_as_sequence*/
4257 0, /*tp_as_mapping*/ 4824 0, /*tp_as_mapping*/
4258 0, /*tp_hash*/ 4825 0, /*tp_hash*/
4259 0, /*tp_call*/ 4826 0, /*tp_call*/
4260 0, /*tp_str*/ 4827 0, /*tp_str*/
4261 0, /*tp_getattro*/ 4828 0, /*tp_getattro*/
4262 0, /*tp_setattro*/ 4829 0, /*tp_setattro*/
4263 0, /*tp_as_buffer*/ 4830 0, /*tp_as_buffer*/
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
4333 4900
4334 if (count2 >= 0xFF || count3 >= 0xFF) 4901 if (count2 >= 0xFF || count3 >= 0xFF)
4335 need_dict = 1; 4902 need_dict = 1;
4336 4903
4337 if (need_dict) { 4904 if (need_dict) {
4338 PyObject *result = PyDict_New(); 4905 PyObject *result = PyDict_New();
4339 PyObject *key, *value; 4906 PyObject *key, *value;
4340 if (!result) 4907 if (!result)
4341 return NULL; 4908 return NULL;
4342 for (i = 0; i < 256; i++) { 4909 for (i = 0; i < 256; i++) {
4343 value = NULL; 4910 key = value = NULL;
4344 key = PyInt_FromLong(decode[i]); 4911 key = PyLong_FromLong(decode[i]);
4345 value = PyInt_FromLong(i); 4912 value = PyLong_FromLong(i);
4346 if (!key || !value) 4913 if (!key || !value)
4347 goto failed1; 4914 goto failed1;
4348 if (PyDict_SetItem(result, key, value) == -1) 4915 if (PyDict_SetItem(result, key, value) == -1)
4349 goto failed1; 4916 goto failed1;
4350 Py_DECREF(key); 4917 Py_DECREF(key);
4351 Py_DECREF(value); 4918 Py_DECREF(value);
4352 } 4919 }
4353 return result; 4920 return result;
4354 failed1: 4921 failed1:
4355 Py_XDECREF(key); 4922 Py_XDECREF(key);
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
4423 return -1; 4990 return -1;
4424 } 4991 }
4425 return i; 4992 return i;
4426 } 4993 }
4427 4994
4428 /* Lookup the character ch in the mapping. If the character 4995 /* Lookup the character ch in the mapping. If the character
4429 can't be found, Py_None is returned (or NULL, if another 4996 can't be found, Py_None is returned (or NULL, if another
4430 error occurred). */ 4997 error occurred). */
4431 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4998 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4432 { 4999 {
4433 PyObject *w = PyInt_FromLong((long)c); 5000 PyObject *w = PyLong_FromLong((long)c);
4434 PyObject *x; 5001 PyObject *x;
4435 5002
4436 if (w == NULL) 5003 if (w == NULL)
4437 return NULL; 5004 return NULL;
4438 x = PyObject_GetItem(mapping, w); 5005 x = PyObject_GetItem(mapping, w);
4439 Py_DECREF(w); 5006 Py_DECREF(w);
4440 if (x == NULL) { 5007 if (x == NULL) {
4441 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5008 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4442 /* No mapping found means: mapping is undefined. */ 5009 /* No mapping found means: mapping is undefined. */
4443 PyErr_Clear(); 5010 PyErr_Clear();
4444 x = Py_None; 5011 x = Py_None;
4445 Py_INCREF(x); 5012 Py_INCREF(x);
4446 return x; 5013 return x;
4447 } else 5014 } else
4448 return NULL; 5015 return NULL;
4449 } 5016 }
4450 else if (x == Py_None) 5017 else if (x == Py_None)
4451 return x; 5018 return x;
4452 else if (PyInt_Check(x)) { 5019 else if (PyLong_Check(x)) {
4453 long value = PyInt_AS_LONG(x); 5020 long value = PyLong_AS_LONG(x);
4454 if (value < 0 || value > 255) { 5021 if (value < 0 || value > 255) {
4455 PyErr_SetString(PyExc_TypeError, 5022 PyErr_SetString(PyExc_TypeError,
4456 "character mapping must be in range(256)"); 5023 "character mapping must be in range(256)");
4457 Py_DECREF(x); 5024 Py_DECREF(x);
4458 return NULL; 5025 return NULL;
4459 } 5026 }
4460 return x; 5027 return x;
4461 } 5028 }
4462 else if (PyString_Check(x)) 5029 else if (PyBytes_Check(x))
4463 return x; 5030 return x;
4464 else { 5031 else {
4465 /* wrong return value */ 5032 /* wrong return value */
4466 PyErr_SetString(PyExc_TypeError, 5033 PyErr_Format(PyExc_TypeError,
4467 "character mapping must return integer, None or str"); 5034 "character mapping must return integer, bytes or None, not %.400s",
5035 x->ob_type->tp_name);
4468 Py_DECREF(x); 5036 Py_DECREF(x);
4469 return NULL; 5037 return NULL;
4470 } 5038 }
4471 } 5039 }
4472 5040
4473 static int 5041 static int
4474 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5042 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4475 { 5043 {
4476 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 5044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4477 /* exponentially overallocate to minimize reallocations */ 5045 /* exponentially overallocate to minimize reallocations */
4478 if (requiredsize < 2*outsize) 5046 if (requiredsize < 2*outsize)
4479 requiredsize = 2*outsize; 5047 requiredsize = 2*outsize;
4480 if (_PyString_Resize(outobj, requiredsize)) { 5048 if (_PyBytes_Resize(outobj, requiredsize))
4481 return 0; 5049 return -1;
4482 } 5050 return 0;
4483 return 1;
4484 } 5051 }
4485 5052
4486 typedef enum charmapencode_result { 5053 typedef enum charmapencode_result {
4487 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5054 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
4488 }charmapencode_result; 5055 }charmapencode_result;
4489 /* lookup the character, put the result in the output string and adjust 5056 /* lookup the character, put the result in the output string and adjust
4490 various state variables. Reallocate the output string if not enough 5057 various state variables. Resize the output bytes object if not enough
4491 space is available. Return a new reference to the object that 5058 space is available. Return a new reference to the object that
4492 was put in the output buffer, or Py_None, if the mapping was undefined 5059 was put in the output buffer, or Py_None, if the mapping was undefined
4493 (in which case no character was written) or NULL, if a 5060 (in which case no character was written) or NULL, if a
4494 reallocation error occurred. The caller must decref the result */ 5061 reallocation error occurred. The caller must decref the result */
4495 static 5062 static
4496 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5063 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4497 PyObject **outobj, Py_ssize_t *outpos) 5064 PyObject **outobj, Py_ssize_t *outpos)
4498 { 5065 {
4499 PyObject *rep; 5066 PyObject *rep;
4500 char *outstart; 5067 char *outstart;
4501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 5068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4502 5069
4503 if (Py_TYPE(mapping) == &EncodingMapType) { 5070 if (Py_TYPE(mapping) == &EncodingMapType) {
4504 int res = encoding_map_lookup(c, mapping); 5071 int res = encoding_map_lookup(c, mapping);
4505 Py_ssize_t requiredsize = *outpos+1; 5072 Py_ssize_t requiredsize = *outpos+1;
4506 if (res == -1) 5073 if (res == -1)
4507 return enc_FAILED; 5074 return enc_FAILED;
4508 if (outsize<requiredsize) 5075 if (outsize<requiredsize)
4509 if (!charmapencode_resize(outobj, outpos, requiredsize)) 5076 if (charmapencode_resize(outobj, outpos, requiredsize))
4510 return enc_EXCEPTION; 5077 return enc_EXCEPTION;
4511 outstart = PyString_AS_STRING(*outobj); 5078 outstart = PyBytes_AS_STRING(*outobj);
4512 outstart[(*outpos)++] = (char)res; 5079 outstart[(*outpos)++] = (char)res;
4513 return enc_SUCCESS; 5080 return enc_SUCCESS;
4514 } 5081 }
4515 5082
4516 rep = charmapencode_lookup(c, mapping); 5083 rep = charmapencode_lookup(c, mapping);
4517 if (rep==NULL) 5084 if (rep==NULL)
4518 return enc_EXCEPTION; 5085 return enc_EXCEPTION;
4519 else if (rep==Py_None) { 5086 else if (rep==Py_None) {
4520 Py_DECREF(rep); 5087 Py_DECREF(rep);
4521 return enc_FAILED; 5088 return enc_FAILED;
4522 } else { 5089 } else {
4523 if (PyInt_Check(rep)) { 5090 if (PyLong_Check(rep)) {
4524 Py_ssize_t requiredsize = *outpos+1; 5091 Py_ssize_t requiredsize = *outpos+1;
4525 if (outsize<requiredsize) 5092 if (outsize<requiredsize)
4526 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 5093 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4527 Py_DECREF(rep); 5094 Py_DECREF(rep);
4528 return enc_EXCEPTION; 5095 return enc_EXCEPTION;
4529 } 5096 }
4530 outstart = PyString_AS_STRING(*outobj); 5097 outstart = PyBytes_AS_STRING(*outobj);
4531 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 5098 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4532 } 5099 }
4533 else { 5100 else {
4534 const char *repchars = PyString_AS_STRING(rep); 5101 const char *repchars = PyBytes_AS_STRING(rep);
4535 Py_ssize_t repsize = PyString_GET_SIZE(rep); 5102 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4536 Py_ssize_t requiredsize = *outpos+repsize; 5103 Py_ssize_t requiredsize = *outpos+repsize;
4537 if (outsize<requiredsize) 5104 if (outsize<requiredsize)
4538 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 5105 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4539 Py_DECREF(rep); 5106 Py_DECREF(rep);
4540 return enc_EXCEPTION; 5107 return enc_EXCEPTION;
4541 } 5108 }
4542 outstart = PyString_AS_STRING(*outobj); 5109 outstart = PyBytes_AS_STRING(*outobj);
4543 memcpy(outstart + *outpos, repchars, repsize); 5110 memcpy(outstart + *outpos, repchars, repsize);
4544 *outpos += repsize; 5111 *outpos += repsize;
4545 } 5112 }
4546 } 5113 }
4547 Py_DECREF(rep); 5114 Py_DECREF(rep);
4548 return enc_SUCCESS; 5115 return enc_SUCCESS;
4549 } 5116 }
4550 5117
4551 /* handle an error in PyUnicode_EncodeCharmap 5118 /* handle an error in PyUnicode_EncodeCharmap
4552 Return 0 on success, -1 on error */ 5119 Return 0 on success, -1 on error */
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
4640 } 5207 }
4641 } 5208 }
4642 *inpos = collendpos; 5209 *inpos = collendpos;
4643 break; 5210 break;
4644 default: 5211 default:
4645 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5212 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4646 encoding, reason, p, size, exceptionObject, 5213 encoding, reason, p, size, exceptionObject,
4647 collstartpos, collendpos, &newpos); 5214 collstartpos, collendpos, &newpos);
4648 if (repunicode == NULL) 5215 if (repunicode == NULL)
4649 return -1; 5216 return -1;
5217 if (PyBytes_Check(repunicode)) {
5218 /* Directly copy bytes result to output. */
5219 Py_ssize_t outsize = PyBytes_Size(*res);
5220 Py_ssize_t requiredsize;
5221 repsize = PyBytes_Size(repunicode);
5222 requiredsize = *respos + repsize;
5223 if (requiredsize > outsize)
5224 /* Make room for all additional bytes. */
5225 if (charmapencode_resize(res, respos, requiredsize)) {
5226 Py_DECREF(repunicode);
5227 return -1;
5228 }
5229 memcpy(PyBytes_AsString(*res) + *respos,
5230 PyBytes_AsString(repunicode), repsize);
5231 *respos += repsize;
5232 *inpos = newpos;
5233 Py_DECREF(repunicode);
5234 break;
5235 }
4650 /* generate replacement */ 5236 /* generate replacement */
4651 repsize = PyUnicode_GET_SIZE(repunicode); 5237 repsize = PyUnicode_GET_SIZE(repunicode);
4652 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4653 x = charmapencode_output(*uni2, mapping, res, respos); 5239 x = charmapencode_output(*uni2, mapping, res, respos);
4654 if (x==enc_EXCEPTION) { 5240 if (x==enc_EXCEPTION) {
4655 return -1; 5241 return -1;
4656 } 5242 }
4657 else if (x==enc_FAILED) { 5243 else if (x==enc_FAILED) {
4658 Py_DECREF(repunicode); 5244 Py_DECREF(repunicode);
4659 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5245 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
(...skipping 23 matching lines...) Expand all
4683 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5269 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4684 * 3=ignore, 4=xmlcharrefreplace */ 5270 * 3=ignore, 4=xmlcharrefreplace */
4685 int known_errorHandler = -1; 5271 int known_errorHandler = -1;
4686 5272
4687 /* Default to Latin-1 */ 5273 /* Default to Latin-1 */
4688 if (mapping == NULL) 5274 if (mapping == NULL)
4689 return PyUnicode_EncodeLatin1(p, size, errors); 5275 return PyUnicode_EncodeLatin1(p, size, errors);
4690 5276
4691 /* allocate enough for a simple encoding without 5277 /* allocate enough for a simple encoding without
4692 replacements, if we need more, we'll resize */ 5278 replacements, if we need more, we'll resize */
4693 res = PyString_FromStringAndSize(NULL, size); 5279 res = PyBytes_FromStringAndSize(NULL, size);
4694 if (res == NULL) 5280 if (res == NULL)
4695 goto onError; 5281 goto onError;
4696 if (size == 0) 5282 if (size == 0)
4697 return res; 5283 return res;
4698 5284
4699 while (inpos<size) { 5285 while (inpos<size) {
4700 /* try to encode it */ 5286 /* try to encode it */
4701 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5287 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4702 if (x==enc_EXCEPTION) /* error */ 5288 if (x==enc_EXCEPTION) /* error */
4703 goto onError; 5289 goto onError;
4704 if (x==enc_FAILED) { /* unencodable character */ 5290 if (x==enc_FAILED) { /* unencodable character */
4705 if (charmap_encoding_error(p, size, &inpos, mapping, 5291 if (charmap_encoding_error(p, size, &inpos, mapping,
4706 &exc, 5292 &exc,
4707 &known_errorHandler, &errorHandler, errors, 5293 &known_errorHandler, &errorHandler, errors,
4708 &res, &respos)) { 5294 &res, &respos)) {
4709 goto onError; 5295 goto onError;
4710 } 5296 }
4711 } 5297 }
4712 else 5298 else
4713 /* done with this character => adjust input position */ 5299 /* done with this character => adjust input position */
4714 ++inpos; 5300 ++inpos;
4715 } 5301 }
4716 5302
4717 /* Resize if we allocated to much */ 5303 /* Resize if we allocated to much */
4718 if (respos<PyString_GET_SIZE(res)) { 5304 if (respos<PyBytes_GET_SIZE(res))
4719 if (_PyString_Resize(&res, respos)) 5305 if (_PyBytes_Resize(&res, respos) < 0)
4720 goto onError; 5306 goto onError;
4721 } 5307
4722 Py_XDECREF(exc); 5308 Py_XDECREF(exc);
4723 Py_XDECREF(errorHandler); 5309 Py_XDECREF(errorHandler);
4724 return res; 5310 return res;
4725 5311
4726 onError: 5312 onError:
4727 Py_XDECREF(res); 5313 Py_XDECREF(res);
4728 Py_XDECREF(exc); 5314 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler); 5315 Py_XDECREF(errorHandler);
4730 return NULL; 5316 return NULL;
4731 } 5317 }
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
4783 build arguments, call the callback and check the arguments, 5369 build arguments, call the callback and check the arguments,
4784 put the result into newpos and return the replacement string, which 5370 put the result into newpos and return the replacement string, which
4785 has to be freed by the caller */ 5371 has to be freed by the caller */
4786 static PyObject *unicode_translate_call_errorhandler(const char *errors, 5372 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4787 PyObject **errorHandler, 5373 PyObject **errorHandler,
4788 const char *reason, 5374 const char *reason,
4789 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4790 Py_ssize_t startpos, Py_ssize_t endpos, 5376 Py_ssize_t startpos, Py_ssize_t endpos,
4791 Py_ssize_t *newpos) 5377 Py_ssize_t *newpos)
4792 { 5378 {
4793 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 5379 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
4794 5380
4795 Py_ssize_t i_newpos; 5381 Py_ssize_t i_newpos;
4796 PyObject *restuple; 5382 PyObject *restuple;
4797 PyObject *resunicode; 5383 PyObject *resunicode;
4798 5384
4799 if (*errorHandler == NULL) { 5385 if (*errorHandler == NULL) {
4800 *errorHandler = PyCodec_LookupError(errors); 5386 *errorHandler = PyCodec_LookupError(errors);
4801 if (*errorHandler == NULL) 5387 if (*errorHandler == NULL)
4802 return NULL; 5388 return NULL;
4803 } 5389 }
(...skipping 30 matching lines...) Expand all
4834 Py_DECREF(restuple); 5420 Py_DECREF(restuple);
4835 return resunicode; 5421 return resunicode;
4836 } 5422 }
4837 5423
4838 /* Lookup the character ch in the mapping and put the result in result, 5424 /* Lookup the character ch in the mapping and put the result in result,
4839 which must be decrefed by the caller. 5425 which must be decrefed by the caller.
4840 Return 0 on success, -1 on error */ 5426 Return 0 on success, -1 on error */
4841 static 5427 static
4842 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5428 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4843 { 5429 {
4844 PyObject *w = PyInt_FromLong((long)c); 5430 PyObject *w = PyLong_FromLong((long)c);
4845 PyObject *x; 5431 PyObject *x;
4846 5432
4847 if (w == NULL) 5433 if (w == NULL)
4848 return -1; 5434 return -1;
4849 x = PyObject_GetItem(mapping, w); 5435 x = PyObject_GetItem(mapping, w);
4850 Py_DECREF(w); 5436 Py_DECREF(w);
4851 if (x == NULL) { 5437 if (x == NULL) {
4852 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5438 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4853 /* No mapping found means: use 1:1 mapping. */ 5439 /* No mapping found means: use 1:1 mapping. */
4854 PyErr_Clear(); 5440 PyErr_Clear();
4855 *result = NULL; 5441 *result = NULL;
4856 return 0; 5442 return 0;
4857 } else 5443 } else
4858 return -1; 5444 return -1;
4859 } 5445 }
4860 else if (x == Py_None) { 5446 else if (x == Py_None) {
4861 *result = x; 5447 *result = x;
4862 return 0; 5448 return 0;
4863 } 5449 }
4864 else if (PyInt_Check(x)) { 5450 else if (PyLong_Check(x)) {
4865 long value = PyInt_AS_LONG(x); 5451 long value = PyLong_AS_LONG(x);
4866 long max = PyUnicode_GetMax(); 5452 long max = PyUnicode_GetMax();
4867 if (value < 0 || value > max) { 5453 if (value < 0 || value > max) {
4868 PyErr_Format(PyExc_TypeError, 5454 PyErr_Format(PyExc_TypeError,
4869 "character mapping must be in range(0x%lx)", max+1); 5455 "character mapping must be in range(0x%x)", max+1);
4870 Py_DECREF(x); 5456 Py_DECREF(x);
4871 return -1; 5457 return -1;
4872 } 5458 }
4873 *result = x; 5459 *result = x;
4874 return 0; 5460 return 0;
4875 } 5461 }
4876 else if (PyUnicode_Check(x)) { 5462 else if (PyUnicode_Check(x)) {
4877 *result = x; 5463 *result = x;
4878 return 0; 5464 return 0;
4879 } 5465 }
4880 else { 5466 else {
4881 /* wrong return value */ 5467 /* wrong return value */
4882 PyErr_SetString(PyExc_TypeError, 5468 PyErr_SetString(PyExc_TypeError,
4883 "character mapping must return integer, None or unicode"); 5469 "character mapping must return integer, None or str");
4884 Py_DECREF(x); 5470 Py_DECREF(x);
4885 return -1; 5471 return -1;
4886 } 5472 }
4887 } 5473 }
4888 /* ensure that *outobj is at least requiredsize characters long, 5474 /* ensure that *outobj is at least requiredsize characters long,
4889 if not reallocate and adjust various state variables. 5475 if not reallocate and adjust various state variables.
4890 Return 0 on success, -1 on error */ 5476 Return 0 on success, -1 on error */
4891 static 5477 static
4892 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5478 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4893 Py_ssize_t requiredsize) 5479 Py_ssize_t requiredsize)
(...skipping 23 matching lines...) Expand all
4917 PyObject **res) 5503 PyObject **res)
4918 { 5504 {
4919 if (charmaptranslate_lookup(*curinp, mapping, res)) 5505 if (charmaptranslate_lookup(*curinp, mapping, res))
4920 return -1; 5506 return -1;
4921 if (*res==NULL) { 5507 if (*res==NULL) {
4922 /* not found => default to 1:1 mapping */ 5508 /* not found => default to 1:1 mapping */
4923 *(*outp)++ = *curinp; 5509 *(*outp)++ = *curinp;
4924 } 5510 }
4925 else if (*res==Py_None) 5511 else if (*res==Py_None)
4926 ; 5512 ;
4927 else if (PyInt_Check(*res)) { 5513 else if (PyLong_Check(*res)) {
4928 /* no overflow check, because we know that the space is enough */ 5514 /* no overflow check, because we know that the space is enough */
4929 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 5515 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
4930 } 5516 }
4931 else if (PyUnicode_Check(*res)) { 5517 else if (PyUnicode_Check(*res)) {
4932 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5518 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4933 if (repsize==1) { 5519 if (repsize==1) {
4934 /* no overflow check, because we know that the space is enough */ 5520 /* no overflow check, because we know that the space is enough */
4935 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5521 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4936 } 5522 }
4937 else if (repsize!=0) { 5523 else if (repsize!=0) {
4938 /* more than one character */ 5524 /* more than one character */
4939 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5525 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
(...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after
5201 for (p = collstart; p < collend; ++p) 5787 for (p = collstart; p < collend; ++p)
5202 output += sprintf(output, "&#%d;", (int)*p); 5788 output += sprintf(output, "&#%d;", (int)*p);
5203 p = collend; 5789 p = collend;
5204 break; 5790 break;
5205 default: 5791 default:
5206 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5792 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5207 encoding, reason, s, length, &exc, 5793 encoding, reason, s, length, &exc,
5208 collstart-s, collend-s, &newpos); 5794 collstart-s, collend-s, &newpos);
5209 if (repunicode == NULL) 5795 if (repunicode == NULL)
5210 goto onError; 5796 goto onError;
5797 if (!PyUnicode_Check(repunicode)) {
5798 /* Byte results not supported, since they have no decimal property. */
5799 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5800 Py_DECREF(repunicode);
5801 goto onError;
5802 }
5211 /* generate replacement */ 5803 /* generate replacement */
5212 repsize = PyUnicode_GET_SIZE(repunicode); 5804 repsize = PyUnicode_GET_SIZE(repunicode);
5213 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5805 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5214 Py_UNICODE ch = *uni2; 5806 Py_UNICODE ch = *uni2;
5215 if (Py_UNICODE_ISSPACE(ch)) 5807 if (Py_UNICODE_ISSPACE(ch))
5216 *output++ = ' '; 5808 *output++ = ' ';
5217 else { 5809 else {
5218 decimal = Py_UNICODE_TODECIMAL(ch); 5810 decimal = Py_UNICODE_TODECIMAL(ch);
5219 if (decimal >= 0) 5811 if (decimal >= 0)
5220 *output++ = '0' + decimal; 5812 *output++ = '0' + decimal;
(...skipping 20 matching lines...) Expand all
5241 onError: 5833 onError:
5242 Py_XDECREF(exc); 5834 Py_XDECREF(exc);
5243 Py_XDECREF(errorHandler); 5835 Py_XDECREF(errorHandler);
5244 return -1; 5836 return -1;
5245 } 5837 }
5246 5838
5247 /* --- Helpers ------------------------------------------------------------ */ 5839 /* --- Helpers ------------------------------------------------------------ */
5248 5840
5249 #include "stringlib/unicodedefs.h" 5841 #include "stringlib/unicodedefs.h"
5250 #include "stringlib/fastsearch.h" 5842 #include "stringlib/fastsearch.h"
5251
5252 #include "stringlib/count.h" 5843 #include "stringlib/count.h"
5844 /* Include _ParseTupleFinds from find.h */
5845 #define FROM_UNICODE
5253 #include "stringlib/find.h" 5846 #include "stringlib/find.h"
5254 #include "stringlib/partition.h" 5847 #include "stringlib/partition.h"
5255 #include "stringlib/split.h" 5848
5849 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5850 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
5851 #include "stringlib/localeutil.h"
5256 5852
5257 /* helper macro to fixup start/end slice values */ 5853 /* helper macro to fixup start/end slice values */
5258 #define ADJUST_INDICES(start, end, len) \ 5854 #define FIX_START_END(obj) \
5259 if (end > len) \ 5855 if (start < 0) \
5260 end = len; \ 5856 start += (obj)->length; \
5261 else if (end < 0) { \ 5857 if (start < 0) \
5262 end += len; \ 5858 start = 0; \
5263 if (end < 0) \ 5859 if (end > (obj)->length) \
5264 end = 0; \ 5860 end = (obj)->length; \
5265 } \ 5861 if (end < 0) \
5266 if (start < 0) { \ 5862 end += (obj)->length; \
5267 start += len; \ 5863 if (end < 0) \
5268 if (start < 0) \ 5864 end = 0;
5269 start = 0; \
5270 }
5271 5865
5272 Py_ssize_t PyUnicode_Count(PyObject *str, 5866 Py_ssize_t PyUnicode_Count(PyObject *str,
5273 PyObject *substr, 5867 PyObject *substr,
5274 Py_ssize_t start, 5868 Py_ssize_t start,
5275 Py_ssize_t end) 5869 Py_ssize_t end)
5276 { 5870 {
5277 Py_ssize_t result; 5871 Py_ssize_t result;
5278 PyUnicodeObject* str_obj; 5872 PyUnicodeObject* str_obj;
5279 PyUnicodeObject* sub_obj; 5873 PyUnicodeObject* sub_obj;
5280 5874
5281 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5282 if (!str_obj) 5876 if (!str_obj)
5283 return -1; 5877 return -1;
5284 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5285 if (!sub_obj) { 5879 if (!sub_obj) {
5286 Py_DECREF(str_obj); 5880 Py_DECREF(str_obj);
5287 return -1; 5881 return -1;
5288 } 5882 }
5289 5883
5290 ADJUST_INDICES(start, end, str_obj->length); 5884 FIX_START_END(str_obj);
5885
5291 result = stringlib_count( 5886 result = stringlib_count(
5292 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 5887 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5293 PY_SSIZE_T_MAX
5294 ); 5888 );
5295 5889
5296 Py_DECREF(sub_obj); 5890 Py_DECREF(sub_obj);
5297 Py_DECREF(str_obj); 5891 Py_DECREF(str_obj);
5298 5892
5299 return result; 5893 return result;
5300 } 5894 }
5301 5895
5302 Py_ssize_t PyUnicode_Find(PyObject *str, 5896 Py_ssize_t PyUnicode_Find(PyObject *str,
5303 PyObject *sub, 5897 PyObject *sub,
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
5338 static 5932 static
5339 int tailmatch(PyUnicodeObject *self, 5933 int tailmatch(PyUnicodeObject *self,
5340 PyUnicodeObject *substring, 5934 PyUnicodeObject *substring,
5341 Py_ssize_t start, 5935 Py_ssize_t start,
5342 Py_ssize_t end, 5936 Py_ssize_t end,
5343 int direction) 5937 int direction)
5344 { 5938 {
5345 if (substring->length == 0) 5939 if (substring->length == 0)
5346 return 1; 5940 return 1;
5347 5941
5348 ADJUST_INDICES(start, end, self->length); 5942 FIX_START_END(self);
5943
5349 end -= substring->length; 5944 end -= substring->length;
5350 if (end < start) 5945 if (end < start)
5351 return 0; 5946 return 0;
5352 5947
5353 if (direction > 0) { 5948 if (direction > 0) {
5354 if (Py_UNICODE_MATCH(self, end, substring)) 5949 if (Py_UNICODE_MATCH(self, end, substring))
5355 return 1; 5950 return 1;
5356 } else { 5951 } else {
5357 if (Py_UNICODE_MATCH(self, start, substring)) 5952 if (Py_UNICODE_MATCH(self, start, substring))
5358 return 1; 5953 return 1;
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after
5534 previous_is_cased = 1; 6129 previous_is_cased = 1;
5535 else 6130 else
5536 previous_is_cased = 0; 6131 previous_is_cased = 0;
5537 } 6132 }
5538 return 1; 6133 return 1;
5539 } 6134 }
5540 6135
5541 PyObject * 6136 PyObject *
5542 PyUnicode_Join(PyObject *separator, PyObject *seq) 6137 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 { 6138 {
5544 PyObject *internal_separator = NULL;
5545 const Py_UNICODE blank = ' '; 6139 const Py_UNICODE blank = ' ';
5546 const Py_UNICODE *sep = &blank; 6140 const Py_UNICODE *sep = &blank;
5547 Py_ssize_t seplen = 1; 6141 Py_ssize_t seplen = 1;
5548 PyUnicodeObject *res = NULL; /* the result */ 6142 PyUnicodeObject *res = NULL; /* the result */
5549 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5550 Py_ssize_t res_used; /* # used bytes */
5551 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5552 PyObject *fseq; /* PySequence_Fast(seq) */ 6144 PyObject *fseq; /* PySequence_Fast(seq) */
5553 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6145 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6146 PyObject **items;
5554 PyObject *item; 6147 PyObject *item;
5555 Py_ssize_t i; 6148 Py_ssize_t sz, i;
5556 6149
5557 fseq = PySequence_Fast(seq, ""); 6150 fseq = PySequence_Fast(seq, "");
5558 if (fseq == NULL) { 6151 if (fseq == NULL) {
5559 return NULL; 6152 return NULL;
5560 } 6153 }
5561 6154
5562 /* Grrrr. A codec may be invoked to convert str objects to 6155 /* NOTE: the following code can't call back into Python code,
5563 * Unicode, and so it's possible to call back into Python code 6156 * so we are sure that fseq won't be mutated.
5564 * during PyUnicode_FromObject(), and so it's possible for a sick
5565 * codec to change the size of fseq (if seq is a list). Therefore
5566 * we have to keep refetching the size -- can't assume seqlen
5567 * is invariant.
5568 */ 6157 */
6158
5569 seqlen = PySequence_Fast_GET_SIZE(fseq); 6159 seqlen = PySequence_Fast_GET_SIZE(fseq);
5570 /* If empty sequence, return u"". */ 6160 /* If empty sequence, return u"". */
5571 if (seqlen == 0) { 6161 if (seqlen == 0) {
5572 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6162 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5573 goto Done; 6163 goto Done;
5574 } 6164 }
6165 items = PySequence_Fast_ITEMS(fseq);
5575 /* If singleton sequence with an exact Unicode, return that. */ 6166 /* If singleton sequence with an exact Unicode, return that. */
5576 if (seqlen == 1) { 6167 if (seqlen == 1) {
5577 item = PySequence_Fast_GET_ITEM(fseq, 0); 6168 item = items[0];
5578 if (PyUnicode_CheckExact(item)) { 6169 if (PyUnicode_CheckExact(item)) {
5579 Py_INCREF(item); 6170 Py_INCREF(item);
5580 res = (PyUnicodeObject *)item; 6171 res = (PyUnicodeObject *)item;
5581 goto Done; 6172 goto Done;
5582 } 6173 }
5583 } 6174 }
5584 6175 else {
5585 /* At least two items to join, or one that isn't exact Unicode. */ 6176 /* Set up sep and seplen */
5586 if (seqlen > 1) {
5587 /* Set up sep and seplen -- they're needed. */
5588 if (separator == NULL) { 6177 if (separator == NULL) {
5589 sep = &blank; 6178 sep = &blank;
5590 seplen = 1; 6179 seplen = 1;
5591 } 6180 }
5592 else { 6181 else {
5593 internal_separator = PyUnicode_FromObject(separator); 6182 if (!PyUnicode_Check(separator)) {
5594 if (internal_separator == NULL) 6183 PyErr_Format(PyExc_TypeError,
6184 "separator: expected str instance,"
6185 " %.80s found",
6186 Py_TYPE(separator)->tp_name);
5595 goto onError; 6187 goto onError;
5596 sep = PyUnicode_AS_UNICODE(internal_separator); 6188 }
5597 seplen = PyUnicode_GET_SIZE(internal_separator); 6189 sep = PyUnicode_AS_UNICODE(separator);
5598 /* In case PyUnicode_FromObject() mutated seq. */ 6190 seplen = PyUnicode_GET_SIZE(separator);
5599 seqlen = PySequence_Fast_GET_SIZE(fseq); 6191 }
5600 } 6192 }
5601 } 6193
5602 6194 /* There are at least two things to join, or else we have a subclass
5603 /* Get space. */ 6195 * of str in the sequence.
5604 res = _PyUnicode_New(res_alloc); 6196 * Do a pre-pass to figure out the total amount of space we'll
5605 if (res == NULL) 6197 * need (sz), and see whether all argument are strings.
5606 goto onError; 6198 */
5607 res_p = PyUnicode_AS_UNICODE(res); 6199 sz = 0;
5608 res_used = 0; 6200 for (i = 0; i < seqlen; i++) {
5609 6201 const Py_ssize_t old_sz = sz;
5610 for (i = 0; i < seqlen; ++i) { 6202 item = items[i];
5611 Py_ssize_t itemlen; 6203 if (!PyUnicode_Check(item)) {
5612 Py_ssize_t new_res_used;
5613
5614 item = PySequence_Fast_GET_ITEM(fseq, i);
5615 /* Convert item to Unicode. */
5616 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5617 PyErr_Format(PyExc_TypeError,