| LEFT | RIGHT |
| 1 /* | 1 /* |
| 2 | 2 |
| 3 Unicode implementation based on original code by Fredrik Lundh, | 3 Unicode implementation based on original code by Fredrik Lundh, |
| 4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the | 4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the |
| 5 Unicode Integration Proposal (see file Misc/unicode.txt). | 5 Unicode Integration Proposal (see file Misc/unicode.txt). |
| 6 | 6 |
| 7 Major speed upgrades to the method implementations at the Reykjavik | 7 Major speed upgrades to the method implementations at the Reykjavik |
| 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. | 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. |
| 9 | 9 |
| 10 Copyright (c) Corporation for National Research Initiatives. | 10 Copyright (c) Corporation for National Research Initiatives. |
| (...skipping 23 matching lines...) Expand all Loading... |
| 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT | 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 38 -------------------------------------------------------------------- | 38 -------------------------------------------------------------------- |
| 39 | 39 |
| 40 */ | 40 */ |
| 41 | 41 |
| 42 #define PY_SSIZE_T_CLEAN | 42 #define PY_SSIZE_T_CLEAN |
| 43 #include "Python.h" | 43 #include "Python.h" |
| 44 #include "bytes_methods.h" |
| 44 | 45 |
| 45 #include "unicodeobject.h" | 46 #include "unicodeobject.h" |
| 46 #include "ucnhash.h" | 47 #include "ucnhash.h" |
| 47 | 48 |
| 48 #ifdef MS_WINDOWS | 49 #ifdef MS_WINDOWS |
| 49 #include <windows.h> | 50 #include <windows.h> |
| 50 #endif | 51 #endif |
| 51 | 52 |
| 52 /* Limit for the Unicode object free list */ | 53 /* Limit for the Unicode object free list */ |
| 53 | 54 |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 85 The globals are initialized by the _PyUnicode_Init() API and should | 86 The globals are initialized by the _PyUnicode_Init() API and should |
| 86 not be used before calling that API. | 87 not be used before calling that API. |
| 87 | 88 |
| 88 */ | 89 */ |
| 89 | 90 |
| 90 | 91 |
| 91 #ifdef __cplusplus | 92 #ifdef __cplusplus |
| 92 extern "C" { | 93 extern "C" { |
| 93 #endif | 94 #endif |
| 94 | 95 |
| 96 /* This dictionary holds all interned unicode strings. Note that references |
| 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. |
| 98 When the interned string reaches a refcnt of 0 the string deallocation |
| 99 function will delete the reference from this dictionary. |
| 100 |
| 101 Another way to look at this is to say that the actual reference |
| 102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) |
| 103 */ |
| 104 static PyObject *interned; |
| 105 |
| 95 /* Free list for Unicode objects */ | 106 /* Free list for Unicode objects */ |
| 96 static PyUnicodeObject *free_list; | 107 static PyUnicodeObject *free_list; |
| 97 static int numfree; | 108 static int numfree; |
| 98 | 109 |
| 99 /* The empty Unicode object is shared to improve performance. */ | 110 /* The empty Unicode object is shared to improve performance. */ |
| 100 static PyUnicodeObject *unicode_empty; | 111 static PyUnicodeObject *unicode_empty; |
| 101 | 112 |
| 102 /* Single character Unicode strings in the Latin-1 range are being | 113 /* Single character Unicode strings in the Latin-1 range are being |
| 103 shared as well. */ | 114 shared as well. */ |
| 104 static PyUnicodeObject *unicode_latin1[256]; | 115 static PyUnicodeObject *unicode_latin1[256]; |
| 105 | 116 |
| 106 /* Default encoding to use and assume when NULL is passed as encoding | 117 /* Default encoding to use and assume when NULL is passed as encoding |
| 107 parameter; it is initialized by _PyUnicode_Init(). | 118 parameter; it is fixed to "utf-8". Always use the |
| 108 | 119 PyUnicode_GetDefaultEncoding() API to access this global. |
| 109 Always use the PyUnicode_SetDefaultEncoding() and | 120 |
| 110 PyUnicode_GetDefaultEncoding() APIs to access this global. | 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the |
| 111 | 122 hard coded default! |
| 112 */ | 123 */ |
| 113 static char unicode_default_encoding[100]; | 124 static const char unicode_default_encoding[] = "utf-8"; |
| 114 | 125 |
| 115 /* Fast detection of the most frequent whitespace characters */ | 126 /* Fast detection of the most frequent whitespace characters */ |
| 116 const unsigned char _Py_ascii_whitespace[] = { | 127 const unsigned char _Py_ascii_whitespace[] = { |
| 117 0, 0, 0, 0, 0, 0, 0, 0, | 128 0, 0, 0, 0, 0, 0, 0, 0, |
| 118 /* case 0x0009: * CHARACTER TABULATION */ | 129 /* case 0x0009: * HORIZONTAL TABULATION */ |
| 119 /* case 0x000A: * LINE FEED */ | 130 /* case 0x000A: * LINE FEED */ |
| 120 /* case 0x000B: * LINE TABULATION */ | 131 /* case 0x000B: * VERTICAL TABULATION */ |
| 121 /* case 0x000C: * FORM FEED */ | 132 /* case 0x000C: * FORM FEED */ |
| 122 /* case 0x000D: * CARRIAGE RETURN */ | 133 /* case 0x000D: * CARRIAGE RETURN */ |
| 123 0, 1, 1, 1, 1, 1, 0, 0, | 134 0, 1, 1, 1, 1, 1, 0, 0, |
| 124 0, 0, 0, 0, 0, 0, 0, 0, | 135 0, 0, 0, 0, 0, 0, 0, 0, |
| 125 /* case 0x001C: * FILE SEPARATOR */ | 136 /* case 0x001C: * FILE SEPARATOR */ |
| 126 /* case 0x001D: * GROUP SEPARATOR */ | 137 /* case 0x001D: * GROUP SEPARATOR */ |
| 127 /* case 0x001E: * RECORD SEPARATOR */ | 138 /* case 0x001E: * RECORD SEPARATOR */ |
| 128 /* case 0x001F: * UNIT SEPARATOR */ | 139 /* case 0x001F: * UNIT SEPARATOR */ |
| 129 0, 0, 0, 0, 1, 1, 1, 1, | 140 0, 0, 0, 0, 1, 1, 1, 1, |
| 130 /* case 0x0020: * SPACE */ | 141 /* case 0x0020: * SPACE */ |
| 131 1, 0, 0, 0, 0, 0, 0, 0, | 142 1, 0, 0, 0, 0, 0, 0, 0, |
| 132 0, 0, 0, 0, 0, 0, 0, 0, | 143 0, 0, 0, 0, 0, 0, 0, 0, |
| 133 0, 0, 0, 0, 0, 0, 0, 0, | 144 0, 0, 0, 0, 0, 0, 0, 0, |
| 134 0, 0, 0, 0, 0, 0, 0, 0, | 145 0, 0, 0, 0, 0, 0, 0, 0, |
| 135 | 146 |
| 136 0, 0, 0, 0, 0, 0, 0, 0, | 147 0, 0, 0, 0, 0, 0, 0, 0, |
| 137 0, 0, 0, 0, 0, 0, 0, 0, | 148 0, 0, 0, 0, 0, 0, 0, 0, |
| 138 0, 0, 0, 0, 0, 0, 0, 0, | 149 0, 0, 0, 0, 0, 0, 0, 0, |
| 139 0, 0, 0, 0, 0, 0, 0, 0, | 150 0, 0, 0, 0, 0, 0, 0, 0, |
| 140 0, 0, 0, 0, 0, 0, 0, 0, | 151 0, 0, 0, 0, 0, 0, 0, 0, |
| 141 0, 0, 0, 0, 0, 0, 0, 0, | 152 0, 0, 0, 0, 0, 0, 0, 0, |
| 142 0, 0, 0, 0, 0, 0, 0, 0, | 153 0, 0, 0, 0, 0, 0, 0, 0, |
| 143 0, 0, 0, 0, 0, 0, 0, 0 | 154 0, 0, 0, 0, 0, 0, 0, 0 |
| 144 }; | 155 }; |
| 145 | 156 |
| 157 static PyObject *unicode_encode_call_errorhandler(const char *errors, |
| 158 PyObject **errorHandler,const char *encoding, const char *reason, |
| 159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, |
| 160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); |
| 161 |
| 162 static void raise_encode_exception(PyObject **exceptionObject, |
| 163 const char *encoding, |
| 164 const Py_UNICODE *unicode, Py_ssize_t size, |
| 165 Py_ssize_t startpos, Py_ssize_t endpos, |
| 166 const char *reason); |
| 167 |
| 146 /* Same for linebreaks */ | 168 /* Same for linebreaks */ |
| 147 static unsigned char ascii_linebreak[] = { | 169 static unsigned char ascii_linebreak[] = { |
| 148 0, 0, 0, 0, 0, 0, 0, 0, | 170 0, 0, 0, 0, 0, 0, 0, 0, |
| 149 /* 0x000A, * LINE FEED */ | 171 /* 0x000A, * LINE FEED */ |
| 150 /* 0x000B, * LINE TABULATION */ | |
| 151 /* 0x000C, * FORM FEED */ | |
| 152 /* 0x000D, * CARRIAGE RETURN */ | 172 /* 0x000D, * CARRIAGE RETURN */ |
| 153 0, 0, 1, 1, 1, 1, 0, 0, | 173 0, 0, 1, 0, 0, 1, 0, 0, |
| 154 0, 0, 0, 0, 0, 0, 0, 0, | 174 0, 0, 0, 0, 0, 0, 0, 0, |
| 155 /* 0x001C, * FILE SEPARATOR */ | 175 /* 0x001C, * FILE SEPARATOR */ |
| 156 /* 0x001D, * GROUP SEPARATOR */ | 176 /* 0x001D, * GROUP SEPARATOR */ |
| 157 /* 0x001E, * RECORD SEPARATOR */ | 177 /* 0x001E, * RECORD SEPARATOR */ |
| 158 0, 0, 0, 0, 1, 1, 1, 0, | 178 0, 0, 0, 0, 1, 1, 1, 0, |
| 159 0, 0, 0, 0, 0, 0, 0, 0, | 179 0, 0, 0, 0, 0, 0, 0, 0, |
| 160 0, 0, 0, 0, 0, 0, 0, 0, | 180 0, 0, 0, 0, 0, 0, 0, 0, |
| 161 0, 0, 0, 0, 0, 0, 0, 0, | 181 0, 0, 0, 0, 0, 0, 0, 0, |
| 162 0, 0, 0, 0, 0, 0, 0, 0, | 182 0, 0, 0, 0, 0, 0, 0, 0, |
| 163 | 183 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 185 } | 205 } |
| 186 | 206 |
| 187 /* --- Bloom Filters ----------------------------------------------------- */ | 207 /* --- Bloom Filters ----------------------------------------------------- */ |
| 188 | 208 |
| 189 /* stuff to implement simple "bloom filters" for Unicode characters. | 209 /* stuff to implement simple "bloom filters" for Unicode characters. |
| 190 to keep things simple, we use a single bitmask, using the least 5 | 210 to keep things simple, we use a single bitmask, using the least 5 |
| 191 bits from each unicode character as the bit index. */ | 211 bits from each unicode character as the bit index. */ |
| 192 | 212 |
| 193 /* the linebreak mask is set up by Unicode_Init below */ | 213 /* the linebreak mask is set up by Unicode_Init below */ |
| 194 | 214 |
| 195 #if LONG_BIT >= 128 | |
| 196 #define BLOOM_WIDTH 128 | |
| 197 #elif LONG_BIT >= 64 | |
| 198 #define BLOOM_WIDTH 64 | |
| 199 #elif LONG_BIT >= 32 | |
| 200 #define BLOOM_WIDTH 32 | |
| 201 #else | |
| 202 #error "LONG_BIT is smaller than 32" | |
| 203 #endif | |
| 204 | |
| 205 #define BLOOM_MASK unsigned long | 215 #define BLOOM_MASK unsigned long |
| 206 | 216 |
| 207 static BLOOM_MASK bloom_linebreak; | 217 static BLOOM_MASK bloom_linebreak; |
| 208 | 218 |
| 209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) | 219 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) |
| 210 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) | |
| 211 | 220 |
| 212 #define BLOOM_LINEBREAK(ch) \ | 221 #define BLOOM_LINEBREAK(ch) \ |
| 213 ((ch) < 128U ? ascii_linebreak[(ch)] : \ | 222 ((ch) < 128U ? ascii_linebreak[(ch)] : \ |
| 214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) | 223 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) |
| 215 | 224 |
| 216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) | 225 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) |
| 217 { | 226 { |
| 218 /* calculate simple bloom-style bitmask for a given unicode string */ | 227 /* calculate simple bloom-style bitmask for a given unicode string */ |
| 219 | 228 |
| 220 BLOOM_MASK mask; | 229 long mask; |
| 221 Py_ssize_t i; | 230 Py_ssize_t i; |
| 222 | 231 |
| 223 mask = 0; | 232 mask = 0; |
| 224 for (i = 0; i < len; i++) | 233 for (i = 0; i < len; i++) |
| 225 BLOOM_ADD(mask, ptr[i]); | 234 mask |= (1 << (ptr[i] & 0x1F)); |
| 226 | 235 |
| 227 return mask; | 236 return mask; |
| 228 } | 237 } |
| 229 | 238 |
| 230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) | 239 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) |
| 231 { | 240 { |
| 232 Py_ssize_t i; | 241 Py_ssize_t i; |
| 233 | 242 |
| 234 for (i = 0; i < setlen; i++) | 243 for (i = 0; i < setlen; i++) |
| 235 if (set[i] == chr) | 244 if (set[i] == chr) |
| (...skipping 19 matching lines...) Expand all Loading... |
| 255 | 264 |
| 256 /* Resizing shared object (unicode_empty or single character | 265 /* Resizing shared object (unicode_empty or single character |
| 257 objects) in-place is not allowed. Use PyUnicode_Resize() | 266 objects) in-place is not allowed. Use PyUnicode_Resize() |
| 258 instead ! */ | 267 instead ! */ |
| 259 | 268 |
| 260 if (unicode == unicode_empty || | 269 if (unicode == unicode_empty || |
| 261 (unicode->length == 1 && | 270 (unicode->length == 1 && |
| 262 unicode->str[0] < 256U && | 271 unicode->str[0] < 256U && |
| 263 unicode_latin1[unicode->str[0]] == unicode)) { | 272 unicode_latin1[unicode->str[0]] == unicode)) { |
| 264 PyErr_SetString(PyExc_SystemError, | 273 PyErr_SetString(PyExc_SystemError, |
| 265 "can't resize shared unicode objects"); | 274 "can't resize shared str objects"); |
| 266 return -1; | 275 return -1; |
| 267 } | 276 } |
| 268 | 277 |
| 269 /* We allocate one more byte to make sure the string is Ux0000 terminated. | 278 /* We allocate one more byte to make sure the string is Ux0000 terminated. |
| 270 The overallocation is also used by fastsearch, which assumes that it's | 279 The overallocation is also used by fastsearch, which assumes that it's |
| 271 safe to look at str[length] (without making any assumptions about what | 280 safe to look at str[length] (without making any assumptions about what |
| 272 it contains). */ | 281 it contains). */ |
| 273 | 282 |
| 274 oldstr = unicode->str; | 283 oldstr = unicode->str; |
| 275 unicode->str = PyObject_REALLOC(unicode->str, | 284 unicode->str = PyObject_REALLOC(unicode->str, |
| (...skipping 10 matching lines...) Expand all Loading... |
| 286 /* Reset the object caches */ | 295 /* Reset the object caches */ |
| 287 if (unicode->defenc) { | 296 if (unicode->defenc) { |
| 288 Py_CLEAR(unicode->defenc); | 297 Py_CLEAR(unicode->defenc); |
| 289 } | 298 } |
| 290 unicode->hash = -1; | 299 unicode->hash = -1; |
| 291 | 300 |
| 292 return 0; | 301 return 0; |
| 293 } | 302 } |
| 294 | 303 |
| 295 /* We allocate one more byte to make sure the string is | 304 /* We allocate one more byte to make sure the string is |
| 296 Ux0000 terminated; some code relies on that. | 305 Ux0000 terminated; some code (e.g. new_identifier) |
| 306 relies on that. |
| 297 | 307 |
| 298 XXX This allocator could further be enhanced by assuring that the | 308 XXX This allocator could further be enhanced by assuring that the |
| 299 free list never reduces its size below 1. | 309 free list never reduces its size below 1. |
| 300 | 310 |
| 301 */ | 311 */ |
| 302 | 312 |
| 303 static | 313 static |
| 304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) | 314 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) |
| 305 { | 315 { |
| 306 register PyUnicodeObject *unicode; | 316 register PyUnicodeObject *unicode; |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 353 * the caller fails before initializing str -- unicode_resize() | 363 * the caller fails before initializing str -- unicode_resize() |
| 354 * reads str[0], and the Keep-Alive optimization can keep memory | 364 * reads str[0], and the Keep-Alive optimization can keep memory |
| 355 * allocated for str alive across a call to unicode_dealloc(unicode). | 365 * allocated for str alive across a call to unicode_dealloc(unicode). |
| 356 * We don't want unicode_resize to read uninitialized memory in | 366 * We don't want unicode_resize to read uninitialized memory in |
| 357 * that case. | 367 * that case. |
| 358 */ | 368 */ |
| 359 unicode->str[0] = 0; | 369 unicode->str[0] = 0; |
| 360 unicode->str[length] = 0; | 370 unicode->str[length] = 0; |
| 361 unicode->length = length; | 371 unicode->length = length; |
| 362 unicode->hash = -1; | 372 unicode->hash = -1; |
| 373 unicode->state = 0; |
| 363 unicode->defenc = NULL; | 374 unicode->defenc = NULL; |
| 364 return unicode; | 375 return unicode; |
| 365 | 376 |
| 366 onError: | 377 onError: |
| 367 /* XXX UNREF/NEWREF interface should be more symmetrical */ | 378 /* XXX UNREF/NEWREF interface should be more symmetrical */ |
| 368 _Py_DEC_REFTOTAL; | 379 _Py_DEC_REFTOTAL; |
| 369 _Py_ForgetReference((PyObject *)unicode); | 380 _Py_ForgetReference((PyObject *)unicode); |
| 370 PyObject_Del(unicode); | 381 PyObject_Del(unicode); |
| 371 return NULL; | 382 return NULL; |
| 372 } | 383 } |
| 373 | 384 |
| 374 static | 385 static |
| 375 void unicode_dealloc(register PyUnicodeObject *unicode) | 386 void unicode_dealloc(register PyUnicodeObject *unicode) |
| 376 { | 387 { |
| 388 switch (PyUnicode_CHECK_INTERNED(unicode)) { |
| 389 case SSTATE_NOT_INTERNED: |
| 390 break; |
| 391 |
| 392 case SSTATE_INTERNED_MORTAL: |
| 393 /* revive dead object temporarily for DelItem */ |
| 394 Py_REFCNT(unicode) = 3; |
| 395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) |
| 396 Py_FatalError( |
| 397 "deletion of interned string failed"); |
| 398 break; |
| 399 |
| 400 case SSTATE_INTERNED_IMMORTAL: |
| 401 Py_FatalError("Immortal interned string died."); |
| 402 |
| 403 default: |
| 404 Py_FatalError("Inconsistent interned string state."); |
| 405 } |
| 406 |
| 377 if (PyUnicode_CheckExact(unicode) && | 407 if (PyUnicode_CheckExact(unicode) && |
| 378 numfree < PyUnicode_MAXFREELIST) { | 408 numfree < PyUnicode_MAXFREELIST) { |
| 379 /* Keep-Alive optimization */ | 409 /* Keep-Alive optimization */ |
| 380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { | 410 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { |
| 381 PyObject_DEL(unicode->str); | 411 PyObject_DEL(unicode->str); |
| 382 unicode->str = NULL; | 412 unicode->str = NULL; |
| 383 unicode->length = 0; | 413 unicode->length = 0; |
| 384 } | 414 } |
| 385 if (unicode->defenc) { | 415 if (unicode->defenc) { |
| 386 Py_CLEAR(unicode->defenc); | 416 Py_CLEAR(unicode->defenc); |
| (...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 551 | 581 |
| 552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, | 582 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, |
| 553 Py_ssize_t size) | 583 Py_ssize_t size) |
| 554 { | 584 { |
| 555 PyUnicodeObject *unicode; | 585 PyUnicodeObject *unicode; |
| 556 register Py_ssize_t i; | 586 register Py_ssize_t i; |
| 557 Py_ssize_t alloc; | 587 Py_ssize_t alloc; |
| 558 const wchar_t *orig_w; | 588 const wchar_t *orig_w; |
| 559 | 589 |
| 560 if (w == NULL) { | 590 if (w == NULL) { |
| 591 if (size == 0) |
| 592 return PyUnicode_FromStringAndSize(NULL, 0); |
| 561 PyErr_BadInternalCall(); | 593 PyErr_BadInternalCall(); |
| 562 return NULL; | 594 return NULL; |
| 595 } |
| 596 |
| 597 if (size == -1) { |
| 598 size = wcslen(w); |
| 563 } | 599 } |
| 564 | 600 |
| 565 alloc = size; | 601 alloc = size; |
| 566 orig_w = w; | 602 orig_w = w; |
| 567 for (i = size; i > 0; i--) { | 603 for (i = size; i > 0; i--) { |
| 568 if (*w > 0xFFFF) | 604 if (*w > 0xFFFF) |
| 569 alloc++; | 605 alloc++; |
| 570 w++; | 606 w++; |
| 571 } | 607 } |
| 572 w = orig_w; | 608 w = orig_w; |
| (...skipping 20 matching lines...) Expand all Loading... |
| 593 } | 629 } |
| 594 | 630 |
| 595 #else | 631 #else |
| 596 | 632 |
| 597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, | 633 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, |
| 598 Py_ssize_t size) | 634 Py_ssize_t size) |
| 599 { | 635 { |
| 600 PyUnicodeObject *unicode; | 636 PyUnicodeObject *unicode; |
| 601 | 637 |
| 602 if (w == NULL) { | 638 if (w == NULL) { |
| 639 if (size == 0) |
| 640 return PyUnicode_FromStringAndSize(NULL, 0); |
| 603 PyErr_BadInternalCall(); | 641 PyErr_BadInternalCall(); |
| 604 return NULL; | 642 return NULL; |
| 643 } |
| 644 |
| 645 if (size == -1) { |
| 646 size = wcslen(w); |
| 605 } | 647 } |
| 606 | 648 |
| 607 unicode = _PyUnicode_New(size); | 649 unicode = _PyUnicode_New(size); |
| 608 if (!unicode) | 650 if (!unicode) |
| 609 return NULL; | 651 return NULL; |
| 610 | 652 |
| 611 /* Copy the wchar_t data into the new object */ | 653 /* Copy the wchar_t data into the new object */ |
| 612 #ifdef HAVE_USABLE_WCHAR_T | 654 #ifdef HAVE_USABLE_WCHAR_T |
| 613 memcpy(unicode->str, w, size * sizeof(wchar_t)); | 655 memcpy(unicode->str, w, size * sizeof(wchar_t)); |
| 614 #else | 656 #else |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 678 | 720 |
| 679 #ifdef VA_LIST_IS_ARRAY | 721 #ifdef VA_LIST_IS_ARRAY |
| 680 Py_MEMCPY(count, vargs, sizeof(va_list)); | 722 Py_MEMCPY(count, vargs, sizeof(va_list)); |
| 681 #else | 723 #else |
| 682 #ifdef __va_copy | 724 #ifdef __va_copy |
| 683 __va_copy(count, vargs); | 725 __va_copy(count, vargs); |
| 684 #else | 726 #else |
| 685 count = vargs; | 727 count = vargs; |
| 686 #endif | 728 #endif |
| 687 #endif | 729 #endif |
| 688 /* step 1: count the number of %S/%R/%s format specifications | 730 /* step 1: count the number of %S/%R/%A/%s format specifications |
| 689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these | 731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ |
| 690 * objects once during step 3 and put the result in an array) */ | 732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the |
| 733 * result in an array) */ |
| 691 for (f = format; *f; f++) { | 734 for (f = format; *f; f++) { |
| 692 if (*f == '%') { | 735 if (*f == '%') { |
| 693 if (*(f+1)=='%') | 736 if (*(f+1)=='%') |
| 694 continue; | 737 continue; |
| 695 if (*(f+1)=='S' || *(f+1)=='R') | 738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') |
| 696 ++callcount; | 739 ++callcount; |
| 697 while (isdigit((unsigned)*f)) | 740 while (ISDIGIT((unsigned)*f)) |
| 698 width = (width*10) + *f++ - '0'; | 741 width = (width*10) + *f++ - '0'; |
| 699 while (*++f && *f != '%' && !isalpha((unsigned)*f)) | 742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) |
| 700 ; | 743 ; |
| 701 if (*f == 's') | 744 if (*f == 's') |
| 702 ++callcount; | 745 ++callcount; |
| 703 } | 746 } |
| 704 } | 747 } |
| 705 /* step 2: allocate memory for the results of | 748 /* step 2: allocate memory for the results of |
| 706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ | 749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ |
| 707 if (callcount) { | 750 if (callcount) { |
| 708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); | 751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); |
| 709 if (!callresults) { | 752 if (!callresults) { |
| 710 PyErr_NoMemory(); | 753 PyErr_NoMemory(); |
| 711 return NULL; | 754 return NULL; |
| 712 } | 755 } |
| 713 callresult = callresults; | 756 callresult = callresults; |
| 714 } | 757 } |
| 715 /* step 3: figure out how large a buffer we need */ | 758 /* step 3: figure out how large a buffer we need */ |
| 716 for (f = format; *f; f++) { | 759 for (f = format; *f; f++) { |
| 717 if (*f == '%') { | 760 if (*f == '%') { |
| 718 const char* p = f; | 761 const char* p = f; |
| 719 width = 0; | 762 width = 0; |
| 720 while (isdigit((unsigned)*f)) | 763 while (ISDIGIT((unsigned)*f)) |
| 721 width = (width*10) + *f++ - '0'; | 764 width = (width*10) + *f++ - '0'; |
| 722 while (*++f && *f != '%' && !isalpha((unsigned)*f)) | 765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) |
| 723 ; | 766 ; |
| 724 | 767 |
| 725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since | 768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since |
| 726 * they don't affect the amount of space we reserve. | 769 * they don't affect the amount of space we reserve. |
| 727 */ | 770 */ |
| 728 if ((*f == 'l' || *f == 'z') && | 771 if ((*f == 'l' || *f == 'z') && |
| 729 (f[1] == 'd' || f[1] == 'u')) | 772 (f[1] == 'd' || f[1] == 'u')) |
| 730 ++f; | 773 ++f; |
| 731 | 774 |
| 732 switch (*f) { | 775 switch (*f) { |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 799 PyObject *repr; | 842 PyObject *repr; |
| 800 assert(obj); | 843 assert(obj); |
| 801 repr = PyObject_Repr(obj); | 844 repr = PyObject_Repr(obj); |
| 802 if (!repr) | 845 if (!repr) |
| 803 goto fail; | 846 goto fail; |
| 804 n += PyUnicode_GET_SIZE(repr); | 847 n += PyUnicode_GET_SIZE(repr); |
| 805 /* Remember the repr and switch to the next slot */ | 848 /* Remember the repr and switch to the next slot */ |
| 806 *callresult++ = repr; | 849 *callresult++ = repr; |
| 807 break; | 850 break; |
| 808 } | 851 } |
| 852 case 'A': |
| 853 { |
| 854 PyObject *obj = va_arg(count, PyObject *); |
| 855 PyObject *ascii; |
| 856 assert(obj); |
| 857 ascii = PyObject_ASCII(obj); |
| 858 if (!ascii) |
| 859 goto fail; |
| 860 n += PyUnicode_GET_SIZE(ascii); |
| 861 /* Remember the ascii result and switch to the next slot */ |
| 862 *callresult++ = ascii; |
| 863 break; |
| 864 } |
| 809 case 'p': | 865 case 'p': |
| 810 (void) va_arg(count, int); | 866 (void) va_arg(count, int); |
| 811 /* maximum 64-bit pointer representation: | 867 /* maximum 64-bit pointer representation: |
| 812 * 0xffffffffffffffff | 868 * 0xffffffffffffffff |
| 813 * so 19 characters is enough. | 869 * so 19 characters is enough. |
| 814 * XXX I count 18 -- what's the extra for? | 870 * XXX I count 18 -- what's the extra for? |
| 815 */ | 871 */ |
| 816 n += 19; | 872 n += 19; |
| 817 break; | 873 break; |
| 818 default: | 874 default: |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 851 callresult = callresults; | 907 callresult = callresults; |
| 852 | 908 |
| 853 for (f = format; *f; f++) { | 909 for (f = format; *f; f++) { |
| 854 if (*f == '%') { | 910 if (*f == '%') { |
| 855 const char* p = f++; | 911 const char* p = f++; |
| 856 int longflag = 0; | 912 int longflag = 0; |
| 857 int size_tflag = 0; | 913 int size_tflag = 0; |
| 858 zeropad = (*f == '0'); | 914 zeropad = (*f == '0'); |
| 859 /* parse the width.precision part */ | 915 /* parse the width.precision part */ |
| 860 width = 0; | 916 width = 0; |
| 861 while (isdigit((unsigned)*f)) | 917 while (ISDIGIT((unsigned)*f)) |
| 862 width = (width*10) + *f++ - '0'; | 918 width = (width*10) + *f++ - '0'; |
| 863 precision = 0; | 919 precision = 0; |
| 864 if (*f == '.') { | 920 if (*f == '.') { |
| 865 f++; | 921 f++; |
| 866 while (isdigit((unsigned)*f)) | 922 while (ISDIGIT((unsigned)*f)) |
| 867 precision = (precision*10) + *f++ - '0'; | 923 precision = (precision*10) + *f++ - '0'; |
| 868 } | 924 } |
| 869 /* handle the long flag, but only for %ld and %lu. | 925 /* handle the long flag, but only for %ld and %lu. |
| 870 others can be added when necessary. */ | 926 others can be added when necessary. */ |
| 871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { | 927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { |
| 872 longflag = 1; | 928 longflag = 1; |
| 873 ++f; | 929 ++f; |
| 874 } | 930 } |
| 875 /* handle the size_t flag. */ | 931 /* handle the size_t flag. */ |
| 876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { | 932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { |
| (...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1054 if (size > PyUnicode_GET_SIZE(unicode)) | 1110 if (size > PyUnicode_GET_SIZE(unicode)) |
| 1055 return PyUnicode_GET_SIZE(unicode); | 1111 return PyUnicode_GET_SIZE(unicode); |
| 1056 else | 1112 else |
| 1057 return size; | 1113 return size; |
| 1058 } | 1114 } |
| 1059 | 1115 |
| 1060 #endif | 1116 #endif |
| 1061 | 1117 |
| 1062 PyObject *PyUnicode_FromOrdinal(int ordinal) | 1118 PyObject *PyUnicode_FromOrdinal(int ordinal) |
| 1063 { | 1119 { |
| 1064 Py_UNICODE s[1]; | 1120 Py_UNICODE s[2]; |
| 1065 | 1121 |
| 1066 #ifdef Py_UNICODE_WIDE | |
| 1067 if (ordinal < 0 || ordinal > 0x10ffff) { | 1122 if (ordinal < 0 || ordinal > 0x10ffff) { |
| 1068 PyErr_SetString(PyExc_ValueError, | 1123 PyErr_SetString(PyExc_ValueError, |
| 1069 "unichr() arg not in range(0x110000) " | 1124 "chr() arg not in range(0x110000)"); |
| 1070 "(wide Python build)"); | 1125 return NULL; |
| 1071 return NULL; | 1126 } |
| 1072 } | 1127 |
| 1073 #else | 1128 #ifndef Py_UNICODE_WIDE |
| 1074 if (ordinal < 0 || ordinal > 0xffff) { | 1129 if (ordinal > 0xffff) { |
| 1075 PyErr_SetString(PyExc_ValueError, | 1130 ordinal -= 0x10000; |
| 1076 "unichr() arg not in range(0x10000) " | 1131 s[0] = 0xD800 | (ordinal >> 10); |
| 1077 "(narrow Python build)"); | 1132 s[1] = 0xDC00 | (ordinal & 0x3FF); |
| 1078 return NULL; | 1133 return PyUnicode_FromUnicode(s, 2); |
| 1079 } | 1134 } |
| 1080 #endif | 1135 #endif |
| 1081 | 1136 |
| 1082 s[0] = (Py_UNICODE)ordinal; | 1137 s[0] = (Py_UNICODE)ordinal; |
| 1083 return PyUnicode_FromUnicode(s, 1); | 1138 return PyUnicode_FromUnicode(s, 1); |
| 1084 } | 1139 } |
| 1085 | 1140 |
| 1086 PyObject *PyUnicode_FromObject(register PyObject *obj) | 1141 PyObject *PyUnicode_FromObject(register PyObject *obj) |
| 1087 { | 1142 { |
| 1088 /* XXX Perhaps we should make this API an alias of | 1143 /* XXX Perhaps we should make this API an alias of |
| 1089 PyObject_Unicode() instead ?! */ | 1144 PyObject_Str() instead ?! */ |
| 1090 if (PyUnicode_CheckExact(obj)) { | 1145 if (PyUnicode_CheckExact(obj)) { |
| 1091 Py_INCREF(obj); | 1146 Py_INCREF(obj); |
| 1092 return obj; | 1147 return obj; |
| 1093 } | 1148 } |
| 1094 if (PyUnicode_Check(obj)) { | 1149 if (PyUnicode_Check(obj)) { |
| 1095 /* For a Unicode subtype that's not a Unicode object, | 1150 /* For a Unicode subtype that's not a Unicode object, |
| 1096 return a true Unicode object with the same data. */ | 1151 return a true Unicode object with the same data. */ |
| 1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), | 1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), |
| 1098 PyUnicode_GET_SIZE(obj)); | 1153 PyUnicode_GET_SIZE(obj)); |
| 1099 } | 1154 } |
| 1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); | 1155 PyErr_Format(PyExc_TypeError, |
| 1156 "Can't convert '%.100s' object to str implicitly", |
| 1157 Py_TYPE(obj)->tp_name); |
| 1158 return NULL; |
| 1101 } | 1159 } |
| 1102 | 1160 |
| 1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, | 1161 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, |
| 1104 const char *encoding, | 1162 const char *encoding, |
| 1105 const char *errors) | 1163 const char *errors) |
| 1106 { | 1164 { |
| 1107 const char *s = NULL; | 1165 Py_buffer buffer; |
| 1108 Py_ssize_t len; | |
| 1109 PyObject *v; | 1166 PyObject *v; |
| 1110 | 1167 |
| 1111 if (obj == NULL) { | 1168 if (obj == NULL) { |
| 1112 PyErr_BadInternalCall(); | 1169 PyErr_BadInternalCall(); |
| 1113 return NULL; | 1170 return NULL; |
| 1114 } | 1171 } |
| 1115 | 1172 |
| 1116 #if 0 | 1173 /* Decoding bytes objects is the most common case and should be fast */ |
| 1117 /* For b/w compatibility we also accept Unicode objects provided | 1174 if (PyBytes_Check(obj)) { |
| 1118 that no encodings is given and then redirect to | 1175 if (PyBytes_GET_SIZE(obj) == 0) { |
| 1119 PyObject_Unicode() which then applies the additional logic for | 1176 Py_INCREF(unicode_empty); |
| 1120 Unicode subclasses. | 1177 v = (PyObject *) unicode_empty; |
| 1121 | 1178 } |
| 1122 NOTE: This API should really only be used for object which | 1179 else { |
| 1123 represent *encoded* Unicode ! | 1180 v = PyUnicode_Decode( |
| 1124 | 1181 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), |
| 1125 */ | 1182 encoding, errors); |
| 1126 if (PyUnicode_Check(obj)) { | 1183 } |
| 1127 if (encoding) { | 1184 return v; |
| 1128 PyErr_SetString(PyExc_TypeError, | 1185 } |
| 1129 "decoding Unicode is not supported"); | 1186 |
| 1130 return NULL; | |
| 1131 } | |
| 1132 return PyObject_Unicode(obj); | |
| 1133 } | |
| 1134 #else | |
| 1135 if (PyUnicode_Check(obj)) { | 1187 if (PyUnicode_Check(obj)) { |
| 1136 PyErr_SetString(PyExc_TypeError, | 1188 PyErr_SetString(PyExc_TypeError, |
| 1137 "decoding Unicode is not supported"); | 1189 "decoding str is not supported"); |
| 1138 return NULL; | 1190 return NULL; |
| 1139 } | 1191 } |
| 1140 #endif | 1192 |
| 1141 | 1193 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ |
| 1142 /* Coerce object */ | 1194 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { |
| 1143 if (PyString_Check(obj)) { | |
| 1144 s = PyString_AS_STRING(obj); | |
| 1145 len = PyString_GET_SIZE(obj); | |
| 1146 } | |
| 1147 else if (PyByteArray_Check(obj)) { | |
| 1148 /* Python 2.x specific */ | |
| 1149 PyErr_Format(PyExc_TypeError, | 1195 PyErr_Format(PyExc_TypeError, |
| 1150 "decoding bytearray is not supported"); | 1196 "coercing to str: need bytes, bytearray " |
| 1151 return NULL; | 1197 "or buffer-like object, %.80s found", |
| 1152 } | 1198 Py_TYPE(obj)->tp_name); |
| 1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) { | 1199 return NULL; |
| 1154 /* Overwrite the error message with something more useful in | 1200 } |
| 1155 case of a TypeError. */ | 1201 |
| 1156 if (PyErr_ExceptionMatches(PyExc_TypeError)) | 1202 if (buffer.len == 0) { |
| 1157 PyErr_Format(PyExc_TypeError, | |
| 1158 "coercing to Unicode: need string or buffer, " | |
| 1159 "%.80s found", | |
| 1160 Py_TYPE(obj)->tp_name); | |
| 1161 goto onError; | |
| 1162 } | |
| 1163 | |
| 1164 /* Convert to Unicode */ | |
| 1165 if (len == 0) { | |
| 1166 Py_INCREF(unicode_empty); | 1203 Py_INCREF(unicode_empty); |
| 1167 v = (PyObject *)unicode_empty; | 1204 v = (PyObject *) unicode_empty; |
| 1168 } | 1205 } |
| 1169 else | 1206 else |
| 1170 v = PyUnicode_Decode(s, len, encoding, errors); | 1207 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); |
| 1171 | 1208 |
| 1209 PyBuffer_Release(&buffer); |
| 1172 return v; | 1210 return v; |
| 1173 | |
| 1174 onError: | |
| 1175 return NULL; | |
| 1176 } | 1211 } |
| 1177 | 1212 |
| 1178 PyObject *PyUnicode_Decode(const char *s, | 1213 PyObject *PyUnicode_Decode(const char *s, |
| 1179 Py_ssize_t size, | 1214 Py_ssize_t size, |
| 1180 const char *encoding, | 1215 const char *encoding, |
| 1181 const char *errors) | 1216 const char *errors) |
| 1182 { | 1217 { |
| 1183 PyObject *buffer = NULL, *unicode; | 1218 PyObject *buffer = NULL, *unicode; |
| 1219 Py_buffer info; |
| 1220 char lower[20]; /* Enough for any encoding name we recognize */ |
| 1221 char *l; |
| 1222 const char *e; |
| 1184 | 1223 |
| 1185 if (encoding == NULL) | 1224 if (encoding == NULL) |
| 1186 encoding = PyUnicode_GetDefaultEncoding(); | 1225 encoding = PyUnicode_GetDefaultEncoding(); |
| 1187 | 1226 |
| 1227 /* Convert encoding to lower case and replace '_' with '-' in order to |
| 1228 catch e.g. UTF_8 */ |
| 1229 e = encoding; |
| 1230 l = lower; |
| 1231 while (*e && l < &lower[(sizeof lower) - 2]) { |
| 1232 if (ISUPPER(*e)) { |
| 1233 *l++ = TOLOWER(*e++); |
| 1234 } |
| 1235 else if (*e == '_') { |
| 1236 *l++ = '-'; |
| 1237 e++; |
| 1238 } |
| 1239 else { |
| 1240 *l++ = *e++; |
| 1241 } |
| 1242 } |
| 1243 *l = '\0'; |
| 1244 |
| 1188 /* Shortcuts for common default encodings */ | 1245 /* Shortcuts for common default encodings */ |
| 1189 if (strcmp(encoding, "utf-8") == 0) | 1246 if (strcmp(lower, "utf-8") == 0) |
| 1190 return PyUnicode_DecodeUTF8(s, size, errors); | 1247 return PyUnicode_DecodeUTF8(s, size, errors); |
| 1191 else if (strcmp(encoding, "latin-1") == 0) | 1248 else if ((strcmp(lower, "latin-1") == 0) || |
| 1249 (strcmp(lower, "iso-8859-1") == 0)) |
| 1192 return PyUnicode_DecodeLatin1(s, size, errors); | 1250 return PyUnicode_DecodeLatin1(s, size, errors); |
| 1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) | 1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
| 1194 else if (strcmp(encoding, "mbcs") == 0) | 1252 else if (strcmp(lower, "mbcs") == 0) |
| 1195 return PyUnicode_DecodeMBCS(s, size, errors); | 1253 return PyUnicode_DecodeMBCS(s, size, errors); |
| 1196 #endif | 1254 #endif |
| 1197 else if (strcmp(encoding, "ascii") == 0) | 1255 else if (strcmp(lower, "ascii") == 0) |
| 1198 return PyUnicode_DecodeASCII(s, size, errors); | 1256 return PyUnicode_DecodeASCII(s, size, errors); |
| 1257 else if (strcmp(lower, "utf-16") == 0) |
| 1258 return PyUnicode_DecodeUTF16(s, size, errors, 0); |
| 1259 else if (strcmp(lower, "utf-32") == 0) |
| 1260 return PyUnicode_DecodeUTF32(s, size, errors, 0); |
| 1199 | 1261 |
| 1200 /* Decode via the codec registry */ | 1262 /* Decode via the codec registry */ |
| 1201 buffer = PyBuffer_FromMemory((void *)s, size); | 1263 buffer = NULL; |
| 1264 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) |
| 1265 goto onError; |
| 1266 buffer = PyMemoryView_FromBuffer(&info); |
| 1202 if (buffer == NULL) | 1267 if (buffer == NULL) |
| 1203 goto onError; | 1268 goto onError; |
| 1204 unicode = PyCodec_Decode(buffer, encoding, errors); | 1269 unicode = PyCodec_Decode(buffer, encoding, errors); |
| 1205 if (unicode == NULL) | 1270 if (unicode == NULL) |
| 1206 goto onError; | 1271 goto onError; |
| 1207 if (!PyUnicode_Check(unicode)) { | 1272 if (!PyUnicode_Check(unicode)) { |
| 1208 PyErr_Format(PyExc_TypeError, | 1273 PyErr_Format(PyExc_TypeError, |
| 1209 "decoder did not return an unicode object (type=%.400s)", | 1274 "decoder did not return a str object (type=%.400s)", |
| 1210 Py_TYPE(unicode)->tp_name); | 1275 Py_TYPE(unicode)->tp_name); |
| 1211 Py_DECREF(unicode); | 1276 Py_DECREF(unicode); |
| 1212 goto onError; | 1277 goto onError; |
| 1213 } | 1278 } |
| 1214 Py_DECREF(buffer); | 1279 Py_DECREF(buffer); |
| 1215 return unicode; | 1280 return unicode; |
| 1216 | 1281 |
| 1217 onError: | 1282 onError: |
| 1218 Py_XDECREF(buffer); | 1283 Py_XDECREF(buffer); |
| 1219 return NULL; | 1284 return NULL; |
| (...skipping 16 matching lines...) Expand all Loading... |
| 1236 /* Decode via the codec registry */ | 1301 /* Decode via the codec registry */ |
| 1237 v = PyCodec_Decode(unicode, encoding, errors); | 1302 v = PyCodec_Decode(unicode, encoding, errors); |
| 1238 if (v == NULL) | 1303 if (v == NULL) |
| 1239 goto onError; | 1304 goto onError; |
| 1240 return v; | 1305 return v; |
| 1241 | 1306 |
| 1242 onError: | 1307 onError: |
| 1243 return NULL; | 1308 return NULL; |
| 1244 } | 1309 } |
| 1245 | 1310 |
| 1311 PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, |
| 1312 const char *encoding, |
| 1313 const char *errors) |
| 1314 { |
| 1315 PyObject *v; |
| 1316 |
| 1317 if (!PyUnicode_Check(unicode)) { |
| 1318 PyErr_BadArgument(); |
| 1319 goto onError; |
| 1320 } |
| 1321 |
| 1322 if (encoding == NULL) |
| 1323 encoding = PyUnicode_GetDefaultEncoding(); |
| 1324 |
| 1325 /* Decode via the codec registry */ |
| 1326 v = PyCodec_Decode(unicode, encoding, errors); |
| 1327 if (v == NULL) |
| 1328 goto onError; |
| 1329 if (!PyUnicode_Check(v)) { |
| 1330 PyErr_Format(PyExc_TypeError, |
| 1331 "decoder did not return a str object (type=%.400s)", |
| 1332 Py_TYPE(v)->tp_name); |
| 1333 Py_DECREF(v); |
| 1334 goto onError; |
| 1335 } |
| 1336 return v; |
| 1337 |
| 1338 onError: |
| 1339 return NULL; |
| 1340 } |
| 1341 |
| 1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s, | 1342 PyObject *PyUnicode_Encode(const Py_UNICODE *s, |
| 1247 Py_ssize_t size, | 1343 Py_ssize_t size, |
| 1248 const char *encoding, | 1344 const char *encoding, |
| 1249 const char *errors) | 1345 const char *errors) |
| 1250 { | 1346 { |
| 1251 PyObject *v, *unicode; | 1347 PyObject *v, *unicode; |
| 1252 | 1348 |
| 1253 unicode = PyUnicode_FromUnicode(s, size); | 1349 unicode = PyUnicode_FromUnicode(s, size); |
| 1254 if (unicode == NULL) | 1350 if (unicode == NULL) |
| 1255 return NULL; | 1351 return NULL; |
| (...skipping 27 matching lines...) Expand all Loading... |
| 1283 } | 1379 } |
| 1284 | 1380 |
| 1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, | 1381 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, |
| 1286 const char *encoding, | 1382 const char *encoding, |
| 1287 const char *errors) | 1383 const char *errors) |
| 1288 { | 1384 { |
| 1289 PyObject *v; | 1385 PyObject *v; |
| 1290 | 1386 |
| 1291 if (!PyUnicode_Check(unicode)) { | 1387 if (!PyUnicode_Check(unicode)) { |
| 1292 PyErr_BadArgument(); | 1388 PyErr_BadArgument(); |
| 1293 goto onError; | 1389 return NULL; |
| 1294 } | 1390 } |
| 1295 | 1391 |
| 1296 if (encoding == NULL) | 1392 if (encoding == NULL) |
| 1297 encoding = PyUnicode_GetDefaultEncoding(); | 1393 encoding = PyUnicode_GetDefaultEncoding(); |
| 1298 | 1394 |
| 1299 /* Shortcuts for common default encodings */ | 1395 /* Shortcuts for common default encodings */ |
| 1300 if (errors == NULL) { | 1396 if (errors == NULL) { |
| 1301 if (strcmp(encoding, "utf-8") == 0) | 1397 if (strcmp(encoding, "utf-8") == 0) |
| 1302 return PyUnicode_AsUTF8String(unicode); | 1398 return PyUnicode_AsUTF8String(unicode); |
| 1303 else if (strcmp(encoding, "latin-1") == 0) | 1399 else if (strcmp(encoding, "latin-1") == 0) |
| 1304 return PyUnicode_AsLatin1String(unicode); | 1400 return PyUnicode_AsLatin1String(unicode); |
| 1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) | 1401 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
| 1306 else if (strcmp(encoding, "mbcs") == 0) | 1402 else if (strcmp(encoding, "mbcs") == 0) |
| 1307 return PyUnicode_AsMBCSString(unicode); | 1403 return PyUnicode_AsMBCSString(unicode); |
| 1308 #endif | 1404 #endif |
| 1309 else if (strcmp(encoding, "ascii") == 0) | 1405 else if (strcmp(encoding, "ascii") == 0) |
| 1310 return PyUnicode_AsASCIIString(unicode); | 1406 return PyUnicode_AsASCIIString(unicode); |
| 1311 } | 1407 /* During bootstrap, we may need to find the encodings |
| 1408 package, to load the file system encoding, and require the |
| 1409 file system encoding in order to load the encodings |
| 1410 package. |
| 1411 |
| 1412 Break out of this dependency by assuming that the path to |
| 1413 the encodings module is ASCII-only. XXX could try wcstombs |
| 1414 instead, if the file system encoding is the locale's |
| 1415 encoding. */ |
| 1416 else if (Py_FileSystemDefaultEncoding && |
| 1417 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && |
| 1418 !PyThreadState_GET()->interp->codecs_initialized) |
| 1419 return PyUnicode_AsASCIIString(unicode); |
| 1420 } |
| 1421 |
| 1422 /* Encode via the codec registry */ |
| 1423 v = PyCodec_Encode(unicode, encoding, errors); |
| 1424 if (v == NULL) |
| 1425 return NULL; |
| 1426 |
| 1427 /* The normal path */ |
| 1428 if (PyBytes_Check(v)) |
| 1429 return v; |
| 1430 |
| 1431 /* If the codec returns a buffer, raise a warning and convert to bytes */ |
| 1432 if (PyByteArray_Check(v)) { |
| 1433 char msg[100]; |
| 1434 PyObject *b; |
| 1435 PyOS_snprintf(msg, sizeof(msg), |
| 1436 "encoder %s returned buffer instead of bytes", |
| 1437 encoding); |
| 1438 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { |
| 1439 Py_DECREF(v); |
| 1440 return NULL; |
| 1441 } |
| 1442 |
| 1443 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); |
| 1444 Py_DECREF(v); |
| 1445 return b; |
| 1446 } |
| 1447 |
| 1448 PyErr_Format(PyExc_TypeError, |
| 1449 "encoder did not return a bytes object (type=%.400s)", |
| 1450 Py_TYPE(v)->tp_name); |
| 1451 Py_DECREF(v); |
| 1452 return NULL; |
| 1453 } |
| 1454 |
| 1455 PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, |
| 1456 const char *encoding, |
| 1457 const char *errors) |
| 1458 { |
| 1459 PyObject *v; |
| 1460 |
| 1461 if (!PyUnicode_Check(unicode)) { |
| 1462 PyErr_BadArgument(); |
| 1463 goto onError; |
| 1464 } |
| 1465 |
| 1466 if (encoding == NULL) |
| 1467 encoding = PyUnicode_GetDefaultEncoding(); |
| 1312 | 1468 |
| 1313 /* Encode via the codec registry */ | 1469 /* Encode via the codec registry */ |
| 1314 v = PyCodec_Encode(unicode, encoding, errors); | 1470 v = PyCodec_Encode(unicode, encoding, errors); |
| 1315 if (v == NULL) | 1471 if (v == NULL) |
| 1316 goto onError; | 1472 goto onError; |
| 1317 if (!PyString_Check(v)) { | 1473 if (!PyUnicode_Check(v)) { |
| 1318 PyErr_Format(PyExc_TypeError, | 1474 PyErr_Format(PyExc_TypeError, |
| 1319 "encoder did not return a string object (type=%.400s)", | 1475 "encoder did not return an str object (type=%.400s)", |
| 1320 Py_TYPE(v)->tp_name); | 1476 Py_TYPE(v)->tp_name); |
| 1321 Py_DECREF(v); | 1477 Py_DECREF(v); |
| 1322 goto onError; | 1478 goto onError; |
| 1323 } | 1479 } |
| 1324 return v; | 1480 return v; |
| 1325 | 1481 |
| 1326 onError: | 1482 onError: |
| 1327 return NULL; | 1483 return NULL; |
| 1328 } | 1484 } |
| 1329 | 1485 |
| 1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, | 1486 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, |
| 1331 const char *errors) | 1487 const char *errors) |
| 1332 { | 1488 { |
| 1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; | 1489 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; |
| 1334 | |
| 1335 if (v) | 1490 if (v) |
| 1336 return v; | 1491 return v; |
| 1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors); | 1492 if (errors != NULL) |
| 1338 if (v && errors == NULL) | 1493 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); |
| 1339 ((PyUnicodeObject *)unicode)->defenc = v; | 1494 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), |
| 1495 PyUnicode_GET_SIZE(unicode), |
| 1496 NULL); |
| 1497 if (!v) |
| 1498 return NULL; |
| 1499 ((PyUnicodeObject *)unicode)->defenc = v; |
| 1340 return v; | 1500 return v; |
| 1501 } |
| 1502 |
| 1503 PyObject* |
| 1504 PyUnicode_DecodeFSDefault(const char *s) { |
| 1505 Py_ssize_t size = (Py_ssize_t)strlen(s); |
| 1506 return PyUnicode_DecodeFSDefaultAndSize(s, size); |
| 1507 } |
| 1508 |
| 1509 PyObject* |
| 1510 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) |
| 1511 { |
| 1512 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding |
| 1513 can be undefined. If it is case, decode using UTF-8. The following assume
s |
| 1514 that Py_FileSystemDefaultEncoding is set to a built-in encoding during th
e |
| 1515 bootstrapping process where the codecs aren't ready yet. |
| 1516 */ |
| 1517 if (Py_FileSystemDefaultEncoding) { |
| 1518 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
| 1519 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { |
| 1520 return PyUnicode_DecodeMBCS(s, size, "replace"); |
| 1521 } |
| 1522 #elif defined(__APPLE__) |
| 1523 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { |
| 1524 return PyUnicode_DecodeUTF8(s, size, "replace"); |
| 1525 } |
| 1526 #endif |
| 1527 return PyUnicode_Decode(s, size, |
| 1528 Py_FileSystemDefaultEncoding, |
| 1529 "replace"); |
| 1530 } |
| 1531 else { |
| 1532 return PyUnicode_DecodeUTF8(s, size, "replace"); |
| 1533 } |
| 1534 } |
| 1535 |
| 1536 /* Convert the argument to a bytes object, according to the file |
| 1537 system encoding */ |
| 1538 |
| 1539 int |
| 1540 PyUnicode_FSConverter(PyObject* arg, void* addr) |
| 1541 { |
| 1542 PyObject *output = NULL; |
| 1543 Py_ssize_t size; |
| 1544 void *data; |
| 1545 if (arg == NULL) { |
| 1546 Py_DECREF(*(PyObject**)addr); |
| 1547 return 1; |
| 1548 } |
| 1549 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) { |
| 1550 output = arg; |
| 1551 Py_INCREF(output); |
| 1552 } |
| 1553 else { |
| 1554 arg = PyUnicode_FromObject(arg); |
| 1555 if (!arg) |
| 1556 return 0; |
| 1557 output = PyUnicode_AsEncodedObject(arg, |
| 1558 Py_FileSystemDefaultEncoding, |
| 1559 "surrogateescape"); |
| 1560 Py_DECREF(arg); |
| 1561 if (!output) |
| 1562 return 0; |
| 1563 if (!PyBytes_Check(output)) { |
| 1564 Py_DECREF(output); |
| 1565 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); |
| 1566 return 0; |
| 1567 } |
| 1568 } |
| 1569 if (PyBytes_Check(output)) { |
| 1570 size = PyBytes_GET_SIZE(output); |
| 1571 data = PyBytes_AS_STRING(output); |
| 1572 } |
| 1573 else { |
| 1574 size = PyByteArray_GET_SIZE(output); |
| 1575 data = PyByteArray_AS_STRING(output); |
| 1576 } |
| 1577 if (size != strlen(data)) { |
| 1578 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); |
| 1579 Py_DECREF(output); |
| 1580 return 0; |
| 1581 } |
| 1582 *(PyObject**)addr = output; |
| 1583 return Py_CLEANUP_SUPPORTED; |
| 1584 } |
| 1585 |
| 1586 |
| 1587 char* |
| 1588 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) |
| 1589 { |
| 1590 PyObject *bytes; |
| 1591 if (!PyUnicode_Check(unicode)) { |
| 1592 PyErr_BadArgument(); |
| 1593 return NULL; |
| 1594 } |
| 1595 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); |
| 1596 if (bytes == NULL) |
| 1597 return NULL; |
| 1598 if (psize != NULL) |
| 1599 *psize = PyBytes_GET_SIZE(bytes); |
| 1600 return PyBytes_AS_STRING(bytes); |
| 1601 } |
| 1602 |
| 1603 char* |
| 1604 _PyUnicode_AsString(PyObject *unicode) |
| 1605 { |
| 1606 return _PyUnicode_AsStringAndSize(unicode, NULL); |
| 1341 } | 1607 } |
| 1342 | 1608 |
| 1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) | 1609 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) |
| 1344 { | 1610 { |
| 1345 if (!PyUnicode_Check(unicode)) { | 1611 if (!PyUnicode_Check(unicode)) { |
| 1346 PyErr_BadArgument(); | 1612 PyErr_BadArgument(); |
| 1347 goto onError; | 1613 goto onError; |
| 1348 } | 1614 } |
| 1349 return PyUnicode_AS_UNICODE(unicode); | 1615 return PyUnicode_AS_UNICODE(unicode); |
| 1350 | 1616 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1364 return -1; | 1630 return -1; |
| 1365 } | 1631 } |
| 1366 | 1632 |
| 1367 const char *PyUnicode_GetDefaultEncoding(void) | 1633 const char *PyUnicode_GetDefaultEncoding(void) |
| 1368 { | 1634 { |
| 1369 return unicode_default_encoding; | 1635 return unicode_default_encoding; |
| 1370 } | 1636 } |
| 1371 | 1637 |
| 1372 int PyUnicode_SetDefaultEncoding(const char *encoding) | 1638 int PyUnicode_SetDefaultEncoding(const char *encoding) |
| 1373 { | 1639 { |
| 1374 PyObject *v; | 1640 if (strcmp(encoding, unicode_default_encoding) != 0) { |
| 1375 | 1641 PyErr_Format(PyExc_ValueError, |
| 1376 /* Make sure the encoding is valid. As side effect, this also | 1642 "Can only set default encoding to %s", |
| 1377 loads the encoding into the codec registry cache. */ | 1643 unicode_default_encoding); |
| 1378 v = _PyCodec_Lookup(encoding); | 1644 return -1; |
| 1379 if (v == NULL) | 1645 } |
| 1380 goto onError; | |
| 1381 Py_DECREF(v); | |
| 1382 strncpy(unicode_default_encoding, | |
| 1383 encoding, | |
| 1384 sizeof(unicode_default_encoding)); | |
| 1385 return 0; | 1646 return 0; |
| 1386 | |
| 1387 onError: | |
| 1388 return -1; | |
| 1389 } | 1647 } |
| 1390 | 1648 |
| 1391 /* error handling callback helper: | 1649 /* error handling callback helper: |
| 1392 build arguments, call the callback and check the arguments, | 1650 build arguments, call the callback and check the arguments, |
| 1393 if no exception occurred, copy the replacement to the output | 1651 if no exception occurred, copy the replacement to the output |
| 1394 and adjust various state variables. | 1652 and adjust various state variables. |
| 1395 return 0 on success, -1 on error | 1653 return 0 on success, -1 on error |
| 1396 */ | 1654 */ |
| 1397 | 1655 |
| 1398 static | 1656 static |
| 1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
, | 1657 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
, |
| 1400 const char *encoding, const char *reason, | 1658 const char *encoding, const char *reason, |
| 1401 const char *input, Py_ssize_t insize, Py_ss
ize_t *startinpos, | 1659 const char **input, const char **inend, Py_
ssize_t *startinpos, |
| 1402 Py_ssize_t *endinpos, PyObject **exceptionO
bject, const char **inptr, | 1660 Py_ssize_t *endinpos, PyObject **exceptionO
bject, const char **inptr, |
| 1403 PyUnicodeObject **output, Py_ssize_t *outpo
s, Py_UNICODE **outptr) | 1661 PyUnicodeObject **output, Py_ssize_t *outpo
s, Py_UNICODE **outptr) |
| 1404 { | 1662 { |
| 1405 static char *argparse = "O!n;decoding error handler must return (unicode, in
t) tuple"; | 1663 static char *argparse = "O!n;decoding error handler must return (str, int) t
uple"; |
| 1406 | 1664 |
| 1407 PyObject *restuple = NULL; | 1665 PyObject *restuple = NULL; |
| 1408 PyObject *repunicode = NULL; | 1666 PyObject *repunicode = NULL; |
| 1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); | 1667 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); |
| 1668 Py_ssize_t insize; |
| 1410 Py_ssize_t requiredsize; | 1669 Py_ssize_t requiredsize; |
| 1411 Py_ssize_t newpos; | 1670 Py_ssize_t newpos; |
| 1412 Py_UNICODE *repptr; | 1671 Py_UNICODE *repptr; |
| 1672 PyObject *inputobj = NULL; |
| 1413 Py_ssize_t repsize; | 1673 Py_ssize_t repsize; |
| 1414 int res = -1; | 1674 int res = -1; |
| 1415 | 1675 |
| 1416 if (*errorHandler == NULL) { | 1676 if (*errorHandler == NULL) { |
| 1417 *errorHandler = PyCodec_LookupError(errors); | 1677 *errorHandler = PyCodec_LookupError(errors); |
| 1418 if (*errorHandler == NULL) | 1678 if (*errorHandler == NULL) |
| 1419 goto onError; | 1679 goto onError; |
| 1420 } | 1680 } |
| 1421 | 1681 |
| 1422 if (*exceptionObject == NULL) { | 1682 if (*exceptionObject == NULL) { |
| 1423 *exceptionObject = PyUnicodeDecodeError_Create( | 1683 *exceptionObject = PyUnicodeDecodeError_Create( |
| 1424 encoding, input, insize, *startinpos, *endinpos, reason); | 1684 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); |
| 1425 if (*exceptionObject == NULL) | 1685 if (*exceptionObject == NULL) |
| 1426 goto onError; | 1686 goto onError; |
| 1427 } | 1687 } |
| 1428 else { | 1688 else { |
| 1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) | 1689 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) |
| 1430 goto onError; | 1690 goto onError; |
| 1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) | 1691 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) |
| 1432 goto onError; | 1692 goto onError; |
| 1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) | 1693 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) |
| 1434 goto onError; | 1694 goto onError; |
| 1435 } | 1695 } |
| 1436 | 1696 |
| 1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL
L); | 1697 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NUL
L); |
| 1438 if (restuple == NULL) | 1698 if (restuple == NULL) |
| 1439 goto onError; | 1699 goto onError; |
| 1440 if (!PyTuple_Check(restuple)) { | 1700 if (!PyTuple_Check(restuple)) { |
| 1441 PyErr_SetString(PyExc_TypeError, &argparse[4]); | 1701 PyErr_SetString(PyExc_TypeError, &argparse[4]); |
| 1442 goto onError; | 1702 goto onError; |
| 1443 } | 1703 } |
| 1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new
pos)) | 1704 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &new
pos)) |
| 1445 goto onError; | 1705 goto onError; |
| 1706 |
| 1707 /* Copy back the bytes variables, which might have been modified by the |
| 1708 callback */ |
| 1709 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); |
| 1710 if (!inputobj) |
| 1711 goto onError; |
| 1712 if (!PyBytes_Check(inputobj)) { |
| 1713 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"
); |
| 1714 } |
| 1715 *input = PyBytes_AS_STRING(inputobj); |
| 1716 insize = PyBytes_GET_SIZE(inputobj); |
| 1717 *inend = *input + insize; |
| 1718 /* we can DECREF safely, as the exception has another reference, |
| 1719 so the object won't go away. */ |
| 1720 Py_DECREF(inputobj); |
| 1721 |
| 1446 if (newpos<0) | 1722 if (newpos<0) |
| 1447 newpos = insize+newpos; | 1723 newpos = insize+newpos; |
| 1448 if (newpos<0 || newpos>insize) { | 1724 if (newpos<0 || newpos>insize) { |
| 1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b
ounds", newpos); | 1725 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b
ounds", newpos); |
| 1450 goto onError; | 1726 goto onError; |
| 1451 } | 1727 } |
| 1452 | 1728 |
| 1453 /* need more space? (at least enough for what we | 1729 /* need more space? (at least enough for what we |
| 1454 have+the replacement+the rest of the string (starting | 1730 have+the replacement+the rest of the string (starting |
| 1455 at the new input position), so we won't have to check space | 1731 at the new input position), so we won't have to check space |
| 1456 when there are no errors in the rest of the string) */ | 1732 when there are no errors in the rest of the string) */ |
| 1457 repptr = PyUnicode_AS_UNICODE(repunicode); | 1733 repptr = PyUnicode_AS_UNICODE(repunicode); |
| 1458 repsize = PyUnicode_GET_SIZE(repunicode); | 1734 repsize = PyUnicode_GET_SIZE(repunicode); |
| 1459 requiredsize = *outpos + repsize + insize-newpos; | 1735 requiredsize = *outpos + repsize + insize-newpos; |
| 1460 if (requiredsize > outsize) { | 1736 if (requiredsize > outsize) { |
| 1461 if (requiredsize<2*outsize) | 1737 if (requiredsize<2*outsize) |
| 1462 requiredsize = 2*outsize; | 1738 requiredsize = 2*outsize; |
| 1463 if (_PyUnicode_Resize(output, requiredsize) < 0) | 1739 if (_PyUnicode_Resize(output, requiredsize) < 0) |
| 1464 goto onError; | 1740 goto onError; |
| 1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; | 1741 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; |
| 1466 } | 1742 } |
| 1467 *endinpos = newpos; | 1743 *endinpos = newpos; |
| 1468 *inptr = input + newpos; | 1744 *inptr = *input + newpos; |
| 1469 Py_UNICODE_COPY(*outptr, repptr, repsize); | 1745 Py_UNICODE_COPY(*outptr, repptr, repsize); |
| 1470 *outptr += repsize; | 1746 *outptr += repsize; |
| 1471 *outpos += repsize; | 1747 *outpos += repsize; |
| 1748 |
| 1472 /* we made it! */ | 1749 /* we made it! */ |
| 1473 res = 0; | 1750 res = 0; |
| 1474 | 1751 |
| 1475 onError: | 1752 onError: |
| 1476 Py_XDECREF(restuple); | 1753 Py_XDECREF(restuple); |
| 1477 return res; | 1754 return res; |
| 1478 } | 1755 } |
| 1479 | 1756 |
| 1480 /* --- UTF-7 Codec -------------------------------------------------------- */ | 1757 /* --- UTF-7 Codec -------------------------------------------------------- */ |
| 1481 | 1758 |
| 1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */ | 1759 /* See RFC2152 for details. We encode conservatively and decode liberally. */ |
| 1483 | 1760 |
| 1484 /* Three simple macros defining base-64. */ | 1761 /* Three simple macros defining base-64. */ |
| 1485 | 1762 |
| 1486 /* Is c a base-64 character? */ | 1763 /* Is c a base-64 character? */ |
| 1487 | 1764 |
| 1488 #define IS_BASE64(c) \ | 1765 #define IS_BASE64(c) \ |
| 1489 (isalnum(c) || (c) == '+' || (c) == '/') | 1766 (((c) >= 'A' && (c) <= 'Z') || \ |
| 1767 ((c) >= 'a' && (c) <= 'z') || \ |
| 1768 ((c) >= '0' && (c) <= '9') || \ |
| 1769 (c) == '+' || (c) == '/') |
| 1490 | 1770 |
| 1491 /* given that c is a base-64 character, what is its base-64 value? */ | 1771 /* given that c is a base-64 character, what is its base-64 value? */ |
| 1492 | 1772 |
| 1493 #define FROM_BASE64(c) \ | 1773 #define FROM_BASE64(c) \ |
| 1494 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ | 1774 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ |
| 1495 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ | 1775 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ |
| 1496 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ | 1776 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ |
| 1497 (c) == '+' ? 62 : 63) | 1777 (c) == '+' ? 62 : 63) |
| 1498 | 1778 |
| 1499 /* What is the base-64 character of the bottom 6 bits of n? */ | 1779 /* What is the base-64 character of the bottom 6 bits of n? */ |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1597 if (consumed) | 1877 if (consumed) |
| 1598 *consumed = 0; | 1878 *consumed = 0; |
| 1599 return (PyObject *)unicode; | 1879 return (PyObject *)unicode; |
| 1600 } | 1880 } |
| 1601 | 1881 |
| 1602 p = unicode->str; | 1882 p = unicode->str; |
| 1603 shiftOutStart = p; | 1883 shiftOutStart = p; |
| 1604 e = s + size; | 1884 e = s + size; |
| 1605 | 1885 |
| 1606 while (s < e) { | 1886 while (s < e) { |
| 1607 Py_UNICODE ch = (unsigned char) *s; | 1887 Py_UNICODE ch; |
| 1888 restart: |
| 1889 ch = (unsigned char) *s; |
| 1608 | 1890 |
| 1609 if (inShift) { /* in a base-64 section */ | 1891 if (inShift) { /* in a base-64 section */ |
| 1610 if (IS_BASE64(ch)) { /* consume a base-64 character */ | 1892 if (IS_BASE64(ch)) { /* consume a base-64 character */ |
| 1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); | 1893 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); |
| 1612 base64bits += 6; | 1894 base64bits += 6; |
| 1613 s++; | 1895 s++; |
| 1614 if (base64bits >= 16) { | 1896 if (base64bits >= 16) { |
| 1615 /* we have enough bits for a UTF-16 value */ | 1897 /* we have enough bits for a UTF-16 value */ |
| 1616 Py_UNICODE outCh = (Py_UNICODE) | 1898 Py_UNICODE outCh = (Py_UNICODE) |
| 1617 (base64buffer >> (base64bits-16)); | 1899 (base64buffer >> (base64bits-16)); |
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1699 errmsg = "unexpected special character"; | 1981 errmsg = "unexpected special character"; |
| 1700 goto utf7Error; | 1982 goto utf7Error; |
| 1701 } | 1983 } |
| 1702 continue; | 1984 continue; |
| 1703 utf7Error: | 1985 utf7Error: |
| 1704 outpos = p-PyUnicode_AS_UNICODE(unicode); | 1986 outpos = p-PyUnicode_AS_UNICODE(unicode); |
| 1705 endinpos = s-starts; | 1987 endinpos = s-starts; |
| 1706 if (unicode_decode_call_errorhandler( | 1988 if (unicode_decode_call_errorhandler( |
| 1707 errors, &errorHandler, | 1989 errors, &errorHandler, |
| 1708 "utf7", errmsg, | 1990 "utf7", errmsg, |
| 1709 starts, size, &startinpos, &endinpos, &exc, &s, | 1991 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 1710 &unicode, &outpos, &p)) | 1992 &unicode, &outpos, &p)) |
| 1711 goto onError; | 1993 goto onError; |
| 1712 } | 1994 } |
| 1713 | 1995 |
| 1714 /* end of string */ | 1996 /* end of string */ |
| 1715 | 1997 |
| 1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */ | 1998 if (inShift && !consumed) { /* in shift sequence, no more to follow */ |
| 1717 /* if we're in an inconsistent state, that's an error */ | 1999 /* if we're in an inconsistent state, that's an error */ |
| 1718 if (surrogate || | 2000 if (surrogate || |
| 1719 (base64bits >= 6) || | 2001 (base64bits >= 6) || |
| 1720 (base64bits > 0 && base64buffer != 0)) { | 2002 (base64bits > 0 && base64buffer != 0)) { |
| 1721 outpos = p-PyUnicode_AS_UNICODE(unicode); | 2003 outpos = p-PyUnicode_AS_UNICODE(unicode); |
| 1722 endinpos = size; | 2004 endinpos = size; |
| 1723 if (unicode_decode_call_errorhandler( | 2005 if (unicode_decode_call_errorhandler( |
| 1724 errors, &errorHandler, | 2006 errors, &errorHandler, |
| 1725 "utf7", "unterminated shift sequence", | 2007 "utf7", "unterminated shift sequence", |
| 1726 starts, size, &startinpos, &endinpos, &exc, &s, | 2008 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 1727 &unicode, &outpos, &p)) | 2009 &unicode, &outpos, &p)) |
| 1728 goto onError; | 2010 goto onError; |
| 2011 if (s < e) |
| 2012 goto restart; |
| 1729 } | 2013 } |
| 1730 } | 2014 } |
| 1731 | 2015 |
| 1732 /* return state */ | 2016 /* return state */ |
| 1733 if (consumed) { | 2017 if (consumed) { |
| 1734 if (inShift) { | 2018 if (inShift) { |
| 1735 p = shiftOutStart; /* back off output */ | 2019 p = shiftOutStart; /* back off output */ |
| 1736 *consumed = startinpos; | 2020 *consumed = startinpos; |
| 1737 } | 2021 } |
| 1738 else { | 2022 else { |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1764 PyObject *v; | 2048 PyObject *v; |
| 1765 /* It might be possible to tighten this worst case */ | 2049 /* It might be possible to tighten this worst case */ |
| 1766 Py_ssize_t allocated = 8 * size; | 2050 Py_ssize_t allocated = 8 * size; |
| 1767 int inShift = 0; | 2051 int inShift = 0; |
| 1768 Py_ssize_t i = 0; | 2052 Py_ssize_t i = 0; |
| 1769 unsigned int base64bits = 0; | 2053 unsigned int base64bits = 0; |
| 1770 unsigned long base64buffer = 0; | 2054 unsigned long base64buffer = 0; |
| 1771 char * out; | 2055 char * out; |
| 1772 char * start; | 2056 char * start; |
| 1773 | 2057 |
| 2058 if (size == 0) |
| 2059 return PyBytes_FromStringAndSize(NULL, 0); |
| 2060 |
| 1774 if (allocated / 8 != size) | 2061 if (allocated / 8 != size) |
| 1775 return PyErr_NoMemory(); | 2062 return PyErr_NoMemory(); |
| 1776 | 2063 |
| 1777 if (size == 0) | 2064 v = PyBytes_FromStringAndSize(NULL, allocated); |
| 1778 return PyString_FromStringAndSize(NULL, 0); | |
| 1779 | |
| 1780 v = PyString_FromStringAndSize(NULL, allocated); | |
| 1781 if (v == NULL) | 2065 if (v == NULL) |
| 1782 return NULL; | 2066 return NULL; |
| 1783 | 2067 |
| 1784 start = out = PyString_AS_STRING(v); | 2068 start = out = PyBytes_AS_STRING(v); |
| 1785 for (;i < size; ++i) { | 2069 for (;i < size; ++i) { |
| 1786 Py_UNICODE ch = s[i]; | 2070 Py_UNICODE ch = s[i]; |
| 1787 | 2071 |
| 1788 if (inShift) { | 2072 if (inShift) { |
| 1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { | 2073 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { |
| 1790 /* shifting out */ | 2074 /* shifting out */ |
| 1791 if (base64bits) { /* output remaining bits */ | 2075 if (base64bits) { /* output remaining bits */ |
| 1792 *out++ = TO_BASE64(base64buffer << (6-base64bits)); | 2076 *out++ = TO_BASE64(base64buffer << (6-base64bits)); |
| 1793 base64buffer = 0; | 2077 base64buffer = 0; |
| 1794 base64bits = 0; | 2078 base64bits = 0; |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1838 base64buffer = (base64buffer << 16) | ch; | 2122 base64buffer = (base64buffer << 16) | ch; |
| 1839 while (base64bits >= 6) { | 2123 while (base64bits >= 6) { |
| 1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); | 2124 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); |
| 1841 base64bits -= 6; | 2125 base64bits -= 6; |
| 1842 } | 2126 } |
| 1843 } | 2127 } |
| 1844 if (base64bits) | 2128 if (base64bits) |
| 1845 *out++= TO_BASE64(base64buffer << (6-base64bits) ); | 2129 *out++= TO_BASE64(base64buffer << (6-base64bits) ); |
| 1846 if (inShift) | 2130 if (inShift) |
| 1847 *out++ = '-'; | 2131 *out++ = '-'; |
| 1848 | 2132 if (_PyBytes_Resize(&v, out - start) < 0) |
| 1849 if (_PyString_Resize(&v, out - start)) | |
| 1850 return NULL; | 2133 return NULL; |
| 1851 return v; | 2134 return v; |
| 1852 } | 2135 } |
| 1853 | 2136 |
| 1854 #undef IS_BASE64 | 2137 #undef IS_BASE64 |
| 1855 #undef FROM_BASE64 | 2138 #undef FROM_BASE64 |
| 1856 #undef TO_BASE64 | 2139 #undef TO_BASE64 |
| 1857 #undef DECODE_DIRECT | 2140 #undef DECODE_DIRECT |
| 1858 #undef ENCODE_DIRECT | 2141 #undef ENCODE_DIRECT |
| 1859 | 2142 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1881 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ | 2164 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ |
| 1882 }; | 2165 }; |
| 1883 | 2166 |
| 1884 PyObject *PyUnicode_DecodeUTF8(const char *s, | 2167 PyObject *PyUnicode_DecodeUTF8(const char *s, |
| 1885 Py_ssize_t size, | 2168 Py_ssize_t size, |
| 1886 const char *errors) | 2169 const char *errors) |
| 1887 { | 2170 { |
| 1888 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); | 2171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); |
| 1889 } | 2172 } |
| 1890 | 2173 |
| 2174 /* Mask to check or force alignment of a pointer to C 'long' boundaries */ |
| 2175 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) |
| 2176 |
| 2177 /* Mask to quickly check whether a C 'long' contains a |
| 2178 non-ASCII, UTF8-encoded char. */ |
| 2179 #if (SIZEOF_LONG == 8) |
| 2180 # define ASCII_CHAR_MASK 0x8080808080808080L |
| 2181 #elif (SIZEOF_LONG == 4) |
| 2182 # define ASCII_CHAR_MASK 0x80808080L |
| 2183 #else |
| 2184 # error C 'long' size should be either 4 or 8! |
| 2185 #endif |
| 2186 |
| 1891 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, | 2187 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, |
| 1892 Py_ssize_t size, | 2188 Py_ssize_t size, |
| 1893 const char *errors, | 2189 const char *errors, |
| 1894 Py_ssize_t *consumed) | 2190 Py_ssize_t *consumed) |
| 1895 { | 2191 { |
| 1896 const char *starts = s; | 2192 const char *starts = s; |
| 1897 int n; | 2193 int n; |
| 1898 int k; | 2194 int k; |
| 1899 Py_ssize_t startinpos; | 2195 Py_ssize_t startinpos; |
| 1900 Py_ssize_t endinpos; | 2196 Py_ssize_t endinpos; |
| 1901 Py_ssize_t outpos; | 2197 Py_ssize_t outpos; |
| 1902 const char *e; | 2198 const char *e, *aligned_end; |
| 1903 PyUnicodeObject *unicode; | 2199 PyUnicodeObject *unicode; |
| 1904 Py_UNICODE *p; | 2200 Py_UNICODE *p; |
| 1905 const char *errmsg = ""; | 2201 const char *errmsg = ""; |
| 1906 PyObject *errorHandler = NULL; | 2202 PyObject *errorHandler = NULL; |
| 1907 PyObject *exc = NULL; | 2203 PyObject *exc = NULL; |
| 1908 | 2204 |
| 1909 /* Note: size will always be longer than the resulting Unicode | 2205 /* Note: size will always be longer than the resulting Unicode |
| 1910 character count */ | 2206 character count */ |
| 1911 unicode = _PyUnicode_New(size); | 2207 unicode = _PyUnicode_New(size); |
| 1912 if (!unicode) | 2208 if (!unicode) |
| 1913 return NULL; | 2209 return NULL; |
| 1914 if (size == 0) { | 2210 if (size == 0) { |
| 1915 if (consumed) | 2211 if (consumed) |
| 1916 *consumed = 0; | 2212 *consumed = 0; |
| 1917 return (PyObject *)unicode; | 2213 return (PyObject *)unicode; |
| 1918 } | 2214 } |
| 1919 | 2215 |
| 1920 /* Unpack UTF-8 encoded data */ | 2216 /* Unpack UTF-8 encoded data */ |
| 1921 p = unicode->str; | 2217 p = unicode->str; |
| 1922 e = s + size; | 2218 e = s + size; |
| 2219 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); |
| 1923 | 2220 |
| 1924 while (s < e) { | 2221 while (s < e) { |
| 1925 Py_UCS4 ch = (unsigned char)*s; | 2222 Py_UCS4 ch = (unsigned char)*s; |
| 2223 |
| 2224 if (ch < 0x80) { |
| 2225 /* Fast path for runs of ASCII characters. Given that common UTF-8 |
| 2226 input will consist of an overwhelming majority of ASCII |
| 2227 characters, we try to optimize for this case by checking |
| 2228 as many characters as a C 'long' can contain. |
| 2229 First, check if we can do an aligned read, as most CPUs have |
| 2230 a penalty for unaligned reads. |
| 2231 */ |
| 2232 if (!((size_t) s & LONG_PTR_MASK)) { |
| 2233 /* Help register allocation */ |
| 2234 register const char *_s = s; |
| 2235 register Py_UNICODE *_p = p; |
| 2236 while (_s < aligned_end) { |
| 2237 /* Read a whole long at a time (either 4 or 8 bytes), |
| 2238 and do a fast unrolled copy if it only contains ASCII |
| 2239 characters. */ |
| 2240 unsigned long data = *(unsigned long *) _s; |
| 2241 if (data & ASCII_CHAR_MASK) |
| 2242 break; |
| 2243 _p[0] = (unsigned char) _s[0]; |
| 2244 _p[1] = (unsigned char) _s[1]; |
| 2245 _p[2] = (unsigned char) _s[2]; |
| 2246 _p[3] = (unsigned char) _s[3]; |
| 2247 #if (SIZEOF_LONG == 8) |
| 2248 _p[4] = (unsigned char) _s[4]; |
| 2249 _p[5] = (unsigned char) _s[5]; |
| 2250 _p[6] = (unsigned char) _s[6]; |
| 2251 _p[7] = (unsigned char) _s[7]; |
| 2252 #endif |
| 2253 _s += SIZEOF_LONG; |
| 2254 _p += SIZEOF_LONG; |
| 2255 } |
| 2256 s = _s; |
| 2257 p = _p; |
| 2258 if (s == e) |
| 2259 break; |
| 2260 ch = (unsigned char)*s; |
| 2261 } |
| 2262 } |
| 1926 | 2263 |
| 1927 if (ch < 0x80) { | 2264 if (ch < 0x80) { |
| 1928 *p++ = (Py_UNICODE)ch; | 2265 *p++ = (Py_UNICODE)ch; |
| 1929 s++; | 2266 s++; |
| 1930 continue; | 2267 continue; |
| 1931 } | 2268 } |
| 1932 | 2269 |
| 1933 n = utf8_code_length[ch]; | 2270 n = utf8_code_length[ch]; |
| 1934 | 2271 |
| 1935 if (s + n > e) { | 2272 if (s + n > e) { |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1965 startinpos = s-starts; | 2302 startinpos = s-starts; |
| 1966 endinpos = startinpos + 1; | 2303 endinpos = startinpos + 1; |
| 1967 goto utf8Error; | 2304 goto utf8Error; |
| 1968 } | 2305 } |
| 1969 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); | 2306 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); |
| 1970 assert ((ch > 0x007F) && (ch <= 0x07FF)); | 2307 assert ((ch > 0x007F) && (ch <= 0x07FF)); |
| 1971 *p++ = (Py_UNICODE)ch; | 2308 *p++ = (Py_UNICODE)ch; |
| 1972 break; | 2309 break; |
| 1973 | 2310 |
| 1974 case 3: | 2311 case 3: |
| 1975 /* XXX: surrogates shouldn't be valid UTF-8! | 2312 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf |
| 1976 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 2313 will result in surrogates in range d800-dfff. Surrogates are |
| 1977 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt | 2314 not valid UTF-8 so they are rejected. |
| 1978 Uncomment the 2 lines below to make them invalid, | 2315 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
| 1979 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ | 2316 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
| 1980 if ((s[1] & 0xc0) != 0x80 || | 2317 if ((s[1] & 0xc0) != 0x80 || |
| 1981 (s[2] & 0xc0) != 0x80 || | 2318 (s[2] & 0xc0) != 0x80 || |
| 1982 ((unsigned char)s[0] == 0xE0 && | 2319 ((unsigned char)s[0] == 0xE0 && |
| 1983 (unsigned char)s[1] < 0xA0)/* || | 2320 (unsigned char)s[1] < 0xA0) || |
| 1984 ((unsigned char)s[0] == 0xED && | 2321 ((unsigned char)s[0] == 0xED && |
| 1985 (unsigned char)s[1] > 0x9F)*/) { | 2322 (unsigned char)s[1] > 0x9F)) { |
| 1986 errmsg = "invalid continuation byte"; | 2323 errmsg = "invalid continuation byte"; |
| 1987 startinpos = s-starts; | 2324 startinpos = s-starts; |
| 1988 endinpos = startinpos + 1; | 2325 endinpos = startinpos + 1; |
| 1989 | 2326 |
| 1990 /* if s[1] first two bits are 1 and 0, then the invalid | 2327 /* if s[1] first two bits are 1 and 0, then the invalid |
| 1991 continuation byte is s[2], so increment endinpos by 1, | 2328 continuation byte is s[2], so increment endinpos by 1, |
| 1992 if not, s[1] is invalid and endinpos doesn't need to | 2329 if not, s[1] is invalid and endinpos doesn't need to |
| 1993 be incremented. */ | 2330 be incremented. */ |
| 1994 if ((s[1] & 0xC0) == 0x80) | 2331 if ((s[1] & 0xC0) == 0x80) |
| 1995 endinpos++; | 2332 endinpos++; |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2039 break; | 2376 break; |
| 2040 } | 2377 } |
| 2041 s += n; | 2378 s += n; |
| 2042 continue; | 2379 continue; |
| 2043 | 2380 |
| 2044 utf8Error: | 2381 utf8Error: |
| 2045 outpos = p-PyUnicode_AS_UNICODE(unicode); | 2382 outpos = p-PyUnicode_AS_UNICODE(unicode); |
| 2046 if (unicode_decode_call_errorhandler( | 2383 if (unicode_decode_call_errorhandler( |
| 2047 errors, &errorHandler, | 2384 errors, &errorHandler, |
| 2048 "utf8", errmsg, | 2385 "utf8", errmsg, |
| 2049 starts, size, &startinpos, &endinpos, &exc, &s, | 2386 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 2050 &unicode, &outpos, &p)) | 2387 &unicode, &outpos, &p)) |
| 2051 goto onError; | 2388 goto onError; |
| 2389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); |
| 2052 } | 2390 } |
| 2053 if (consumed) | 2391 if (consumed) |
| 2054 *consumed = s-starts; | 2392 *consumed = s-starts; |
| 2055 | 2393 |
| 2056 /* Adjust length */ | 2394 /* Adjust length */ |
| 2057 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) | 2395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) |
| 2058 goto onError; | 2396 goto onError; |
| 2059 | 2397 |
| 2060 Py_XDECREF(errorHandler); | 2398 Py_XDECREF(errorHandler); |
| 2061 Py_XDECREF(exc); | 2399 Py_XDECREF(exc); |
| 2062 return (PyObject *)unicode; | 2400 return (PyObject *)unicode; |
| 2063 | 2401 |
| 2064 onError: | 2402 onError: |
| 2065 Py_XDECREF(errorHandler); | 2403 Py_XDECREF(errorHandler); |
| 2066 Py_XDECREF(exc); | 2404 Py_XDECREF(exc); |
| 2067 Py_DECREF(unicode); | 2405 Py_DECREF(unicode); |
| 2068 return NULL; | 2406 return NULL; |
| 2069 } | 2407 } |
| 2408 |
| 2409 #undef ASCII_CHAR_MASK |
| 2410 |
| 2070 | 2411 |
| 2071 /* Allocation strategy: if the string is short, convert into a stack buffer | 2412 /* Allocation strategy: if the string is short, convert into a stack buffer |
| 2072 and allocate exactly as much space needed at the end. Else allocate the | 2413 and allocate exactly as much space needed at the end. Else allocate the |
| 2073 maximum possible needed (4 result bytes per Unicode character), and return | 2414 maximum possible needed (4 result bytes per Unicode character), and return |
| 2074 the excess memory at the end. | 2415 the excess memory at the end. |
| 2075 */ | 2416 */ |
| 2076 PyObject * | 2417 PyObject * |
| 2077 PyUnicode_EncodeUTF8(const Py_UNICODE *s, | 2418 PyUnicode_EncodeUTF8(const Py_UNICODE *s, |
| 2078 Py_ssize_t size, | 2419 Py_ssize_t size, |
| 2079 const char *errors) | 2420 const char *errors) |
| 2080 { | 2421 { |
| 2081 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ | 2422 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ |
| 2082 | 2423 |
| 2083 Py_ssize_t i; /* index into s of next input byte */ | 2424 Py_ssize_t i; /* index into s of next input byte */ |
| 2084 PyObject *v; /* result string object */ | 2425 PyObject *result; /* result string object */ |
| 2085 char *p; /* next free byte in output buffer */ | 2426 char *p; /* next free byte in output buffer */ |
| 2086 Py_ssize_t nallocated; /* number of result bytes allocated */ | 2427 Py_ssize_t nallocated; /* number of result bytes allocated */ |
| 2087 Py_ssize_t nneeded; /* number of result bytes needed */ | 2428 Py_ssize_t nneeded; /* number of result bytes needed */ |
| 2088 char stackbuf[MAX_SHORT_UNICHARS * 4]; | 2429 char stackbuf[MAX_SHORT_UNICHARS * 4]; |
| 2430 PyObject *errorHandler = NULL; |
| 2431 PyObject *exc = NULL; |
| 2089 | 2432 |
| 2090 assert(s != NULL); | 2433 assert(s != NULL); |
| 2091 assert(size >= 0); | 2434 assert(size >= 0); |
| 2092 | 2435 |
| 2093 if (size <= MAX_SHORT_UNICHARS) { | 2436 if (size <= MAX_SHORT_UNICHARS) { |
| 2094 /* Write into the stack buffer; nallocated can't overflow. | 2437 /* Write into the stack buffer; nallocated can't overflow. |
| 2095 * At the end, we'll allocate exactly as much heap space as it | 2438 * At the end, we'll allocate exactly as much heap space as it |
| 2096 * turns out we need. | 2439 * turns out we need. |
| 2097 */ | 2440 */ |
| 2098 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); | 2441 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); |
| 2099 v = NULL; /* will allocate after we're done */ | 2442 result = NULL; /* will allocate after we're done */ |
| 2100 p = stackbuf; | 2443 p = stackbuf; |
| 2101 } | 2444 } |
| 2102 else { | 2445 else { |
| 2103 /* Overallocate on the heap, and give the excess back at the end. */ | 2446 /* Overallocate on the heap, and give the excess back at the end. */ |
| 2104 nallocated = size * 4; | 2447 nallocated = size * 4; |
| 2105 if (nallocated / 4 != size) /* overflow! */ | 2448 if (nallocated / 4 != size) /* overflow! */ |
| 2106 return PyErr_NoMemory(); | 2449 return PyErr_NoMemory(); |
| 2107 v = PyString_FromStringAndSize(NULL, nallocated); | 2450 result = PyBytes_FromStringAndSize(NULL, nallocated); |
| 2108 if (v == NULL) | 2451 if (result == NULL) |
| 2109 return NULL; | 2452 return NULL; |
| 2110 p = PyString_AS_STRING(v); | 2453 p = PyBytes_AS_STRING(result); |
| 2111 } | 2454 } |
| 2112 | 2455 |
| 2113 for (i = 0; i < size;) { | 2456 for (i = 0; i < size;) { |
| 2114 Py_UCS4 ch = s[i++]; | 2457 Py_UCS4 ch = s[i++]; |
| 2115 | 2458 |
| 2116 if (ch < 0x80) | 2459 if (ch < 0x80) |
| 2117 /* Encode ASCII */ | 2460 /* Encode ASCII */ |
| 2118 *p++ = (char) ch; | 2461 *p++ = (char) ch; |
| 2119 | 2462 |
| 2120 else if (ch < 0x0800) { | 2463 else if (ch < 0x0800) { |
| 2121 /* Encode Latin-1 */ | 2464 /* Encode Latin-1 */ |
| 2122 *p++ = (char)(0xc0 | (ch >> 6)); | 2465 *p++ = (char)(0xc0 | (ch >> 6)); |
| 2123 *p++ = (char)(0x80 | (ch & 0x3f)); | 2466 *p++ = (char)(0x80 | (ch & 0x3f)); |
| 2124 } | 2467 } else if (0xD800 <= ch && ch <= 0xDFFF) { |
| 2125 else { | 2468 #ifndef Py_UNICODE_WIDE |
| 2126 /* Encode UCS2 Unicode ordinals */ | 2469 /* Special case: check for high and low surrogate */ |
| 2127 if (ch < 0x10000) { | 2470 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { |
| 2128 /* Special case: check for high surrogate */ | 2471 Py_UCS4 ch2 = s[i]; |
| 2129 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { | 2472 /* Combine the two surrogates to form a UCS4 value */ |
| 2130 Py_UCS4 ch2 = s[i]; | 2473 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; |
| 2131 /* Check for low surrogate and combine the two to | 2474 i++; |
| 2132 form a UCS4 value */ | 2475 |
| 2133 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | 2476 /* Encode UCS4 Unicode ordinals */ |
| 2134 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; | 2477 *p++ = (char)(0xf0 | (ch >> 18)); |
| 2135 i++; | 2478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
| 2136 goto encodeUCS4; | |
| 2137 } | |
| 2138 /* Fall through: handles isolated high surrogates */ | |
| 2139 } | |
| 2140 *p++ = (char)(0xe0 | (ch >> 12)); | |
| 2141 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 2479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
| 2142 *p++ = (char)(0x80 | (ch & 0x3f)); | 2480 *p++ = (char)(0x80 | (ch & 0x3f)); |
| 2143 continue; | 2481 } else { |
| 2144 } | 2482 #endif |
| 2145 encodeUCS4: | 2483 Py_ssize_t newpos; |
| 2484 PyObject *rep; |
| 2485 Py_ssize_t repsize, k; |
| 2486 rep = unicode_encode_call_errorhandler |
| 2487 (errors, &errorHandler, "utf-8", "surrogates not allowed", |
| 2488 s, size, &exc, i-1, i, &newpos); |
| 2489 if (!rep) |
| 2490 goto error; |
| 2491 |
| 2492 if (PyBytes_Check(rep)) |
| 2493 repsize = PyBytes_GET_SIZE(rep); |
| 2494 else |
| 2495 repsize = PyUnicode_GET_SIZE(rep); |
| 2496 |
| 2497 if (repsize > 4) { |
| 2498 Py_ssize_t offset; |
| 2499 |
| 2500 if (result == NULL) |
| 2501 offset = p - stackbuf; |
| 2502 else |
| 2503 offset = p - PyBytes_AS_STRING(result); |
| 2504 |
| 2505 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { |
| 2506 /* integer overflow */ |
| 2507 PyErr_NoMemory(); |
| 2508 goto error; |
| 2509 } |
| 2510 nallocated += repsize - 4; |
| 2511 if (result != NULL) { |
| 2512 if (_PyBytes_Resize(&result, nallocated) < 0) |
| 2513 goto error; |
| 2514 } else { |
| 2515 result = PyBytes_FromStringAndSize(NULL, nallocated); |
| 2516 if (result == NULL) |
| 2517 goto error; |
| 2518 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); |
| 2519 } |
| 2520 p = PyBytes_AS_STRING(result) + offset; |
| 2521 } |
| 2522 |
| 2523 if (PyBytes_Check(rep)) { |
| 2524 char *prep = PyBytes_AS_STRING(rep); |
| 2525 for(k = repsize; k > 0; k--) |
| 2526 *p++ = *prep++; |
| 2527 } else /* rep is unicode */ { |
| 2528 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); |
| 2529 Py_UNICODE c; |
| 2530 |
| 2531 for(k=0; k<repsize; k++) { |
| 2532 c = prep[k]; |
| 2533 if (0x80 <= c) { |
| 2534 raise_encode_exception(&exc, "utf-8", s, size, |
| 2535 i-1, i, "surrogates not allowed"); |
| 2536 goto error; |
| 2537 } |
| 2538 *p++ = (char)prep[k]; |
| 2539 } |
| 2540 } |
| 2541 Py_DECREF(rep); |
| 2542 #ifndef Py_UNICODE_WIDE |
| 2543 } |
| 2544 #endif |
| 2545 } else if (ch < 0x10000) { |
| 2546 *p++ = (char)(0xe0 | (ch >> 12)); |
| 2547 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
| 2548 *p++ = (char)(0x80 | (ch & 0x3f)); |
| 2549 } else /* ch >= 0x10000 */ { |
| 2146 /* Encode UCS4 Unicode ordinals */ | 2550 /* Encode UCS4 Unicode ordinals */ |
| 2147 *p++ = (char)(0xf0 | (ch >> 18)); | 2551 *p++ = (char)(0xf0 | (ch >> 18)); |
| 2148 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 2552 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
| 2149 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 2553 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
| 2150 *p++ = (char)(0x80 | (ch & 0x3f)); | 2554 *p++ = (char)(0x80 | (ch & 0x3f)); |
| 2151 } | 2555 } |
| 2152 } | 2556 } |
| 2153 | 2557 |
| 2154 if (v == NULL) { | 2558 if (result == NULL) { |
| 2155 /* This was stack allocated. */ | 2559 /* This was stack allocated. */ |
| 2156 nneeded = p - stackbuf; | 2560 nneeded = p - stackbuf; |
| 2157 assert(nneeded <= nallocated); | 2561 assert(nneeded <= nallocated); |
| 2158 v = PyString_FromStringAndSize(stackbuf, nneeded); | 2562 result = PyBytes_FromStringAndSize(stackbuf, nneeded); |
| 2159 } | 2563 } |
| 2160 else { | 2564 else { |
| 2161 /* Cut back to size actually needed. */ | 2565 /* Cut back to size actually needed. */ |
| 2162 nneeded = p - PyString_AS_STRING(v); | 2566 nneeded = p - PyBytes_AS_STRING(result); |
| 2163 assert(nneeded <= nallocated); | 2567 assert(nneeded <= nallocated); |
| 2164 if (_PyString_Resize(&v, nneeded)) | 2568 _PyBytes_Resize(&result, nneeded); |
| 2165 return NULL; | 2569 } |
| 2166 } | 2570 Py_XDECREF(errorHandler); |
| 2167 return v; | 2571 Py_XDECREF(exc); |
| 2572 return result; |
| 2573 error: |
| 2574 Py_XDECREF(errorHandler); |
| 2575 Py_XDECREF(exc); |
| 2576 Py_XDECREF(result); |
| 2577 return NULL; |
| 2168 | 2578 |
| 2169 #undef MAX_SHORT_UNICHARS | 2579 #undef MAX_SHORT_UNICHARS |
| 2170 } | 2580 } |
| 2171 | 2581 |
| 2172 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) | 2582 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) |
| 2173 { | 2583 { |
| 2174 if (!PyUnicode_Check(unicode)) { | 2584 if (!PyUnicode_Check(unicode)) { |
| 2175 PyErr_BadArgument(); | 2585 PyErr_BadArgument(); |
| 2176 return NULL; | 2586 return NULL; |
| 2177 } | 2587 } |
| (...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2323 else | 2733 else |
| 2324 #endif | 2734 #endif |
| 2325 *p++ = ch; | 2735 *p++ = ch; |
| 2326 q += 4; | 2736 q += 4; |
| 2327 continue; | 2737 continue; |
| 2328 utf32Error: | 2738 utf32Error: |
| 2329 outpos = p-PyUnicode_AS_UNICODE(unicode); | 2739 outpos = p-PyUnicode_AS_UNICODE(unicode); |
| 2330 if (unicode_decode_call_errorhandler( | 2740 if (unicode_decode_call_errorhandler( |
| 2331 errors, &errorHandler, | 2741 errors, &errorHandler, |
| 2332 "utf32", errmsg, | 2742 "utf32", errmsg, |
| 2333 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, | 2743 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, |
| 2334 &unicode, &outpos, &p)) | 2744 &unicode, &outpos, &p)) |
| 2335 goto onError; | 2745 goto onError; |
| 2336 } | 2746 } |
| 2337 | 2747 |
| 2338 if (byteorder) | 2748 if (byteorder) |
| 2339 *byteorder = bo; | 2749 *byteorder = bo; |
| 2340 | 2750 |
| 2341 if (consumed) | 2751 if (consumed) |
| 2342 *consumed = (const char *)q-starts; | 2752 *consumed = (const char *)q-starts; |
| 2343 | 2753 |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2391 #ifndef Py_UNICODE_WIDE | 2801 #ifndef Py_UNICODE_WIDE |
| 2392 for (i = pairs = 0; i < size-1; i++) | 2802 for (i = pairs = 0; i < size-1; i++) |
| 2393 if (0xD800 <= s[i] && s[i] <= 0xDBFF && | 2803 if (0xD800 <= s[i] && s[i] <= 0xDBFF && |
| 2394 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) | 2804 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) |
| 2395 pairs++; | 2805 pairs++; |
| 2396 #endif | 2806 #endif |
| 2397 nsize = (size - pairs + (byteorder == 0)); | 2807 nsize = (size - pairs + (byteorder == 0)); |
| 2398 bytesize = nsize * 4; | 2808 bytesize = nsize * 4; |
| 2399 if (bytesize / 4 != nsize) | 2809 if (bytesize / 4 != nsize) |
| 2400 return PyErr_NoMemory(); | 2810 return PyErr_NoMemory(); |
| 2401 v = PyString_FromStringAndSize(NULL, bytesize); | 2811 v = PyBytes_FromStringAndSize(NULL, bytesize); |
| 2402 if (v == NULL) | 2812 if (v == NULL) |
| 2403 return NULL; | 2813 return NULL; |
| 2404 | 2814 |
| 2405 p = (unsigned char *)PyString_AS_STRING(v); | 2815 p = (unsigned char *)PyBytes_AS_STRING(v); |
| 2406 if (byteorder == 0) | 2816 if (byteorder == 0) |
| 2407 STORECHAR(0xFEFF); | 2817 STORECHAR(0xFEFF); |
| 2408 if (size == 0) | 2818 if (size == 0) |
| 2409 return v; | 2819 goto done; |
| 2410 | 2820 |
| 2411 if (byteorder == -1) { | 2821 if (byteorder == -1) { |
| 2412 /* force LE */ | 2822 /* force LE */ |
| 2413 iorder[0] = 0; | 2823 iorder[0] = 0; |
| 2414 iorder[1] = 1; | 2824 iorder[1] = 1; |
| 2415 iorder[2] = 2; | 2825 iorder[2] = 2; |
| 2416 iorder[3] = 3; | 2826 iorder[3] = 3; |
| 2417 } | 2827 } |
| 2418 else if (byteorder == 1) { | 2828 else if (byteorder == 1) { |
| 2419 /* force BE */ | 2829 /* force BE */ |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2430 Py_UCS4 ch2 = *s; | 2840 Py_UCS4 ch2 = *s; |
| 2431 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | 2841 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { |
| 2432 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; | 2842 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; |
| 2433 s++; | 2843 s++; |
| 2434 size--; | 2844 size--; |
| 2435 } | 2845 } |
| 2436 } | 2846 } |
| 2437 #endif | 2847 #endif |
| 2438 STORECHAR(ch); | 2848 STORECHAR(ch); |
| 2439 } | 2849 } |
| 2850 |
| 2851 done: |
| 2440 return v; | 2852 return v; |
| 2441 #undef STORECHAR | 2853 #undef STORECHAR |
| 2442 } | 2854 } |
| 2443 | 2855 |
| 2444 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) | 2856 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) |
| 2445 { | 2857 { |
| 2446 if (!PyUnicode_Check(unicode)) { | 2858 if (!PyUnicode_Check(unicode)) { |
| 2447 PyErr_BadArgument(); | 2859 PyErr_BadArgument(); |
| 2448 return NULL; | 2860 return NULL; |
| 2449 } | 2861 } |
| 2450 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), | 2862 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), |
| 2451 PyUnicode_GET_SIZE(unicode), | 2863 PyUnicode_GET_SIZE(unicode), |
| 2452 NULL, | 2864 NULL, |
| 2453 0); | 2865 0); |
| 2454 } | 2866 } |
| 2455 | 2867 |
| 2456 /* --- UTF-16 Codec ------------------------------------------------------- */ | 2868 /* --- UTF-16 Codec ------------------------------------------------------- */ |
| 2457 | 2869 |
| 2458 PyObject * | 2870 PyObject * |
| 2459 PyUnicode_DecodeUTF16(const char *s, | 2871 PyUnicode_DecodeUTF16(const char *s, |
| 2460 Py_ssize_t size, | 2872 Py_ssize_t size, |
| 2461 const char *errors, | 2873 const char *errors, |
| 2462 int *byteorder) | 2874 int *byteorder) |
| 2463 { | 2875 { |
| 2464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); | 2876 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); |
| 2465 } | 2877 } |
| 2878 |
| 2879 /* Two masks for fast checking of whether a C 'long' may contain |
| 2880 UTF16-encoded surrogate characters. This is an efficient heuristic, |
| 2881 assuming that non-surrogate characters with a code point >= 0x8000 are |
| 2882 rare in most input. |
| 2883 FAST_CHAR_MASK is used when the input is in native byte ordering, |
| 2884 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. |
| 2885 */ |
| 2886 #if (SIZEOF_LONG == 8) |
| 2887 # define FAST_CHAR_MASK 0x8000800080008000L |
| 2888 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L |
| 2889 #elif (SIZEOF_LONG == 4) |
| 2890 # define FAST_CHAR_MASK 0x80008000L |
| 2891 # define SWAPPED_FAST_CHAR_MASK 0x00800080L |
| 2892 #else |
| 2893 # error C 'long' size should be either 4 or 8! |
| 2894 #endif |
| 2466 | 2895 |
| 2467 PyObject * | 2896 PyObject * |
| 2468 PyUnicode_DecodeUTF16Stateful(const char *s, | 2897 PyUnicode_DecodeUTF16Stateful(const char *s, |
| 2469 Py_ssize_t size, | 2898 Py_ssize_t size, |
| 2470 const char *errors, | 2899 const char *errors, |
| 2471 int *byteorder, | 2900 int *byteorder, |
| 2472 Py_ssize_t *consumed) | 2901 Py_ssize_t *consumed) |
| 2473 { | 2902 { |
| 2474 const char *starts = s; | 2903 const char *starts = s; |
| 2475 Py_ssize_t startinpos; | 2904 Py_ssize_t startinpos; |
| 2476 Py_ssize_t endinpos; | 2905 Py_ssize_t endinpos; |
| 2477 Py_ssize_t outpos; | 2906 Py_ssize_t outpos; |
| 2478 PyUnicodeObject *unicode; | 2907 PyUnicodeObject *unicode; |
| 2479 Py_UNICODE *p; | 2908 Py_UNICODE *p; |
| 2480 const unsigned char *q, *e; | 2909 const unsigned char *q, *e, *aligned_end; |
| 2481 int bo = 0; /* assume native ordering by default */ | 2910 int bo = 0; /* assume native ordering by default */ |
| 2911 int native_ordering = 0; |
| 2482 const char *errmsg = ""; | 2912 const char *errmsg = ""; |
| 2483 /* Offsets from q for retrieving byte pairs in the right order. */ | 2913 /* Offsets from q for retrieving byte pairs in the right order. */ |
| 2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN | 2914 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
| 2485 int ihi = 1, ilo = 0; | 2915 int ihi = 1, ilo = 0; |
| 2486 #else | 2916 #else |
| 2487 int ihi = 0, ilo = 1; | 2917 int ihi = 0, ilo = 1; |
| 2488 #endif | 2918 #endif |
| 2489 PyObject *errorHandler = NULL; | 2919 PyObject *errorHandler = NULL; |
| 2490 PyObject *exc = NULL; | 2920 PyObject *exc = NULL; |
| 2491 | 2921 |
| 2492 /* Note: size will always be longer than the resulting Unicode | 2922 /* Note: size will always be longer than the resulting Unicode |
| 2493 character count */ | 2923 character count */ |
| 2494 unicode = _PyUnicode_New(size); | 2924 unicode = _PyUnicode_New(size); |
| 2495 if (!unicode) | 2925 if (!unicode) |
| 2496 return NULL; | 2926 return NULL; |
| 2497 if (size == 0) | 2927 if (size == 0) |
| 2498 return (PyObject *)unicode; | 2928 return (PyObject *)unicode; |
| 2499 | 2929 |
| 2500 /* Unpack UTF-16 encoded data */ | 2930 /* Unpack UTF-16 encoded data */ |
| 2501 p = unicode->str; | 2931 p = unicode->str; |
| 2502 q = (unsigned char *)s; | 2932 q = (unsigned char *)s; |
| 2503 e = q + size; | 2933 e = q + size - 1; |
| 2504 | 2934 |
| 2505 if (byteorder) | 2935 if (byteorder) |
| 2506 bo = *byteorder; | 2936 bo = *byteorder; |
| 2507 | 2937 |
| 2508 /* Check for BOM marks (U+FEFF) in the input and adjust current | 2938 /* Check for BOM marks (U+FEFF) in the input and adjust current |
| 2509 byte order setting accordingly. In native mode, the leading BOM | 2939 byte order setting accordingly. In native mode, the leading BOM |
| 2510 mark is skipped, in all other modes, it is copied to the output | 2940 mark is skipped, in all other modes, it is copied to the output |
| 2511 stream as-is (giving a ZWNBSP character). */ | 2941 stream as-is (giving a ZWNBSP character). */ |
| 2512 if (bo == 0) { | 2942 if (bo == 0) { |
| 2513 if (size >= 2) { | 2943 if (size >= 2) { |
| (...skipping 23 matching lines...) Expand all Loading... |
| 2537 if (bo == -1) { | 2967 if (bo == -1) { |
| 2538 /* force LE */ | 2968 /* force LE */ |
| 2539 ihi = 1; | 2969 ihi = 1; |
| 2540 ilo = 0; | 2970 ilo = 0; |
| 2541 } | 2971 } |
| 2542 else if (bo == 1) { | 2972 else if (bo == 1) { |
| 2543 /* force BE */ | 2973 /* force BE */ |
| 2544 ihi = 0; | 2974 ihi = 0; |
| 2545 ilo = 1; | 2975 ilo = 1; |
| 2546 } | 2976 } |
| 2547 | 2977 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
| 2978 native_ordering = ilo < ihi; |
| 2979 #else |
| 2980 native_ordering = ilo > ihi; |
| 2981 #endif |
| 2982 |
| 2983 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); |
| 2548 while (q < e) { | 2984 while (q < e) { |
| 2549 Py_UNICODE ch; | 2985 Py_UNICODE ch; |
| 2550 /* remaining bytes at the end? (size should be even) */ | 2986 /* First check for possible aligned read of a C 'long'. Unaligned |
| 2551 if (e-q<2) { | 2987 reads are more expensive, better to defer to another iteration. */ |
| 2552 if (consumed) | 2988 if (!((size_t) q & LONG_PTR_MASK)) { |
| 2989 /* Fast path for runs of non-surrogate chars. */ |
| 2990 register const unsigned char *_q = q; |
| 2991 Py_UNICODE *_p = p; |
| 2992 if (native_ordering) { |
| 2993 /* Native ordering is simple: as long as the input cannot |
| 2994 possibly contain a surrogate char, do an unrolled copy |
| 2995 of several 16-bit code points to the target object. |
| 2996 The non-surrogate check is done on several input bytes |
| 2997 at a time (as many as a C 'long' can contain). */ |
| 2998 while (_q < aligned_end) { |
| 2999 unsigned long data = * (unsigned long *) _q; |
| 3000 if (data & FAST_CHAR_MASK) |
| 3001 break; |
| 3002 _p[0] = ((unsigned short *) _q)[0]; |
| 3003 _p[1] = ((unsigned short *) _q)[1]; |
| 3004 #if (SIZEOF_LONG == 8) |
| 3005 _p[2] = ((unsigned short *) _q)[2]; |
| 3006 _p[3] = ((unsigned short *) _q)[3]; |
| 3007 #endif |
| 3008 _q += SIZEOF_LONG; |
| 3009 _p += SIZEOF_LONG / 2; |
| 3010 } |
| 3011 } |
| 3012 else { |
| 3013 /* Byteswapped ordering is similar, but we must decompose |
| 3014 the copy bytewise, and take care of zero'ing out the |
| 3015 upper bytes if the target object is in 32-bit units |
| 3016 (that is, in UCS-4 builds). */ |
| 3017 while (_q < aligned_end) { |
| 3018 unsigned long data = * (unsigned long *) _q; |
| 3019 if (data & SWAPPED_FAST_CHAR_MASK) |
| 3020 break; |
| 3021 /* Zero upper bytes in UCS-4 builds */ |
| 3022 #if (Py_UNICODE_SIZE > 2) |
| 3023 _p[0] = 0; |
| 3024 _p[1] = 0; |
| 3025 #if (SIZEOF_LONG == 8) |
| 3026 _p[2] = 0; |
| 3027 _p[3] = 0; |
| 3028 #endif |
| 3029 #endif |
| 3030 /* Issue #4916; UCS-4 builds on big endian machines must |
| 3031 fill the two last bytes of each 4-byte unit. */ |
| 3032 #if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) |
| 3033 # define OFF 2 |
| 3034 #else |
| 3035 # define OFF 0 |
| 3036 #endif |
| 3037 ((unsigned char *) _p)[OFF + 1] = _q[0]; |
| 3038 ((unsigned char *) _p)[OFF + 0] = _q[1]; |
| 3039 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; |
| 3040 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; |
| 3041 #if (SIZEOF_LONG == 8) |
| 3042 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4
]; |
| 3043 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5
]; |
| 3044 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6
]; |
| 3045 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7
]; |
| 3046 #endif |
| 3047 #undef OFF |
| 3048 _q += SIZEOF_LONG; |
| 3049 _p += SIZEOF_LONG / 2; |
| 3050 } |
| 3051 } |
| 3052 p = _p; |
| 3053 q = _q; |
| 3054 if (q >= e) |
| 2553 break; | 3055 break; |
| 2554 errmsg = "truncated data"; | |
| 2555 startinpos = ((const char *)q)-starts; | |
| 2556 endinpos = ((const char *)e)-starts; | |
| 2557 goto utf16Error; | |
| 2558 /* The remaining input chars are ignored if the callback | |
| 2559 chooses to skip the input */ | |
| 2560 } | 3056 } |
| 2561 ch = (q[ihi] << 8) | q[ilo]; | 3057 ch = (q[ihi] << 8) | q[ilo]; |
| 2562 | 3058 |
| 2563 q += 2; | 3059 q += 2; |
| 2564 | 3060 |
| 2565 if (ch < 0xD800 || ch > 0xDFFF) { | 3061 if (ch < 0xD800 || ch > 0xDFFF) { |
| 2566 *p++ = ch; | 3062 *p++ = ch; |
| 2567 continue; | 3063 continue; |
| 2568 } | 3064 } |
| 2569 | 3065 |
| 2570 /* UTF-16 code pair: */ | 3066 /* UTF-16 code pair: */ |
| 2571 if (q >= e) { | 3067 if (q > e) { |
| 2572 errmsg = "unexpected end of data"; | 3068 errmsg = "unexpected end of data"; |
| 2573 startinpos = (((const char *)q)-2)-starts; | 3069 startinpos = (((const char *)q) - 2) - starts; |
| 2574 endinpos = ((const char *)e)-starts; | 3070 endinpos = ((const char *)e) + 1 - starts; |
| 2575 goto utf16Error; | 3071 goto utf16Error; |
| 2576 } | 3072 } |
| 2577 if (0xD800 <= ch && ch <= 0xDBFF) { | 3073 if (0xD800 <= ch && ch <= 0xDBFF) { |
| 2578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; | 3074 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; |
| 2579 q += 2; | 3075 q += 2; |
| 2580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { | 3076 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { |
| 2581 #ifndef Py_UNICODE_WIDE | 3077 #ifndef Py_UNICODE_WIDE |
| 2582 *p++ = ch; | 3078 *p++ = ch; |
| 2583 *p++ = ch2; | 3079 *p++ = ch2; |
| 2584 #else | 3080 #else |
| 2585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; | 3081 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; |
| 2586 #endif | 3082 #endif |
| 2587 continue; | 3083 continue; |
| 2588 } | 3084 } |
| 2589 else { | 3085 else { |
| 2590 errmsg = "illegal UTF-16 surrogate"; | 3086 errmsg = "illegal UTF-16 surrogate"; |
| 2591 startinpos = (((const char *)q)-4)-starts; | 3087 startinpos = (((const char *)q)-4)-starts; |
| 2592 endinpos = startinpos+2; | 3088 endinpos = startinpos+2; |
| 2593 goto utf16Error; | 3089 goto utf16Error; |
| 2594 } | 3090 } |
| 2595 | 3091 |
| 2596 } | 3092 } |
| 2597 errmsg = "illegal encoding"; | 3093 errmsg = "illegal encoding"; |
| 2598 startinpos = (((const char *)q)-2)-starts; | 3094 startinpos = (((const char *)q)-2)-starts; |
| 2599 endinpos = startinpos+2; | 3095 endinpos = startinpos+2; |
| 2600 /* Fall through to report the error */ | 3096 /* Fall through to report the error */ |
| 2601 | 3097 |
| 2602 utf16Error: | 3098 utf16Error: |
| 2603 outpos = p-PyUnicode_AS_UNICODE(unicode); | 3099 outpos = p - PyUnicode_AS_UNICODE(unicode); |
| 2604 if (unicode_decode_call_errorhandler( | 3100 if (unicode_decode_call_errorhandler( |
| 2605 errors, &errorHandler, | 3101 errors, |
| 3102 &errorHandler, |
| 2606 "utf16", errmsg, | 3103 "utf16", errmsg, |
| 2607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, | 3104 &starts, |
| 2608 &unicode, &outpos, &p)) | 3105 (const char **)&e, |
| 3106 &startinpos, |
| 3107 &endinpos, |
| 3108 &exc, |
| 3109 (const char **)&q, |
| 3110 &unicode, |
| 3111 &outpos, |
| 3112 &p)) |
| 2609 goto onError; | 3113 goto onError; |
| 3114 } |
| 3115 /* remaining byte at the end? (size should be even) */ |
| 3116 if (e == q) { |
| 3117 if (!consumed) { |
| 3118 errmsg = "truncated data"; |
| 3119 startinpos = ((const char *)q) - starts; |
| 3120 endinpos = ((const char *)e) + 1 - starts; |
| 3121 outpos = p - PyUnicode_AS_UNICODE(unicode); |
| 3122 if (unicode_decode_call_errorhandler( |
| 3123 errors, |
| 3124 &errorHandler, |
| 3125 "utf16", errmsg, |
| 3126 &starts, |
| 3127 (const char **)&e, |
| 3128 &startinpos, |
| 3129 &endinpos, |
| 3130 &exc, |
| 3131 (const char **)&q, |
| 3132 &unicode, |
| 3133 &outpos, |
| 3134 &p)) |
| 3135 goto onError; |
| 3136 /* The remaining input chars are ignored if the callback |
| 3137 chooses to skip the input */ |
| 3138 } |
| 2610 } | 3139 } |
| 2611 | 3140 |
| 2612 if (byteorder) | 3141 if (byteorder) |
| 2613 *byteorder = bo; | 3142 *byteorder = bo; |
| 2614 | 3143 |
| 2615 if (consumed) | 3144 if (consumed) |
| 2616 *consumed = (const char *)q-starts; | 3145 *consumed = (const char *)q-starts; |
| 2617 | 3146 |
| 2618 /* Adjust length */ | 3147 /* Adjust length */ |
| 2619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) | 3148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) |
| 2620 goto onError; | 3149 goto onError; |
| 2621 | 3150 |
| 2622 Py_XDECREF(errorHandler); | 3151 Py_XDECREF(errorHandler); |
| 2623 Py_XDECREF(exc); | 3152 Py_XDECREF(exc); |
| 2624 return (PyObject *)unicode; | 3153 return (PyObject *)unicode; |
| 2625 | 3154 |
| 2626 onError: | 3155 onError: |
| 2627 Py_DECREF(unicode); | 3156 Py_DECREF(unicode); |
| 2628 Py_XDECREF(errorHandler); | 3157 Py_XDECREF(errorHandler); |
| 2629 Py_XDECREF(exc); | 3158 Py_XDECREF(exc); |
| 2630 return NULL; | 3159 return NULL; |
| 2631 } | 3160 } |
| 3161 |
| 3162 #undef FAST_CHAR_MASK |
| 3163 #undef SWAPPED_FAST_CHAR_MASK |
| 2632 | 3164 |
| 2633 PyObject * | 3165 PyObject * |
| 2634 PyUnicode_EncodeUTF16(const Py_UNICODE *s, | 3166 PyUnicode_EncodeUTF16(const Py_UNICODE *s, |
| 2635 Py_ssize_t size, | 3167 Py_ssize_t size, |
| 2636 const char *errors, | 3168 const char *errors, |
| 2637 int byteorder) | 3169 int byteorder) |
| 2638 { | 3170 { |
| 2639 PyObject *v; | 3171 PyObject *v; |
| 2640 unsigned char *p; | 3172 unsigned char *p; |
| 2641 Py_ssize_t nsize, bytesize; | 3173 Py_ssize_t nsize, bytesize; |
| (...skipping 22 matching lines...) Expand all Loading... |
| 2664 pairs++; | 3196 pairs++; |
| 2665 #endif | 3197 #endif |
| 2666 /* 2 * (size + pairs + (byteorder == 0)) */ | 3198 /* 2 * (size + pairs + (byteorder == 0)) */ |
| 2667 if (size > PY_SSIZE_T_MAX || | 3199 if (size > PY_SSIZE_T_MAX || |
| 2668 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) | 3200 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) |
| 2669 return PyErr_NoMemory(); | 3201 return PyErr_NoMemory(); |
| 2670 nsize = size + pairs + (byteorder == 0); | 3202 nsize = size + pairs + (byteorder == 0); |
| 2671 bytesize = nsize * 2; | 3203 bytesize = nsize * 2; |
| 2672 if (bytesize / 2 != nsize) | 3204 if (bytesize / 2 != nsize) |
| 2673 return PyErr_NoMemory(); | 3205 return PyErr_NoMemory(); |
| 2674 v = PyString_FromStringAndSize(NULL, bytesize); | 3206 v = PyBytes_FromStringAndSize(NULL, bytesize); |
| 2675 if (v == NULL) | 3207 if (v == NULL) |
| 2676 return NULL; | 3208 return NULL; |
| 2677 | 3209 |
| 2678 p = (unsigned char *)PyString_AS_STRING(v); | 3210 p = (unsigned char *)PyBytes_AS_STRING(v); |
| 2679 if (byteorder == 0) | 3211 if (byteorder == 0) |
| 2680 STORECHAR(0xFEFF); | 3212 STORECHAR(0xFEFF); |
| 2681 if (size == 0) | 3213 if (size == 0) |
| 2682 return v; | 3214 goto done; |
| 2683 | 3215 |
| 2684 if (byteorder == -1) { | 3216 if (byteorder == -1) { |
| 2685 /* force LE */ | 3217 /* force LE */ |
| 2686 ihi = 1; | 3218 ihi = 1; |
| 2687 ilo = 0; | 3219 ilo = 0; |
| 2688 } | 3220 } |
| 2689 else if (byteorder == 1) { | 3221 else if (byteorder == 1) { |
| 2690 /* force BE */ | 3222 /* force BE */ |
| 2691 ihi = 0; | 3223 ihi = 0; |
| 2692 ilo = 1; | 3224 ilo = 1; |
| 2693 } | 3225 } |
| 2694 | 3226 |
| 2695 while (size-- > 0) { | 3227 while (size-- > 0) { |
| 2696 Py_UNICODE ch = *s++; | 3228 Py_UNICODE ch = *s++; |
| 2697 Py_UNICODE ch2 = 0; | 3229 Py_UNICODE ch2 = 0; |
| 2698 #ifdef Py_UNICODE_WIDE | 3230 #ifdef Py_UNICODE_WIDE |
| 2699 if (ch >= 0x10000) { | 3231 if (ch >= 0x10000) { |
| 2700 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); | 3232 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); |
| 2701 ch = 0xD800 | ((ch-0x10000) >> 10); | 3233 ch = 0xD800 | ((ch-0x10000) >> 10); |
| 2702 } | 3234 } |
| 2703 #endif | 3235 #endif |
| 2704 STORECHAR(ch); | 3236 STORECHAR(ch); |
| 2705 if (ch2) | 3237 if (ch2) |
| 2706 STORECHAR(ch2); | 3238 STORECHAR(ch2); |
| 2707 } | 3239 } |
| 3240 |
| 3241 done: |
| 2708 return v; | 3242 return v; |
| 2709 #undef STORECHAR | 3243 #undef STORECHAR |
| 2710 } | 3244 } |
| 2711 | 3245 |
| 2712 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) | 3246 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) |
| 2713 { | 3247 { |
| 2714 if (!PyUnicode_Check(unicode)) { | 3248 if (!PyUnicode_Check(unicode)) { |
| 2715 PyErr_BadArgument(); | 3249 PyErr_BadArgument(); |
| 2716 return NULL; | 3250 return NULL; |
| 2717 } | 3251 } |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2818 digits = 8; | 3352 digits = 8; |
| 2819 message = "truncated \\UXXXXXXXX escape"; | 3353 message = "truncated \\UXXXXXXXX escape"; |
| 2820 hexescape: | 3354 hexescape: |
| 2821 chr = 0; | 3355 chr = 0; |
| 2822 outpos = p-PyUnicode_AS_UNICODE(v); | 3356 outpos = p-PyUnicode_AS_UNICODE(v); |
| 2823 if (s+digits>end) { | 3357 if (s+digits>end) { |
| 2824 endinpos = size; | 3358 endinpos = size; |
| 2825 if (unicode_decode_call_errorhandler( | 3359 if (unicode_decode_call_errorhandler( |
| 2826 errors, &errorHandler, | 3360 errors, &errorHandler, |
| 2827 "unicodeescape", "end of string in escape sequence", | 3361 "unicodeescape", "end of string in escape sequence", |
| 2828 starts, size, &startinpos, &endinpos, &exc, &s, | 3362 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 2829 &v, &outpos, &p)) | 3363 &v, &outpos, &p)) |
| 2830 goto onError; | 3364 goto onError; |
| 2831 goto nextByte; | 3365 goto nextByte; |
| 2832 } | 3366 } |
| 2833 for (i = 0; i < digits; ++i) { | 3367 for (i = 0; i < digits; ++i) { |
| 2834 c = (unsigned char) s[i]; | 3368 c = (unsigned char) s[i]; |
| 2835 if (!isxdigit(c)) { | 3369 if (!ISXDIGIT(c)) { |
| 2836 endinpos = (s+i+1)-starts; | 3370 endinpos = (s+i+1)-starts; |
| 2837 if (unicode_decode_call_errorhandler( | 3371 if (unicode_decode_call_errorhandler( |
| 2838 errors, &errorHandler, | 3372 errors, &errorHandler, |
| 2839 "unicodeescape", message, | 3373 "unicodeescape", message, |
| 2840 starts, size, &startinpos, &endinpos, &exc, &s, | 3374 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 2841 &v, &outpos, &p)) | 3375 &v, &outpos, &p)) |
| 2842 goto onError; | 3376 goto onError; |
| 2843 goto nextByte; | 3377 goto nextByte; |
| 2844 } | 3378 } |
| 2845 chr = (chr<<4) & ~0xF; | 3379 chr = (chr<<4) & ~0xF; |
| 2846 if (c >= '0' && c <= '9') | 3380 if (c >= '0' && c <= '9') |
| 2847 chr += c - '0'; | 3381 chr += c - '0'; |
| 2848 else if (c >= 'a' && c <= 'f') | 3382 else if (c >= 'a' && c <= 'f') |
| 2849 chr += 10 + c - 'a'; | 3383 chr += 10 + c - 'a'; |
| 2850 else | 3384 else |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2869 chr -= 0x10000L; | 3403 chr -= 0x10000L; |
| 2870 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); | 3404 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); |
| 2871 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); | 3405 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); |
| 2872 #endif | 3406 #endif |
| 2873 } else { | 3407 } else { |
| 2874 endinpos = s-starts; | 3408 endinpos = s-starts; |
| 2875 outpos = p-PyUnicode_AS_UNICODE(v); | 3409 outpos = p-PyUnicode_AS_UNICODE(v); |
| 2876 if (unicode_decode_call_errorhandler( | 3410 if (unicode_decode_call_errorhandler( |
| 2877 errors, &errorHandler, | 3411 errors, &errorHandler, |
| 2878 "unicodeescape", "illegal Unicode character", | 3412 "unicodeescape", "illegal Unicode character", |
| 2879 starts, size, &startinpos, &endinpos, &exc, &s, | 3413 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 2880 &v, &outpos, &p)) | 3414 &v, &outpos, &p)) |
| 2881 goto onError; | 3415 goto onError; |
| 2882 } | 3416 } |
| 2883 break; | 3417 break; |
| 2884 | 3418 |
| 2885 /* \N{name} */ | 3419 /* \N{name} */ |
| 2886 case 'N': | 3420 case 'N': |
| 2887 message = "malformed \\N character escape"; | 3421 message = "malformed \\N character escape"; |
| 2888 if (ucnhash_CAPI == NULL) { | 3422 if (ucnhash_CAPI == NULL) { |
| 2889 /* load the unicode data module */ | 3423 /* load the unicode data module */ |
| (...skipping 12 matching lines...) Expand all Loading... |
| 2902 s++; | 3436 s++; |
| 2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &ch
r)) | 3437 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &ch
r)) |
| 2904 goto store; | 3438 goto store; |
| 2905 } | 3439 } |
| 2906 } | 3440 } |
| 2907 endinpos = s-starts; | 3441 endinpos = s-starts; |
| 2908 outpos = p-PyUnicode_AS_UNICODE(v); | 3442 outpos = p-PyUnicode_AS_UNICODE(v); |
| 2909 if (unicode_decode_call_errorhandler( | 3443 if (unicode_decode_call_errorhandler( |
| 2910 errors, &errorHandler, | 3444 errors, &errorHandler, |
| 2911 "unicodeescape", message, | 3445 "unicodeescape", message, |
| 2912 starts, size, &startinpos, &endinpos, &exc, &s, | 3446 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 2913 &v, &outpos, &p)) | 3447 &v, &outpos, &p)) |
| 2914 goto onError; | 3448 goto onError; |
| 2915 break; | 3449 break; |
| 2916 | 3450 |
| 2917 default: | 3451 default: |
| 2918 if (s > end) { | 3452 if (s > end) { |
| 2919 message = "\\ at end of string"; | 3453 message = "\\ at end of string"; |
| 2920 s--; | 3454 s--; |
| 2921 endinpos = s-starts; | 3455 endinpos = s-starts; |
| 2922 outpos = p-PyUnicode_AS_UNICODE(v); | 3456 outpos = p-PyUnicode_AS_UNICODE(v); |
| 2923 if (unicode_decode_call_errorhandler( | 3457 if (unicode_decode_call_errorhandler( |
| 2924 errors, &errorHandler, | 3458 errors, &errorHandler, |
| 2925 "unicodeescape", message, | 3459 "unicodeescape", message, |
| 2926 starts, size, &startinpos, &endinpos, &exc, &s, | 3460 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 2927 &v, &outpos, &p)) | 3461 &v, &outpos, &p)) |
| 2928 goto onError; | 3462 goto onError; |
| 2929 } | 3463 } |
| 2930 else { | 3464 else { |
| 2931 *p++ = '\\'; | 3465 *p++ = '\\'; |
| 2932 *p++ = (unsigned char)s[-1]; | 3466 *p++ = (unsigned char)s[-1]; |
| 2933 } | 3467 } |
| 2934 break; | 3468 break; |
| 2935 } | 3469 } |
| 2936 nextByte: | 3470 nextByte: |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2974 | 3508 |
| 2975 while (size-- > 0) { | 3509 while (size-- > 0) { |
| 2976 if (*s == ch) | 3510 if (*s == ch) |
| 2977 return s; | 3511 return s; |
| 2978 s++; | 3512 s++; |
| 2979 } | 3513 } |
| 2980 | 3514 |
| 2981 return NULL; | 3515 return NULL; |
| 2982 } | 3516 } |
| 2983 | 3517 |
| 2984 static | 3518 static const char *hexdigits = "0123456789abcdef"; |
| 2985 PyObject *unicodeescape_string(const Py_UNICODE *s, | 3519 |
| 2986 Py_ssize_t size, | 3520 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, |
| 2987 int quotes) | 3521 Py_ssize_t size) |
| 2988 { | 3522 { |
| 2989 PyObject *repr; | 3523 PyObject *repr; |
| 2990 char *p; | 3524 char *p; |
| 2991 | 3525 |
| 2992 static const char *hexdigit = "0123456789abcdef"; | |
| 2993 #ifdef Py_UNICODE_WIDE | 3526 #ifdef Py_UNICODE_WIDE |
| 2994 const Py_ssize_t expandsize = 10; | 3527 const Py_ssize_t expandsize = 10; |
| 2995 #else | 3528 #else |
| 2996 const Py_ssize_t expandsize = 6; | 3529 const Py_ssize_t expandsize = 6; |
| 2997 #endif | 3530 #endif |
| 2998 | 3531 |
| 2999 /* XXX(nnorwitz): rather than over-allocating, it would be | 3532 /* XXX(nnorwitz): rather than over-allocating, it would be |
| 3000 better to choose a different scheme. Perhaps scan the | 3533 better to choose a different scheme. Perhaps scan the |
| 3001 first N-chars of the string and allocate based on that size. | 3534 first N-chars of the string and allocate based on that size. |
| 3002 */ | 3535 */ |
| 3003 /* Initial allocation is based on the longest-possible unichr | 3536 /* Initial allocation is based on the longest-possible unichr |
| 3004 escape. | 3537 escape. |
| 3005 | 3538 |
| 3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source | 3539 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source |
| 3007 unichr, so in this case it's the longest unichr escape. In | 3540 unichr, so in this case it's the longest unichr escape. In |
| 3008 narrow (UTF-16) builds this is five chars per source unichr | 3541 narrow (UTF-16) builds this is five chars per source unichr |
| 3009 since there are two unichrs in the surrogate pair, so in narrow | 3542 since there are two unichrs in the surrogate pair, so in narrow |
| 3010 (UTF-16) builds it's not the longest unichr escape. | 3543 (UTF-16) builds it's not the longest unichr escape. |
| 3011 | 3544 |
| 3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, | 3545 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, |
| 3013 so in the narrow (UTF-16) build case it's the longest unichr | 3546 so in the narrow (UTF-16) build case it's the longest unichr |
| 3014 escape. | 3547 escape. |
| 3015 */ | 3548 */ |
| 3016 | 3549 |
| 3550 if (size == 0) |
| 3551 return PyBytes_FromStringAndSize(NULL, 0); |
| 3552 |
| 3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) | 3553 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) |
| 3018 return PyErr_NoMemory(); | 3554 return PyErr_NoMemory(); |
| 3019 | 3555 |
| 3020 repr = PyString_FromStringAndSize(NULL, | 3556 repr = PyBytes_FromStringAndSize(NULL, |
| 3021 2 | 3557 2 |
| 3022 + expandsize*size | 3558 + expandsize*size |
| 3023 + 1); | 3559 + 1); |
| 3024 if (repr == NULL) | 3560 if (repr == NULL) |
| 3025 return NULL; | 3561 return NULL; |
| 3026 | 3562 |
| 3027 p = PyString_AS_STRING(repr); | 3563 p = PyBytes_AS_STRING(repr); |
| 3028 | 3564 |
| 3029 if (quotes) { | |
| 3030 *p++ = 'u'; | |
| 3031 *p++ = (findchar(s, size, '\'') && | |
| 3032 !findchar(s, size, '"')) ? '"' : '\''; | |
| 3033 } | |
| 3034 while (size-- > 0) { | 3565 while (size-- > 0) { |
| 3035 Py_UNICODE ch = *s++; | 3566 Py_UNICODE ch = *s++; |
| 3036 | 3567 |
| 3037 /* Escape quotes and backslashes */ | 3568 /* Escape backslashes */ |
| 3038 if ((quotes && | 3569 if (ch == '\\') { |
| 3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { | |
| 3040 *p++ = '\\'; | 3570 *p++ = '\\'; |
| 3041 *p++ = (char) ch; | 3571 *p++ = (char) ch; |
| 3042 continue; | 3572 continue; |
| 3043 } | 3573 } |
| 3044 | 3574 |
| 3045 #ifdef Py_UNICODE_WIDE | 3575 #ifdef Py_UNICODE_WIDE |
| 3046 /* Map 21-bit characters to '\U00xxxxxx' */ | 3576 /* Map 21-bit characters to '\U00xxxxxx' */ |
| 3047 else if (ch >= 0x10000) { | 3577 else if (ch >= 0x10000) { |
| 3048 *p++ = '\\'; | 3578 *p++ = '\\'; |
| 3049 *p++ = 'U'; | 3579 *p++ = 'U'; |
| 3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; | 3580 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; |
| 3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; | 3581 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; |
| 3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; | 3582 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; |
| 3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; | 3583 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; |
| 3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; | 3584 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; |
| 3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; | 3585 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; |
| 3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; | 3586 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; |
| 3057 *p++ = hexdigit[ch & 0x0000000F]; | 3587 *p++ = hexdigits[ch & 0x0000000F]; |
| 3058 continue; | 3588 continue; |
| 3059 } | 3589 } |
| 3060 #else | 3590 #else |
| 3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ | 3591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ |
| 3062 else if (ch >= 0xD800 && ch < 0xDC00) { | 3592 else if (ch >= 0xD800 && ch < 0xDC00) { |
| 3063 Py_UNICODE ch2; | 3593 Py_UNICODE ch2; |
| 3064 Py_UCS4 ucs; | 3594 Py_UCS4 ucs; |
| 3065 | 3595 |
| 3066 ch2 = *s++; | 3596 ch2 = *s++; |
| 3067 size--; | 3597 size--; |
| 3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | 3598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
| 3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; | 3599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; |
| 3070 *p++ = '\\'; | 3600 *p++ = '\\'; |
| 3071 *p++ = 'U'; | 3601 *p++ = 'U'; |
| 3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; | 3602 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; |
| 3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; | 3603 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; |
| 3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; | 3604 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; |
| 3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; | 3605 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; |
| 3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; | 3606 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; |
| 3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; | 3607 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; |
| 3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; | 3608 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; |
| 3079 *p++ = hexdigit[ucs & 0x0000000F]; | 3609 *p++ = hexdigits[ucs & 0x0000000F]; |
| 3080 continue; | 3610 continue; |
| 3081 } | 3611 } |
| 3082 /* Fall through: isolated surrogates are copied as-is */ | 3612 /* Fall through: isolated surrogates are copied as-is */ |
| 3083 s--; | 3613 s--; |
| 3084 size++; | 3614 size++; |
| 3085 } | 3615 } |
| 3086 #endif | 3616 #endif |
| 3087 | 3617 |
| 3088 /* Map 16-bit characters to '\uxxxx' */ | 3618 /* Map 16-bit characters to '\uxxxx' */ |
| 3089 if (ch >= 256) { | 3619 if (ch >= 256) { |
| 3090 *p++ = '\\'; | 3620 *p++ = '\\'; |
| 3091 *p++ = 'u'; | 3621 *p++ = 'u'; |
| 3092 *p++ = hexdigit[(ch >> 12) & 0x000F]; | 3622 *p++ = hexdigits[(ch >> 12) & 0x000F]; |
| 3093 *p++ = hexdigit[(ch >> 8) & 0x000F]; | 3623 *p++ = hexdigits[(ch >> 8) & 0x000F]; |
| 3094 *p++ = hexdigit[(ch >> 4) & 0x000F]; | 3624 *p++ = hexdigits[(ch >> 4) & 0x000F]; |
| 3095 *p++ = hexdigit[ch & 0x000F]; | 3625 *p++ = hexdigits[ch & 0x000F]; |
| 3096 } | 3626 } |
| 3097 | 3627 |
| 3098 /* Map special whitespace to '\t', \n', '\r' */ | 3628 /* Map special whitespace to '\t', \n', '\r' */ |
| 3099 else if (ch == '\t') { | 3629 else if (ch == '\t') { |
| 3100 *p++ = '\\'; | 3630 *p++ = '\\'; |
| 3101 *p++ = 't'; | 3631 *p++ = 't'; |
| 3102 } | 3632 } |
| 3103 else if (ch == '\n') { | 3633 else if (ch == '\n') { |
| 3104 *p++ = '\\'; | 3634 *p++ = '\\'; |
| 3105 *p++ = 'n'; | 3635 *p++ = 'n'; |
| 3106 } | 3636 } |
| 3107 else if (ch == '\r') { | 3637 else if (ch == '\r') { |
| 3108 *p++ = '\\'; | 3638 *p++ = '\\'; |
| 3109 *p++ = 'r'; | 3639 *p++ = 'r'; |
| 3110 } | 3640 } |
| 3111 | 3641 |
| 3112 /* Map non-printable US ASCII to '\xhh' */ | 3642 /* Map non-printable US ASCII to '\xhh' */ |
| 3113 else if (ch < ' ' || ch >= 0x7F) { | 3643 else if (ch < ' ' || ch >= 0x7F) { |
| 3114 *p++ = '\\'; | 3644 *p++ = '\\'; |
| 3115 *p++ = 'x'; | 3645 *p++ = 'x'; |
| 3116 *p++ = hexdigit[(ch >> 4) & 0x000F]; | 3646 *p++ = hexdigits[(ch >> 4) & 0x000F]; |
| 3117 *p++ = hexdigit[ch & 0x000F]; | 3647 *p++ = hexdigits[ch & 0x000F]; |
| 3118 } | 3648 } |
| 3119 | 3649 |
| 3120 /* Copy everything else as-is */ | 3650 /* Copy everything else as-is */ |
| 3121 else | 3651 else |
| 3122 *p++ = (char) ch; | 3652 *p++ = (char) ch; |
| 3123 } | 3653 } |
| 3124 if (quotes) | 3654 |
| 3125 *p++ = PyString_AS_STRING(repr)[1]; | 3655 assert(p - PyBytes_AS_STRING(repr) > 0); |
| 3126 | 3656 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) |
| 3127 *p = '\0'; | |
| 3128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) | |
| 3129 return NULL; | 3657 return NULL; |
| 3130 return repr; | 3658 return repr; |
| 3131 } | 3659 } |
| 3132 | 3660 |
| 3133 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, | |
| 3134 Py_ssize_t size) | |
| 3135 { | |
| 3136 return unicodeescape_string(s, size, 0); | |
| 3137 } | |
| 3138 | |
| 3139 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) | 3661 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) |
| 3140 { | 3662 { |
| 3663 PyObject *s; |
| 3141 if (!PyUnicode_Check(unicode)) { | 3664 if (!PyUnicode_Check(unicode)) { |
| 3142 PyErr_BadArgument(); | 3665 PyErr_BadArgument(); |
| 3143 return NULL; | 3666 return NULL; |
| 3144 } | 3667 } |
| 3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), | 3668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), |
| 3146 PyUnicode_GET_SIZE(unicode)); | 3669 PyUnicode_GET_SIZE(unicode)); |
| 3670 return s; |
| 3147 } | 3671 } |
| 3148 | 3672 |
| 3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */ | 3673 /* --- Raw Unicode Escape Codec ------------------------------------------- */ |
| 3150 | 3674 |
| 3151 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | 3675 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, |
| 3152 Py_ssize_t size, | 3676 Py_ssize_t size, |
| 3153 const char *errors) | 3677 const char *errors) |
| 3154 { | 3678 { |
| 3155 const char *starts = s; | 3679 const char *starts = s; |
| 3156 Py_ssize_t startinpos; | 3680 Py_ssize_t startinpos; |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3201 continue; | 3725 continue; |
| 3202 } | 3726 } |
| 3203 p--; | 3727 p--; |
| 3204 count = *s=='u' ? 4 : 8; | 3728 count = *s=='u' ? 4 : 8; |
| 3205 s++; | 3729 s++; |
| 3206 | 3730 |
| 3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ | 3731 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ |
| 3208 outpos = p-PyUnicode_AS_UNICODE(v); | 3732 outpos = p-PyUnicode_AS_UNICODE(v); |
| 3209 for (x = 0, i = 0; i < count; ++i, ++s) { | 3733 for (x = 0, i = 0; i < count; ++i, ++s) { |
| 3210 c = (unsigned char)*s; | 3734 c = (unsigned char)*s; |
| 3211 if (!isxdigit(c)) { | 3735 if (!ISXDIGIT(c)) { |
| 3212 endinpos = s-starts; | 3736 endinpos = s-starts; |
| 3213 if (unicode_decode_call_errorhandler( | 3737 if (unicode_decode_call_errorhandler( |
| 3214 errors, &errorHandler, | 3738 errors, &errorHandler, |
| 3215 "rawunicodeescape", "truncated \\uXXXX", | 3739 "rawunicodeescape", "truncated \\uXXXX", |
| 3216 starts, size, &startinpos, &endinpos, &exc, &s, | 3740 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 3217 &v, &outpos, &p)) | 3741 &v, &outpos, &p)) |
| 3218 goto onError; | 3742 goto onError; |
| 3219 goto nextByte; | 3743 goto nextByte; |
| 3220 } | 3744 } |
| 3221 x = (x<<4) & ~0xF; | 3745 x = (x<<4) & ~0xF; |
| 3222 if (c >= '0' && c <= '9') | 3746 if (c >= '0' && c <= '9') |
| 3223 x += c - '0'; | 3747 x += c - '0'; |
| 3224 else if (c >= 'a' && c <= 'f') | 3748 else if (c >= 'a' && c <= 'f') |
| 3225 x += 10 + c - 'a'; | 3749 x += 10 + c - 'a'; |
| 3226 else | 3750 else |
| (...skipping 11 matching lines...) Expand all Loading... |
| 3238 x -= 0x10000L; | 3762 x -= 0x10000L; |
| 3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); | 3763 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); |
| 3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); | 3764 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); |
| 3241 #endif | 3765 #endif |
| 3242 } else { | 3766 } else { |
| 3243 endinpos = s-starts; | 3767 endinpos = s-starts; |
| 3244 outpos = p-PyUnicode_AS_UNICODE(v); | 3768 outpos = p-PyUnicode_AS_UNICODE(v); |
| 3245 if (unicode_decode_call_errorhandler( | 3769 if (unicode_decode_call_errorhandler( |
| 3246 errors, &errorHandler, | 3770 errors, &errorHandler, |
| 3247 "rawunicodeescape", "\\Uxxxxxxxx out of range", | 3771 "rawunicodeescape", "\\Uxxxxxxxx out of range", |
| 3248 starts, size, &startinpos, &endinpos, &exc, &s, | 3772 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 3249 &v, &outpos, &p)) | 3773 &v, &outpos, &p)) |
| 3250 goto onError; | 3774 goto onError; |
| 3251 } | 3775 } |
| 3252 nextByte: | 3776 nextByte: |
| 3253 ; | 3777 ; |
| 3254 } | 3778 } |
| 3255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) | 3779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
| 3256 goto onError; | 3780 goto onError; |
| 3257 Py_XDECREF(errorHandler); | 3781 Py_XDECREF(errorHandler); |
| 3258 Py_XDECREF(exc); | 3782 Py_XDECREF(exc); |
| 3259 return (PyObject *)v; | 3783 return (PyObject *)v; |
| 3260 | 3784 |
| 3261 onError: | 3785 onError: |
| 3262 Py_XDECREF(v); | 3786 Py_XDECREF(v); |
| 3263 Py_XDECREF(errorHandler); | 3787 Py_XDECREF(errorHandler); |
| 3264 Py_XDECREF(exc); | 3788 Py_XDECREF(exc); |
| 3265 return NULL; | 3789 return NULL; |
| 3266 } | 3790 } |
| 3267 | 3791 |
| 3268 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, | 3792 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, |
| 3269 Py_ssize_t size) | 3793 Py_ssize_t size) |
| 3270 { | 3794 { |
| 3271 PyObject *repr; | 3795 PyObject *repr; |
| 3272 char *p; | 3796 char *p; |
| 3273 char *q; | 3797 char *q; |
| 3274 | 3798 |
| 3275 static const char *hexdigit = "0123456789abcdef"; | |
| 3276 #ifdef Py_UNICODE_WIDE | 3799 #ifdef Py_UNICODE_WIDE |
| 3277 const Py_ssize_t expandsize = 10; | 3800 const Py_ssize_t expandsize = 10; |
| 3278 #else | 3801 #else |
| 3279 const Py_ssize_t expandsize = 6; | 3802 const Py_ssize_t expandsize = 6; |
| 3280 #endif | 3803 #endif |
| 3281 | 3804 |
| 3282 if (size > PY_SSIZE_T_MAX / expandsize) | 3805 if (size > PY_SSIZE_T_MAX / expandsize) |
| 3283 return PyErr_NoMemory(); | 3806 return PyErr_NoMemory(); |
| 3284 | 3807 |
| 3285 repr = PyString_FromStringAndSize(NULL, expandsize * size); | 3808 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); |
| 3286 if (repr == NULL) | 3809 if (repr == NULL) |
| 3287 return NULL; | 3810 return NULL; |
| 3288 if (size == 0) | 3811 if (size == 0) |
| 3289 return repr; | 3812 return repr; |
| 3290 | 3813 |
| 3291 p = q = PyString_AS_STRING(repr); | 3814 p = q = PyBytes_AS_STRING(repr); |
| 3292 while (size-- > 0) { | 3815 while (size-- > 0) { |
| 3293 Py_UNICODE ch = *s++; | 3816 Py_UNICODE ch = *s++; |
| 3294 #ifdef Py_UNICODE_WIDE | 3817 #ifdef Py_UNICODE_WIDE |
| 3295 /* Map 32-bit characters to '\Uxxxxxxxx' */ | 3818 /* Map 32-bit characters to '\Uxxxxxxxx' */ |
| 3296 if (ch >= 0x10000) { | 3819 if (ch >= 0x10000) { |
| 3297 *p++ = '\\'; | 3820 *p++ = '\\'; |
| 3298 *p++ = 'U'; | 3821 *p++ = 'U'; |
| 3299 *p++ = hexdigit[(ch >> 28) & 0xf]; | 3822 *p++ = hexdigits[(ch >> 28) & 0xf]; |
| 3300 *p++ = hexdigit[(ch >> 24) & 0xf]; | 3823 *p++ = hexdigits[(ch >> 24) & 0xf]; |
| 3301 *p++ = hexdigit[(ch >> 20) & 0xf]; | 3824 *p++ = hexdigits[(ch >> 20) & 0xf]; |
| 3302 *p++ = hexdigit[(ch >> 16) & 0xf]; | 3825 *p++ = hexdigits[(ch >> 16) & 0xf]; |
| 3303 *p++ = hexdigit[(ch >> 12) & 0xf]; | 3826 *p++ = hexdigits[(ch >> 12) & 0xf]; |
| 3304 *p++ = hexdigit[(ch >> 8) & 0xf]; | 3827 *p++ = hexdigits[(ch >> 8) & 0xf]; |
| 3305 *p++ = hexdigit[(ch >> 4) & 0xf]; | 3828 *p++ = hexdigits[(ch >> 4) & 0xf]; |
| 3306 *p++ = hexdigit[ch & 15]; | 3829 *p++ = hexdigits[ch & 15]; |
| 3307 } | 3830 } |
| 3308 else | 3831 else |
| 3309 #else | 3832 #else |
| 3310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ | 3833 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ |
| 3311 if (ch >= 0xD800 && ch < 0xDC00) { | 3834 if (ch >= 0xD800 && ch < 0xDC00) { |
| 3312 Py_UNICODE ch2; | 3835 Py_UNICODE ch2; |
| 3313 Py_UCS4 ucs; | 3836 Py_UCS4 ucs; |
| 3314 | 3837 |
| 3315 ch2 = *s++; | 3838 ch2 = *s++; |
| 3316 size--; | 3839 size--; |
| 3317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | 3840 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
| 3318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; | 3841 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; |
| 3319 *p++ = '\\'; | 3842 *p++ = '\\'; |
| 3320 *p++ = 'U'; | 3843 *p++ = 'U'; |
| 3321 *p++ = hexdigit[(ucs >> 28) & 0xf]; | 3844 *p++ = hexdigits[(ucs >> 28) & 0xf]; |
| 3322 *p++ = hexdigit[(ucs >> 24) & 0xf]; | 3845 *p++ = hexdigits[(ucs >> 24) & 0xf]; |
| 3323 *p++ = hexdigit[(ucs >> 20) & 0xf]; | 3846 *p++ = hexdigits[(ucs >> 20) & 0xf]; |
| 3324 *p++ = hexdigit[(ucs >> 16) & 0xf]; | 3847 *p++ = hexdigits[(ucs >> 16) & 0xf]; |
| 3325 *p++ = hexdigit[(ucs >> 12) & 0xf]; | 3848 *p++ = hexdigits[(ucs >> 12) & 0xf]; |
| 3326 *p++ = hexdigit[(ucs >> 8) & 0xf]; | 3849 *p++ = hexdigits[(ucs >> 8) & 0xf]; |
| 3327 *p++ = hexdigit[(ucs >> 4) & 0xf]; | 3850 *p++ = hexdigits[(ucs >> 4) & 0xf]; |
| 3328 *p++ = hexdigit[ucs & 0xf]; | 3851 *p++ = hexdigits[ucs & 0xf]; |
| 3329 continue; | 3852 continue; |
| 3330 } | 3853 } |
| 3331 /* Fall through: isolated surrogates are copied as-is */ | 3854 /* Fall through: isolated surrogates are copied as-is */ |
| 3332 s--; | 3855 s--; |
| 3333 size++; | 3856 size++; |
| 3334 } | 3857 } |
| 3335 #endif | 3858 #endif |
| 3336 /* Map 16-bit characters to '\uxxxx' */ | 3859 /* Map 16-bit characters to '\uxxxx' */ |
| 3337 if (ch >= 256) { | 3860 if (ch >= 256) { |
| 3338 *p++ = '\\'; | 3861 *p++ = '\\'; |
| 3339 *p++ = 'u'; | 3862 *p++ = 'u'; |
| 3340 *p++ = hexdigit[(ch >> 12) & 0xf]; | 3863 *p++ = hexdigits[(ch >> 12) & 0xf]; |
| 3341 *p++ = hexdigit[(ch >> 8) & 0xf]; | 3864 *p++ = hexdigits[(ch >> 8) & 0xf]; |
| 3342 *p++ = hexdigit[(ch >> 4) & 0xf]; | 3865 *p++ = hexdigits[(ch >> 4) & 0xf]; |
| 3343 *p++ = hexdigit[ch & 15]; | 3866 *p++ = hexdigits[ch & 15]; |
| 3344 } | 3867 } |
| 3345 /* Copy everything else as-is */ | 3868 /* Copy everything else as-is */ |
| 3346 else | 3869 else |
| 3347 *p++ = (char) ch; | 3870 *p++ = (char) ch; |
| 3348 } | 3871 } |
| 3349 *p = '\0'; | 3872 size = p - q; |
| 3350 if (_PyString_Resize(&repr, p - q)) | 3873 |
| 3874 assert(size > 0); |
| 3875 if (_PyBytes_Resize(&repr, size) < 0) |
| 3351 return NULL; | 3876 return NULL; |
| 3352 return repr; | 3877 return repr; |
| 3353 } | 3878 } |
| 3354 | 3879 |
| 3355 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) | 3880 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) |
| 3356 { | 3881 { |
| 3882 PyObject *s; |
| 3357 if (!PyUnicode_Check(unicode)) { | 3883 if (!PyUnicode_Check(unicode)) { |
| 3358 PyErr_BadArgument(); | 3884 PyErr_BadArgument(); |
| 3359 return NULL; | 3885 return NULL; |
| 3360 } | 3886 } |
| 3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), | 3887 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), |
| 3362 PyUnicode_GET_SIZE(unicode)); | 3888 PyUnicode_GET_SIZE(unicode)); |
| 3889 |
| 3890 return s; |
| 3363 } | 3891 } |
| 3364 | 3892 |
| 3365 /* --- Unicode Internal Codec ------------------------------------------- */ | 3893 /* --- Unicode Internal Codec ------------------------------------------- */ |
| 3366 | 3894 |
| 3367 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, | 3895 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, |
| 3368 Py_ssize_t size, | 3896 Py_ssize_t size, |
| 3369 const char *errors) | 3897 const char *errors) |
| 3370 { | 3898 { |
| 3371 const char *starts = s; | 3899 const char *starts = s; |
| 3372 Py_ssize_t startinpos; | 3900 Py_ssize_t startinpos; |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3409 reason = "truncated input"; | 3937 reason = "truncated input"; |
| 3410 } | 3938 } |
| 3411 else { | 3939 else { |
| 3412 endinpos = s - starts + Py_UNICODE_SIZE; | 3940 endinpos = s - starts + Py_UNICODE_SIZE; |
| 3413 reason = "illegal code point (> 0x10FFFF)"; | 3941 reason = "illegal code point (> 0x10FFFF)"; |
| 3414 } | 3942 } |
| 3415 outpos = p - PyUnicode_AS_UNICODE(v); | 3943 outpos = p - PyUnicode_AS_UNICODE(v); |
| 3416 if (unicode_decode_call_errorhandler( | 3944 if (unicode_decode_call_errorhandler( |
| 3417 errors, &errorHandler, | 3945 errors, &errorHandler, |
| 3418 "unicode_internal", reason, | 3946 "unicode_internal", reason, |
| 3419 starts, size, &startinpos, &endinpos, &exc, &s, | 3947 &starts, &end, &startinpos, &endinpos, &exc, &s, |
| 3420 &v, &outpos, &p)) { | 3948 &v, &outpos, &p)) { |
| 3421 goto onError; | 3949 goto onError; |
| 3422 } | 3950 } |
| 3423 } | 3951 } |
| 3424 else { | 3952 else { |
| 3425 p++; | 3953 p++; |
| 3426 s += Py_UNICODE_SIZE; | 3954 s += Py_UNICODE_SIZE; |
| 3427 } | 3955 } |
| 3428 } | 3956 } |
| 3429 | 3957 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 3441 } | 3969 } |
| 3442 | 3970 |
| 3443 /* --- Latin-1 Codec ------------------------------------------------------ */ | 3971 /* --- Latin-1 Codec ------------------------------------------------------ */ |
| 3444 | 3972 |
| 3445 PyObject *PyUnicode_DecodeLatin1(const char *s, | 3973 PyObject *PyUnicode_DecodeLatin1(const char *s, |
| 3446 Py_ssize_t size, | 3974 Py_ssize_t size, |
| 3447 const char *errors) | 3975 const char *errors) |
| 3448 { | 3976 { |
| 3449 PyUnicodeObject *v; | 3977 PyUnicodeObject *v; |
| 3450 Py_UNICODE *p; | 3978 Py_UNICODE *p; |
| 3979 const char *e, *unrolled_end; |
| 3451 | 3980 |
| 3452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ | 3981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ |
| 3453 if (size == 1) { | 3982 if (size == 1) { |
| 3454 Py_UNICODE r = *(unsigned char*)s; | 3983 Py_UNICODE r = *(unsigned char*)s; |
| 3455 return PyUnicode_FromUnicode(&r, 1); | 3984 return PyUnicode_FromUnicode(&r, 1); |
| 3456 } | 3985 } |
| 3457 | 3986 |
| 3458 v = _PyUnicode_New(size); | 3987 v = _PyUnicode_New(size); |
| 3459 if (v == NULL) | 3988 if (v == NULL) |
| 3460 goto onError; | 3989 goto onError; |
| 3461 if (size == 0) | 3990 if (size == 0) |
| 3462 return (PyObject *)v; | 3991 return (PyObject *)v; |
| 3463 p = PyUnicode_AS_UNICODE(v); | 3992 p = PyUnicode_AS_UNICODE(v); |
| 3464 while (size-- > 0) | 3993 e = s + size; |
| 3465 *p++ = (unsigned char)*s++; | 3994 /* Unrolling the copy makes it much faster by reducing the looping |
| 3995 overhead. This is similar to what many memcpy() implementations do. */ |
| 3996 unrolled_end = e - 4; |
| 3997 while (s < unrolled_end) { |
| 3998 p[0] = (unsigned char) s[0]; |
| 3999 p[1] = (unsigned char) s[1]; |
| 4000 p[2] = (unsigned char) s[2]; |
| 4001 p[3] = (unsigned char) s[3]; |
| 4002 s += 4; |
| 4003 p += 4; |
| 4004 } |
| 4005 while (s < e) |
| 4006 *p++ = (unsigned char) *s++; |
| 3466 return (PyObject *)v; | 4007 return (PyObject *)v; |
| 3467 | 4008 |
| 3468 onError: | 4009 onError: |
| 3469 Py_XDECREF(v); | 4010 Py_XDECREF(v); |
| 3470 return NULL; | 4011 return NULL; |
| 3471 } | 4012 } |
| 3472 | 4013 |
| 3473 /* create or adjust a UnicodeEncodeError */ | 4014 /* create or adjust a UnicodeEncodeError */ |
| 3474 static void make_encode_exception(PyObject **exceptionObject, | 4015 static void make_encode_exception(PyObject **exceptionObject, |
| 3475 const char *encoding, | 4016 const char *encoding, |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3512 build arguments, call the callback and check the arguments, | 4053 build arguments, call the callback and check the arguments, |
| 3513 put the result into newpos and return the replacement string, which | 4054 put the result into newpos and return the replacement string, which |
| 3514 has to be freed by the caller */ | 4055 has to be freed by the caller */ |
| 3515 static PyObject *unicode_encode_call_errorhandler(const char *errors, | 4056 static PyObject *unicode_encode_call_errorhandler(const char *errors, |
| 3516 PyObject **errorHandler, | 4057 PyObject **errorHandler, |
| 3517 const char *encoding, const ch
ar *reason, | 4058 const char *encoding, const ch
ar *reason, |
| 3518 const Py_UNICODE *unicode, Py_
ssize_t size, PyObject **exceptionObject, | 4059 const Py_UNICODE *unicode, Py_
ssize_t size, PyObject **exceptionObject, |
| 3519 Py_ssize_t startpos, Py_ssize_
t endpos, | 4060 Py_ssize_t startpos, Py_ssize_
t endpos, |
| 3520 Py_ssize_t *newpos) | 4061 Py_ssize_t *newpos) |
| 3521 { | 4062 { |
| 3522 static char *argparse = "O!n;encoding error handler must return (unicode, in
t) tuple"; | 4063 static char *argparse = "On;encoding error handler must return (str/bytes, i
nt) tuple"; |
| 3523 | 4064 |
| 3524 PyObject *restuple; | 4065 PyObject *restuple; |
| 3525 PyObject *resunicode; | 4066 PyObject *resunicode; |
| 3526 | 4067 |
| 3527 if (*errorHandler == NULL) { | 4068 if (*errorHandler == NULL) { |
| 3528 *errorHandler = PyCodec_LookupError(errors); | 4069 *errorHandler = PyCodec_LookupError(errors); |
| 3529 if (*errorHandler == NULL) | 4070 if (*errorHandler == NULL) |
| 3530 return NULL; | 4071 return NULL; |
| 3531 } | 4072 } |
| 3532 | 4073 |
| 3533 make_encode_exception(exceptionObject, | 4074 make_encode_exception(exceptionObject, |
| 3534 encoding, unicode, size, startpos, endpos, reason); | 4075 encoding, unicode, size, startpos, endpos, reason); |
| 3535 if (*exceptionObject == NULL) | 4076 if (*exceptionObject == NULL) |
| 3536 return NULL; | 4077 return NULL; |
| 3537 | 4078 |
| 3538 restuple = PyObject_CallFunctionObjArgs( | 4079 restuple = PyObject_CallFunctionObjArgs( |
| 3539 *errorHandler, *exceptionObject, NULL); | 4080 *errorHandler, *exceptionObject, NULL); |
| 3540 if (restuple == NULL) | 4081 if (restuple == NULL) |
| 3541 return NULL; | 4082 return NULL; |
| 3542 if (!PyTuple_Check(restuple)) { | 4083 if (!PyTuple_Check(restuple)) { |
| 3543 PyErr_SetString(PyExc_TypeError, &argparse[4]); | 4084 PyErr_SetString(PyExc_TypeError, &argparse[3]); |
| 3544 Py_DECREF(restuple); | 4085 Py_DECREF(restuple); |
| 3545 return NULL; | 4086 return NULL; |
| 3546 } | 4087 } |
| 3547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, | 4088 if (!PyArg_ParseTuple(restuple, argparse, |
| 3548 &resunicode, newpos)) { | 4089 &resunicode, newpos)) { |
| 4090 Py_DECREF(restuple); |
| 4091 return NULL; |
| 4092 } |
| 4093 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { |
| 4094 PyErr_SetString(PyExc_TypeError, &argparse[3]); |
| 3549 Py_DECREF(restuple); | 4095 Py_DECREF(restuple); |
| 3550 return NULL; | 4096 return NULL; |
| 3551 } | 4097 } |
| 3552 if (*newpos<0) | 4098 if (*newpos<0) |
| 3553 *newpos = size+*newpos; | 4099 *newpos = size+*newpos; |
| 3554 if (*newpos<0 || *newpos>size) { | 4100 if (*newpos<0 || *newpos>size) { |
| 3555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b
ounds", *newpos); | 4101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of b
ounds", *newpos); |
| 3556 Py_DECREF(restuple); | 4102 Py_DECREF(restuple); |
| 3557 return NULL; | 4103 return NULL; |
| 3558 } | 4104 } |
| (...skipping 10 matching lines...) Expand all Loading... |
| 3569 /* output object */ | 4115 /* output object */ |
| 3570 PyObject *res; | 4116 PyObject *res; |
| 3571 /* pointers to the beginning and end+1 of input */ | 4117 /* pointers to the beginning and end+1 of input */ |
| 3572 const Py_UNICODE *startp = p; | 4118 const Py_UNICODE *startp = p; |
| 3573 const Py_UNICODE *endp = p + size; | 4119 const Py_UNICODE *endp = p + size; |
| 3574 /* pointer to the beginning of the unencodable characters */ | 4120 /* pointer to the beginning of the unencodable characters */ |
| 3575 /* const Py_UNICODE *badp = NULL; */ | 4121 /* const Py_UNICODE *badp = NULL; */ |
| 3576 /* pointer into the output */ | 4122 /* pointer into the output */ |
| 3577 char *str; | 4123 char *str; |
| 3578 /* current output position */ | 4124 /* current output position */ |
| 3579 Py_ssize_t respos = 0; | |
| 3580 Py_ssize_t ressize; | 4125 Py_ssize_t ressize; |
| 3581 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; | 4126 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; |
| 3582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal
not in range(128)"; | 4127 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal
not in range(128)"; |
| 3583 PyObject *errorHandler = NULL; | 4128 PyObject *errorHandler = NULL; |
| 3584 PyObject *exc = NULL; | 4129 PyObject *exc = NULL; |
| 3585 /* the following variable is used for caching string comparisons | 4130 /* the following variable is used for caching string comparisons |
| 3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharre
freplace */ | 4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharre
freplace */ |
| 3587 int known_errorHandler = -1; | 4132 int known_errorHandler = -1; |
| 3588 | 4133 |
| 3589 /* allocate enough for a simple encoding without | 4134 /* allocate enough for a simple encoding without |
| 3590 replacements, if we need more, we'll resize */ | 4135 replacements, if we need more, we'll resize */ |
| 3591 res = PyString_FromStringAndSize(NULL, size); | 4136 if (size == 0) |
| 4137 return PyBytes_FromStringAndSize(NULL, 0); |
| 4138 res = PyBytes_FromStringAndSize(NULL, size); |
| 3592 if (res == NULL) | 4139 if (res == NULL) |
| 3593 goto onError; | 4140 return NULL; |
| 3594 if (size == 0) | 4141 str = PyBytes_AS_STRING(res); |
| 3595 return res; | |
| 3596 str = PyString_AS_STRING(res); | |
| 3597 ressize = size; | 4142 ressize = size; |
| 3598 | 4143 |
| 3599 while (p<endp) { | 4144 while (p<endp) { |
| 3600 Py_UNICODE c = *p; | 4145 Py_UNICODE c = *p; |
| 3601 | 4146 |
| 3602 /* can we encode this? */ | 4147 /* can we encode this? */ |
| 3603 if (c<limit) { | 4148 if (c<limit) { |
| 3604 /* no overflow check, because we know that the space is enough */ | 4149 /* no overflow check, because we know that the space is enough */ |
| 3605 *str++ = (char)c; | 4150 *str++ = (char)c; |
| 3606 ++p; | 4151 ++p; |
| (...skipping 29 matching lines...) Expand all Loading... |
| 3636 case 1: /* strict */ | 4181 case 1: /* strict */ |
| 3637 raise_encode_exception(&exc, encoding, startp, size, collstart-s
tartp, collend-startp, reason); | 4182 raise_encode_exception(&exc, encoding, startp, size, collstart-s
tartp, collend-startp, reason); |
| 3638 goto onError; | 4183 goto onError; |
| 3639 case 2: /* replace */ | 4184 case 2: /* replace */ |
| 3640 while (collstart++<collend) | 4185 while (collstart++<collend) |
| 3641 *str++ = '?'; /* fall through */ | 4186 *str++ = '?'; /* fall through */ |
| 3642 case 3: /* ignore */ | 4187 case 3: /* ignore */ |
| 3643 p = collend; | 4188 p = collend; |
| 3644 break; | 4189 break; |
| 3645 case 4: /* xmlcharrefreplace */ | 4190 case 4: /* xmlcharrefreplace */ |
| 3646 respos = str-PyString_AS_STRING(res); | 4191 respos = str - PyBytes_AS_STRING(res); |
| 3647 /* determine replacement size (temporarily (mis)uses p) */ | 4192 /* determine replacement size (temporarily (mis)uses p) */ |
| 3648 for (p = collstart, repsize = 0; p < collend; ++p) { | 4193 for (p = collstart, repsize = 0; p < collend; ++p) { |
| 3649 if (*p<10) | 4194 if (*p<10) |
| 3650 repsize += 2+1+1; | 4195 repsize += 2+1+1; |
| 3651 else if (*p<100) | 4196 else if (*p<100) |
| 3652 repsize += 2+2+1; | 4197 repsize += 2+2+1; |
| 3653 else if (*p<1000) | 4198 else if (*p<1000) |
| 3654 repsize += 2+3+1; | 4199 repsize += 2+3+1; |
| 3655 else if (*p<10000) | 4200 else if (*p<10000) |
| 3656 repsize += 2+4+1; | 4201 repsize += 2+4+1; |
| 3657 #ifndef Py_UNICODE_WIDE | 4202 #ifndef Py_UNICODE_WIDE |
| 3658 else | 4203 else |
| 3659 repsize += 2+5+1; | 4204 repsize += 2+5+1; |
| 3660 #else | 4205 #else |
| 3661 else if (*p<100000) | 4206 else if (*p<100000) |
| 3662 repsize += 2+5+1; | 4207 repsize += 2+5+1; |
| 3663 else if (*p<1000000) | 4208 else if (*p<1000000) |
| 3664 repsize += 2+6+1; | 4209 repsize += 2+6+1; |
| 3665 else | 4210 else |
| 3666 repsize += 2+7+1; | 4211 repsize += 2+7+1; |
| 3667 #endif | 4212 #endif |
| 3668 } | 4213 } |
| 3669 requiredsize = respos+repsize+(endp-collend); | 4214 requiredsize = respos+repsize+(endp-collend); |
| 3670 if (requiredsize > ressize) { | 4215 if (requiredsize > ressize) { |
| 3671 if (requiredsize<2*ressize) | 4216 if (requiredsize<2*ressize) |
| 3672 requiredsize = 2*ressize; | 4217 requiredsize = 2*ressize; |
| 3673 if (_PyString_Resize(&res, requiredsize)) | 4218 if (_PyBytes_Resize(&res, requiredsize)) |
| 3674 goto onError; | 4219 goto onError; |
| 3675 str = PyString_AS_STRING(res) + respos; | 4220 str = PyBytes_AS_STRING(res) + respos; |
| 3676 ressize = requiredsize; | 4221 ressize = requiredsize; |
| 3677 } | 4222 } |
| 3678 /* generate replacement (temporarily (mis)uses p) */ | 4223 /* generate replacement (temporarily (mis)uses p) */ |
| 3679 for (p = collstart; p < collend; ++p) { | 4224 for (p = collstart; p < collend; ++p) { |
| 3680 str += sprintf(str, "&#%d;", (int)*p); | 4225 str += sprintf(str, "&#%d;", (int)*p); |
| 3681 } | 4226 } |
| 3682 p = collend; | 4227 p = collend; |
| 3683 break; | 4228 break; |
| 3684 default: | 4229 default: |
| 3685 repunicode = unicode_encode_call_errorhandler(errors, &errorHand
ler, | 4230 repunicode = unicode_encode_call_errorhandler(errors, &errorHand
ler, |
| 3686 encoding, reason,
startp, size, &exc, | 4231 encoding, reason,
startp, size, &exc, |
| 3687 collstart-startp,
collend-startp, &newpos); | 4232 collstart-startp,
collend-startp, &newpos); |
| 3688 if (repunicode == NULL) | 4233 if (repunicode == NULL) |
| 3689 goto onError; | 4234 goto onError; |
| 3690 /* need more space? (at least enough for what we have+the | 4235 if (PyBytes_Check(repunicode)) { |
| 3691 replacement+the rest of the string, so we won't have to | 4236 /* Directly copy bytes result to output. */ |
| 3692 check space for encodable characters) */ | 4237 repsize = PyBytes_Size(repunicode); |
| 3693 respos = str-PyString_AS_STRING(res); | 4238 if (repsize > 1) { |
| 4239 /* Make room for all additional bytes. */ |
| 4240 respos = str - PyBytes_AS_STRING(res); |
| 4241 if (_PyBytes_Resize(&res, ressize+repsize-1)) { |
| 4242 Py_DECREF(repunicode); |
| 4243 goto onError; |
| 4244 } |
| 4245 str = PyBytes_AS_STRING(res) + respos; |
| 4246 ressize += repsize-1; |
| 4247 } |
| 4248 memcpy(str, PyBytes_AsString(repunicode), repsize); |
| 4249 str += repsize; |
| 4250 p = startp + newpos; |
| 4251 Py_DECREF(repunicode); |
| 4252 break; |
| 4253 } |
| 4254 /* need more space? (at least enough for what we |
| 4255 have+the replacement+the rest of the string, so |
| 4256 we won't have to check space for encodable characters) */ |
| 4257 respos = str - PyBytes_AS_STRING(res); |
| 3694 repsize = PyUnicode_GET_SIZE(repunicode); | 4258 repsize = PyUnicode_GET_SIZE(repunicode); |
| 3695 requiredsize = respos+repsize+(endp-collend); | 4259 requiredsize = respos+repsize+(endp-collend); |
| 3696 if (requiredsize > ressize) { | 4260 if (requiredsize > ressize) { |
| 3697 if (requiredsize<2*ressize) | 4261 if (requiredsize<2*ressize) |
| 3698 requiredsize = 2*ressize; | 4262 requiredsize = 2*ressize; |
| 3699 if (_PyString_Resize(&res, requiredsize)) { | 4263 if (_PyBytes_Resize(&res, requiredsize)) { |
| 3700 Py_DECREF(repunicode); | 4264 Py_DECREF(repunicode); |
| 3701 goto onError; | 4265 goto onError; |
| 3702 } | 4266 } |
| 3703 str = PyString_AS_STRING(res) + respos; | 4267 str = PyBytes_AS_STRING(res) + respos; |
| 3704 ressize = requiredsize; | 4268 ressize = requiredsize; |
| 3705 } | 4269 } |
| 3706 /* check if there is anything unencodable in the replacement | 4270 /* check if there is anything unencodable in the replacement |
| 3707 and copy it to the output */ | 4271 and copy it to the output */ |
| 3708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2
, ++str) { | 4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2
, ++str) { |
| 3709 c = *uni2; | 4273 c = *uni2; |
| 3710 if (c >= limit) { | 4274 if (c >= limit) { |
| 3711 raise_encode_exception(&exc, encoding, startp, size, | 4275 raise_encode_exception(&exc, encoding, startp, size, |
| 3712 unicodepos, unicodepos+1, reason)
; | 4276 unicodepos, unicodepos+1, reason)
; |
| 3713 Py_DECREF(repunicode); | 4277 Py_DECREF(repunicode); |
| 3714 goto onError; | 4278 goto onError; |
| 3715 } | 4279 } |
| 3716 *str = (char)c; | 4280 *str = (char)c; |
| 3717 } | 4281 } |
| 3718 p = startp + newpos; | 4282 p = startp + newpos; |
| 3719 Py_DECREF(repunicode); | 4283 Py_DECREF(repunicode); |
| 3720 } | 4284 } |
| 3721 } | 4285 } |
| 3722 } | 4286 } |
| 3723 /* Resize if we allocated to much */ | 4287 /* Resize if we allocated to much */ |
| 3724 respos = str-PyString_AS_STRING(res); | 4288 size = str - PyBytes_AS_STRING(res); |
| 3725 if (respos<ressize) | 4289 if (size < ressize) { /* If this falls res will be NULL */ |
| 3726 /* If this falls res will be NULL */ | 4290 assert(size >= 0); |
| 3727 _PyString_Resize(&res, respos); | 4291 if (_PyBytes_Resize(&res, size) < 0) |
| 4292 goto onError; |
| 4293 } |
| 4294 |
| 3728 Py_XDECREF(errorHandler); | 4295 Py_XDECREF(errorHandler); |
| 3729 Py_XDECREF(exc); | 4296 Py_XDECREF(exc); |
| 3730 return res; | 4297 return res; |
| 3731 | 4298 |
| 3732 onError: | 4299 onError: |
| 3733 Py_XDECREF(res); | 4300 Py_XDECREF(res); |
| 3734 Py_XDECREF(errorHandler); | 4301 Py_XDECREF(errorHandler); |
| 3735 Py_XDECREF(exc); | 4302 Py_XDECREF(exc); |
| 3736 return NULL; | 4303 return NULL; |
| 3737 } | 4304 } |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3789 *p++ = c; | 4356 *p++ = c; |
| 3790 ++s; | 4357 ++s; |
| 3791 } | 4358 } |
| 3792 else { | 4359 else { |
| 3793 startinpos = s-starts; | 4360 startinpos = s-starts; |
| 3794 endinpos = startinpos + 1; | 4361 endinpos = startinpos + 1; |
| 3795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); | 4362 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); |
| 3796 if (unicode_decode_call_errorhandler( | 4363 if (unicode_decode_call_errorhandler( |
| 3797 errors, &errorHandler, | 4364 errors, &errorHandler, |
| 3798 "ascii", "ordinal not in range(128)", | 4365 "ascii", "ordinal not in range(128)", |
| 3799 starts, size, &startinpos, &endinpos, &exc, &s, | 4366 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 3800 &v, &outpos, &p)) | 4367 &v, &outpos, &p)) |
| 3801 goto onError; | 4368 goto onError; |
| 3802 } | 4369 } |
| 3803 } | 4370 } |
| 3804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) | 4371 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) |
| 3805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) | 4372 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
| 3806 goto onError; | 4373 goto onError; |
| 3807 Py_XDECREF(errorHandler); | 4374 Py_XDECREF(errorHandler); |
| 3808 Py_XDECREF(exc); | 4375 Py_XDECREF(exc); |
| 3809 return (PyObject *)v; | 4376 return (PyObject *)v; |
| 3810 | 4377 |
| 3811 onError: | 4378 onError: |
| 3812 Py_XDECREF(v); | 4379 Py_XDECREF(v); |
| 3813 Py_XDECREF(errorHandler); | 4380 Py_XDECREF(errorHandler); |
| 3814 Py_XDECREF(exc); | 4381 Py_XDECREF(exc); |
| (...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3972 if (size > 0) { | 4539 if (size > 0) { |
| 3973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); | 4540 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); |
| 3974 if (mbcssize == 0) { | 4541 if (mbcssize == 0) { |
| 3975 PyErr_SetFromWindowsErrWithFilename(0, NULL); | 4542 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
| 3976 return -1; | 4543 return -1; |
| 3977 } | 4544 } |
| 3978 } | 4545 } |
| 3979 | 4546 |
| 3980 if (*repr == NULL) { | 4547 if (*repr == NULL) { |
| 3981 /* Create string object */ | 4548 /* Create string object */ |
| 3982 *repr = PyString_FromStringAndSize(NULL, mbcssize); | 4549 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); |
| 3983 if (*repr == NULL) | 4550 if (*repr == NULL) |
| 3984 return -1; | 4551 return -1; |
| 3985 } | 4552 } |
| 3986 else { | 4553 else { |
| 3987 /* Extend string object */ | 4554 /* Extend string object */ |
| 3988 n = PyString_Size(*repr); | 4555 n = PyBytes_Size(*repr); |
| 3989 if (_PyString_Resize(repr, n + mbcssize) < 0) | 4556 if (_PyBytes_Resize(repr, n + mbcssize) < 0) |
| 3990 return -1; | 4557 return -1; |
| 3991 } | 4558 } |
| 3992 | 4559 |
| 3993 /* Do the conversion */ | 4560 /* Do the conversion */ |
| 3994 if (size > 0) { | 4561 if (size > 0) { |
| 3995 char *s = PyString_AS_STRING(*repr) + n; | 4562 char *s = PyBytes_AS_STRING(*repr) + n; |
| 3996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL
)) { | 4563 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL
)) { |
| 3997 PyErr_SetFromWindowsErrWithFilename(0, NULL); | 4564 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
| 3998 return -1; | 4565 return -1; |
| 3999 } | 4566 } |
| 4000 } | 4567 } |
| 4001 | 4568 |
| 4002 return 0; | 4569 return 0; |
| 4003 } | 4570 } |
| 4004 | 4571 |
| 4005 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, | 4572 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, |
| (...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4090 x = mapstring[ch]; | 4657 x = mapstring[ch]; |
| 4091 | 4658 |
| 4092 if (x == 0xfffe) { | 4659 if (x == 0xfffe) { |
| 4093 /* undefined mapping */ | 4660 /* undefined mapping */ |
| 4094 outpos = p-PyUnicode_AS_UNICODE(v); | 4661 outpos = p-PyUnicode_AS_UNICODE(v); |
| 4095 startinpos = s-starts; | 4662 startinpos = s-starts; |
| 4096 endinpos = startinpos+1; | 4663 endinpos = startinpos+1; |
| 4097 if (unicode_decode_call_errorhandler( | 4664 if (unicode_decode_call_errorhandler( |
| 4098 errors, &errorHandler, | 4665 errors, &errorHandler, |
| 4099 "charmap", "character maps to <undefined>", | 4666 "charmap", "character maps to <undefined>", |
| 4100 starts, size, &startinpos, &endinpos, &exc, &s, | 4667 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 4101 &v, &outpos, &p)) { | 4668 &v, &outpos, &p)) { |
| 4102 goto onError; | 4669 goto onError; |
| 4103 } | 4670 } |
| 4104 continue; | 4671 continue; |
| 4105 } | 4672 } |
| 4106 *p++ = x; | 4673 *p++ = x; |
| 4107 ++s; | 4674 ++s; |
| 4108 } | 4675 } |
| 4109 } | 4676 } |
| 4110 else { | 4677 else { |
| 4111 while (s < e) { | 4678 while (s < e) { |
| 4112 unsigned char ch = *s; | 4679 unsigned char ch = *s; |
| 4113 PyObject *w, *x; | 4680 PyObject *w, *x; |
| 4114 | 4681 |
| 4115 /* Get mapping (char ordinal -> integer, Unicode char or None) */ | 4682 /* Get mapping (char ordinal -> integer, Unicode char or None) */ |
| 4116 w = PyInt_FromLong((long)ch); | 4683 w = PyLong_FromLong((long)ch); |
| 4117 if (w == NULL) | 4684 if (w == NULL) |
| 4118 goto onError; | 4685 goto onError; |
| 4119 x = PyObject_GetItem(mapping, w); | 4686 x = PyObject_GetItem(mapping, w); |
| 4120 Py_DECREF(w); | 4687 Py_DECREF(w); |
| 4121 if (x == NULL) { | 4688 if (x == NULL) { |
| 4122 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
| 4123 /* No mapping found means: mapping is undefined. */ | 4690 /* No mapping found means: mapping is undefined. */ |
| 4124 PyErr_Clear(); | 4691 PyErr_Clear(); |
| 4125 x = Py_None; | 4692 x = Py_None; |
| 4126 Py_INCREF(x); | 4693 Py_INCREF(x); |
| 4127 } else | 4694 } else |
| 4128 goto onError; | 4695 goto onError; |
| 4129 } | 4696 } |
| 4130 | 4697 |
| 4131 /* Apply mapping */ | 4698 /* Apply mapping */ |
| 4132 if (PyInt_Check(x)) { | 4699 if (PyLong_Check(x)) { |
| 4133 long value = PyInt_AS_LONG(x); | 4700 long value = PyLong_AS_LONG(x); |
| 4134 if (value < 0 || value > 65535) { | 4701 if (value < 0 || value > 65535) { |
| 4135 PyErr_SetString(PyExc_TypeError, | 4702 PyErr_SetString(PyExc_TypeError, |
| 4136 "character mapping must be in range(65536)")
; | 4703 "character mapping must be in range(65536)")
; |
| 4137 Py_DECREF(x); | 4704 Py_DECREF(x); |
| 4138 goto onError; | 4705 goto onError; |
| 4139 } | 4706 } |
| 4140 *p++ = (Py_UNICODE)value; | 4707 *p++ = (Py_UNICODE)value; |
| 4141 } | 4708 } |
| 4142 else if (x == Py_None) { | 4709 else if (x == Py_None) { |
| 4143 /* undefined mapping */ | 4710 /* undefined mapping */ |
| 4144 outpos = p-PyUnicode_AS_UNICODE(v); | 4711 outpos = p-PyUnicode_AS_UNICODE(v); |
| 4145 startinpos = s-starts; | 4712 startinpos = s-starts; |
| 4146 endinpos = startinpos+1; | 4713 endinpos = startinpos+1; |
| 4147 if (unicode_decode_call_errorhandler( | 4714 if (unicode_decode_call_errorhandler( |
| 4148 errors, &errorHandler, | 4715 errors, &errorHandler, |
| 4149 "charmap", "character maps to <undefined>", | 4716 "charmap", "character maps to <undefined>", |
| 4150 starts, size, &startinpos, &endinpos, &exc, &s, | 4717 &starts, &e, &startinpos, &endinpos, &exc, &s, |
| 4151 &v, &outpos, &p)) { | 4718 &v, &outpos, &p)) { |
| 4152 Py_DECREF(x); | 4719 Py_DECREF(x); |
| 4153 goto onError; | 4720 goto onError; |
| 4154 } | 4721 } |
| 4155 Py_DECREF(x); | 4722 Py_DECREF(x); |
| 4156 continue; | 4723 continue; |
| 4157 } | 4724 } |
| 4158 else if (PyUnicode_Check(x)) { | 4725 else if (PyUnicode_Check(x)) { |
| 4159 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); | 4726 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); |
| 4160 | 4727 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 4182 PyUnicode_AS_UNICODE(x), | 4749 PyUnicode_AS_UNICODE(x), |
| 4183 targetsize); | 4750 targetsize); |
| 4184 p += targetsize; | 4751 p += targetsize; |
| 4185 extrachars -= targetsize; | 4752 extrachars -= targetsize; |
| 4186 } | 4753 } |
| 4187 /* 1-0 mapping: skip the character */ | 4754 /* 1-0 mapping: skip the character */ |
| 4188 } | 4755 } |
| 4189 else { | 4756 else { |
| 4190 /* wrong return value */ | 4757 /* wrong return value */ |
| 4191 PyErr_SetString(PyExc_TypeError, | 4758 PyErr_SetString(PyExc_TypeError, |
| 4192 "character mapping must return integer, None or
unicode"); | 4759 "character mapping must return integer, None or
str"); |
| 4193 Py_DECREF(x); | 4760 Py_DECREF(x); |
| 4194 goto onError; | 4761 goto onError; |
| 4195 } | 4762 } |
| 4196 Py_DECREF(x); | 4763 Py_DECREF(x); |
| 4197 ++s; | 4764 ++s; |
| 4198 } | 4765 } |
| 4199 } | 4766 } |
| 4200 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) | 4767 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) |
| 4201 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) | 4768 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
| 4202 goto onError; | 4769 goto onError; |
| (...skipping 14 matching lines...) Expand all Loading... |
| 4217 PyObject_HEAD | 4784 PyObject_HEAD |
| 4218 unsigned char level1[32]; | 4785 unsigned char level1[32]; |
| 4219 int count2, count3; | 4786 int count2, count3; |
| 4220 unsigned char level23[1]; | 4787 unsigned char level23[1]; |
| 4221 }; | 4788 }; |
| 4222 | 4789 |
| 4223 static PyObject* | 4790 static PyObject* |
| 4224 encoding_map_size(PyObject *obj, PyObject* args) | 4791 encoding_map_size(PyObject *obj, PyObject* args) |
| 4225 { | 4792 { |
| 4226 struct encoding_map *map = (struct encoding_map*)obj; | 4793 struct encoding_map *map = (struct encoding_map*)obj; |
| 4227 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + | 4794 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + |
| 4228 128*map->count3); | 4795 128*map->count3); |
| 4229 } | 4796 } |
| 4230 | 4797 |
| 4231 static PyMethodDef encoding_map_methods[] = { | 4798 static PyMethodDef encoding_map_methods[] = { |
| 4232 {"size", encoding_map_size, METH_NOARGS, | 4799 {"size", encoding_map_size, METH_NOARGS, |
| 4233 PyDoc_STR("Return the size (in bytes) of this object") }, | 4800 PyDoc_STR("Return the size (in bytes) of this object") }, |
| 4234 { 0 } | 4801 { 0 } |
| 4235 }; | 4802 }; |
| 4236 | 4803 |
| 4237 static void | 4804 static void |
| 4238 encoding_map_dealloc(PyObject* o) | 4805 encoding_map_dealloc(PyObject* o) |
| 4239 { | 4806 { |
| 4240 PyObject_FREE(o); | 4807 PyObject_FREE(o); |
| 4241 } | 4808 } |
| 4242 | 4809 |
| 4243 static PyTypeObject EncodingMapType = { | 4810 static PyTypeObject EncodingMapType = { |
| 4244 PyVarObject_HEAD_INIT(NULL, 0) | 4811 PyVarObject_HEAD_INIT(NULL, 0) |
| 4245 "EncodingMap", /*tp_name*/ | 4812 "EncodingMap", /*tp_name*/ |
| 4246 sizeof(struct encoding_map), /*tp_basicsize*/ | 4813 sizeof(struct encoding_map), /*tp_basicsize*/ |
| 4247 0, /*tp_itemsize*/ | 4814 0, /*tp_itemsize*/ |
| 4248 /* methods */ | 4815 /* methods */ |
| 4249 encoding_map_dealloc, /*tp_dealloc*/ | 4816 encoding_map_dealloc, /*tp_dealloc*/ |
| 4250 0, /*tp_print*/ | 4817 0, /*tp_print*/ |
| 4251 0, /*tp_getattr*/ | 4818 0, /*tp_getattr*/ |
| 4252 0, /*tp_setattr*/ | 4819 0, /*tp_setattr*/ |
| 4253 0, /*tp_compare*/ | 4820 0, /*tp_reserved*/ |
| 4254 0, /*tp_repr*/ | 4821 0, /*tp_repr*/ |
| 4255 0, /*tp_as_number*/ | 4822 0, /*tp_as_number*/ |
| 4256 0, /*tp_as_sequence*/ | 4823 0, /*tp_as_sequence*/ |
| 4257 0, /*tp_as_mapping*/ | 4824 0, /*tp_as_mapping*/ |
| 4258 0, /*tp_hash*/ | 4825 0, /*tp_hash*/ |
| 4259 0, /*tp_call*/ | 4826 0, /*tp_call*/ |
| 4260 0, /*tp_str*/ | 4827 0, /*tp_str*/ |
| 4261 0, /*tp_getattro*/ | 4828 0, /*tp_getattro*/ |
| 4262 0, /*tp_setattro*/ | 4829 0, /*tp_setattro*/ |
| 4263 0, /*tp_as_buffer*/ | 4830 0, /*tp_as_buffer*/ |
| (...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4333 | 4900 |
| 4334 if (count2 >= 0xFF || count3 >= 0xFF) | 4901 if (count2 >= 0xFF || count3 >= 0xFF) |
| 4335 need_dict = 1; | 4902 need_dict = 1; |
| 4336 | 4903 |
| 4337 if (need_dict) { | 4904 if (need_dict) { |
| 4338 PyObject *result = PyDict_New(); | 4905 PyObject *result = PyDict_New(); |
| 4339 PyObject *key, *value; | 4906 PyObject *key, *value; |
| 4340 if (!result) | 4907 if (!result) |
| 4341 return NULL; | 4908 return NULL; |
| 4342 for (i = 0; i < 256; i++) { | 4909 for (i = 0; i < 256; i++) { |
| 4343 value = NULL; | 4910 key = value = NULL; |
| 4344 key = PyInt_FromLong(decode[i]); | 4911 key = PyLong_FromLong(decode[i]); |
| 4345 value = PyInt_FromLong(i); | 4912 value = PyLong_FromLong(i); |
| 4346 if (!key || !value) | 4913 if (!key || !value) |
| 4347 goto failed1; | 4914 goto failed1; |
| 4348 if (PyDict_SetItem(result, key, value) == -1) | 4915 if (PyDict_SetItem(result, key, value) == -1) |
| 4349 goto failed1; | 4916 goto failed1; |
| 4350 Py_DECREF(key); | 4917 Py_DECREF(key); |
| 4351 Py_DECREF(value); | 4918 Py_DECREF(value); |
| 4352 } | 4919 } |
| 4353 return result; | 4920 return result; |
| 4354 failed1: | 4921 failed1: |
| 4355 Py_XDECREF(key); | 4922 Py_XDECREF(key); |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4423 return -1; | 4990 return -1; |
| 4424 } | 4991 } |
| 4425 return i; | 4992 return i; |
| 4426 } | 4993 } |
| 4427 | 4994 |
| 4428 /* Lookup the character ch in the mapping. If the character | 4995 /* Lookup the character ch in the mapping. If the character |
| 4429 can't be found, Py_None is returned (or NULL, if another | 4996 can't be found, Py_None is returned (or NULL, if another |
| 4430 error occurred). */ | 4997 error occurred). */ |
| 4431 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) | 4998 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) |
| 4432 { | 4999 { |
| 4433 PyObject *w = PyInt_FromLong((long)c); | 5000 PyObject *w = PyLong_FromLong((long)c); |
| 4434 PyObject *x; | 5001 PyObject *x; |
| 4435 | 5002 |
| 4436 if (w == NULL) | 5003 if (w == NULL) |
| 4437 return NULL; | 5004 return NULL; |
| 4438 x = PyObject_GetItem(mapping, w); | 5005 x = PyObject_GetItem(mapping, w); |
| 4439 Py_DECREF(w); | 5006 Py_DECREF(w); |
| 4440 if (x == NULL) { | 5007 if (x == NULL) { |
| 4441 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 5008 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
| 4442 /* No mapping found means: mapping is undefined. */ | 5009 /* No mapping found means: mapping is undefined. */ |
| 4443 PyErr_Clear(); | 5010 PyErr_Clear(); |
| 4444 x = Py_None; | 5011 x = Py_None; |
| 4445 Py_INCREF(x); | 5012 Py_INCREF(x); |
| 4446 return x; | 5013 return x; |
| 4447 } else | 5014 } else |
| 4448 return NULL; | 5015 return NULL; |
| 4449 } | 5016 } |
| 4450 else if (x == Py_None) | 5017 else if (x == Py_None) |
| 4451 return x; | 5018 return x; |
| 4452 else if (PyInt_Check(x)) { | 5019 else if (PyLong_Check(x)) { |
| 4453 long value = PyInt_AS_LONG(x); | 5020 long value = PyLong_AS_LONG(x); |
| 4454 if (value < 0 || value > 255) { | 5021 if (value < 0 || value > 255) { |
| 4455 PyErr_SetString(PyExc_TypeError, | 5022 PyErr_SetString(PyExc_TypeError, |
| 4456 "character mapping must be in range(256)"); | 5023 "character mapping must be in range(256)"); |
| 4457 Py_DECREF(x); | 5024 Py_DECREF(x); |
| 4458 return NULL; | 5025 return NULL; |
| 4459 } | 5026 } |
| 4460 return x; | 5027 return x; |
| 4461 } | 5028 } |
| 4462 else if (PyString_Check(x)) | 5029 else if (PyBytes_Check(x)) |
| 4463 return x; | 5030 return x; |
| 4464 else { | 5031 else { |
| 4465 /* wrong return value */ | 5032 /* wrong return value */ |
| 4466 PyErr_SetString(PyExc_TypeError, | 5033 PyErr_Format(PyExc_TypeError, |
| 4467 "character mapping must return integer, None or str"); | 5034 "character mapping must return integer, bytes or None, not
%.400s", |
| 5035 x->ob_type->tp_name); |
| 4468 Py_DECREF(x); | 5036 Py_DECREF(x); |
| 4469 return NULL; | 5037 return NULL; |
| 4470 } | 5038 } |
| 4471 } | 5039 } |
| 4472 | 5040 |
| 4473 static int | 5041 static int |
| 4474 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requireds
ize) | 5042 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requireds
ize) |
| 4475 { | 5043 { |
| 4476 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); | 5044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); |
| 4477 /* exponentially overallocate to minimize reallocations */ | 5045 /* exponentially overallocate to minimize reallocations */ |
| 4478 if (requiredsize < 2*outsize) | 5046 if (requiredsize < 2*outsize) |
| 4479 requiredsize = 2*outsize; | 5047 requiredsize = 2*outsize; |
| 4480 if (_PyString_Resize(outobj, requiredsize)) { | 5048 if (_PyBytes_Resize(outobj, requiredsize)) |
| 4481 return 0; | 5049 return -1; |
| 4482 } | 5050 return 0; |
| 4483 return 1; | |
| 4484 } | 5051 } |
| 4485 | 5052 |
| 4486 typedef enum charmapencode_result { | 5053 typedef enum charmapencode_result { |
| 4487 enc_SUCCESS, enc_FAILED, enc_EXCEPTION | 5054 enc_SUCCESS, enc_FAILED, enc_EXCEPTION |
| 4488 }charmapencode_result; | 5055 }charmapencode_result; |
| 4489 /* lookup the character, put the result in the output string and adjust | 5056 /* lookup the character, put the result in the output string and adjust |
| 4490 various state variables. Reallocate the output string if not enough | 5057 various state variables. Resize the output bytes object if not enough |
| 4491 space is available. Return a new reference to the object that | 5058 space is available. Return a new reference to the object that |
| 4492 was put in the output buffer, or Py_None, if the mapping was undefined | 5059 was put in the output buffer, or Py_None, if the mapping was undefined |
| 4493 (in which case no character was written) or NULL, if a | 5060 (in which case no character was written) or NULL, if a |
| 4494 reallocation error occurred. The caller must decref the result */ | 5061 reallocation error occurred. The caller must decref the result */ |
| 4495 static | 5062 static |
| 4496 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, | 5063 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, |
| 4497 PyObject **outobj, Py_ssize_t *outpos) | 5064 PyObject **outobj, Py_ssize_t *outpos) |
| 4498 { | 5065 { |
| 4499 PyObject *rep; | 5066 PyObject *rep; |
| 4500 char *outstart; | 5067 char *outstart; |
| 4501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); | 5068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); |
| 4502 | 5069 |
| 4503 if (Py_TYPE(mapping) == &EncodingMapType) { | 5070 if (Py_TYPE(mapping) == &EncodingMapType) { |
| 4504 int res = encoding_map_lookup(c, mapping); | 5071 int res = encoding_map_lookup(c, mapping); |
| 4505 Py_ssize_t requiredsize = *outpos+1; | 5072 Py_ssize_t requiredsize = *outpos+1; |
| 4506 if (res == -1) | 5073 if (res == -1) |
| 4507 return enc_FAILED; | 5074 return enc_FAILED; |
| 4508 if (outsize<requiredsize) | 5075 if (outsize<requiredsize) |
| 4509 if (!charmapencode_resize(outobj, outpos, requiredsize)) | 5076 if (charmapencode_resize(outobj, outpos, requiredsize)) |
| 4510 return enc_EXCEPTION; | 5077 return enc_EXCEPTION; |
| 4511 outstart = PyString_AS_STRING(*outobj); | 5078 outstart = PyBytes_AS_STRING(*outobj); |
| 4512 outstart[(*outpos)++] = (char)res; | 5079 outstart[(*outpos)++] = (char)res; |
| 4513 return enc_SUCCESS; | 5080 return enc_SUCCESS; |
| 4514 } | 5081 } |
| 4515 | 5082 |
| 4516 rep = charmapencode_lookup(c, mapping); | 5083 rep = charmapencode_lookup(c, mapping); |
| 4517 if (rep==NULL) | 5084 if (rep==NULL) |
| 4518 return enc_EXCEPTION; | 5085 return enc_EXCEPTION; |
| 4519 else if (rep==Py_None) { | 5086 else if (rep==Py_None) { |
| 4520 Py_DECREF(rep); | 5087 Py_DECREF(rep); |
| 4521 return enc_FAILED; | 5088 return enc_FAILED; |
| 4522 } else { | 5089 } else { |
| 4523 if (PyInt_Check(rep)) { | 5090 if (PyLong_Check(rep)) { |
| 4524 Py_ssize_t requiredsize = *outpos+1; | 5091 Py_ssize_t requiredsize = *outpos+1; |
| 4525 if (outsize<requiredsize) | 5092 if (outsize<requiredsize) |
| 4526 if (!charmapencode_resize(outobj, outpos, requiredsize)) { | 5093 if (charmapencode_resize(outobj, outpos, requiredsize)) { |
| 4527 Py_DECREF(rep); | 5094 Py_DECREF(rep); |
| 4528 return enc_EXCEPTION; | 5095 return enc_EXCEPTION; |
| 4529 } | 5096 } |
| 4530 outstart = PyString_AS_STRING(*outobj); | 5097 outstart = PyBytes_AS_STRING(*outobj); |
| 4531 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); | 5098 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); |
| 4532 } | 5099 } |
| 4533 else { | 5100 else { |
| 4534 const char *repchars = PyString_AS_STRING(rep); | 5101 const char *repchars = PyBytes_AS_STRING(rep); |
| 4535 Py_ssize_t repsize = PyString_GET_SIZE(rep); | 5102 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); |
| 4536 Py_ssize_t requiredsize = *outpos+repsize; | 5103 Py_ssize_t requiredsize = *outpos+repsize; |
| 4537 if (outsize<requiredsize) | 5104 if (outsize<requiredsize) |
| 4538 if (!charmapencode_resize(outobj, outpos, requiredsize)) { | 5105 if (charmapencode_resize(outobj, outpos, requiredsize)) { |
| 4539 Py_DECREF(rep); | 5106 Py_DECREF(rep); |
| 4540 return enc_EXCEPTION; | 5107 return enc_EXCEPTION; |
| 4541 } | 5108 } |
| 4542 outstart = PyString_AS_STRING(*outobj); | 5109 outstart = PyBytes_AS_STRING(*outobj); |
| 4543 memcpy(outstart + *outpos, repchars, repsize); | 5110 memcpy(outstart + *outpos, repchars, repsize); |
| 4544 *outpos += repsize; | 5111 *outpos += repsize; |
| 4545 } | 5112 } |
| 4546 } | 5113 } |
| 4547 Py_DECREF(rep); | 5114 Py_DECREF(rep); |
| 4548 return enc_SUCCESS; | 5115 return enc_SUCCESS; |
| 4549 } | 5116 } |
| 4550 | 5117 |
| 4551 /* handle an error in PyUnicode_EncodeCharmap | 5118 /* handle an error in PyUnicode_EncodeCharmap |
| 4552 Return 0 on success, -1 on error */ | 5119 Return 0 on success, -1 on error */ |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4640 } | 5207 } |
| 4641 } | 5208 } |
| 4642 *inpos = collendpos; | 5209 *inpos = collendpos; |
| 4643 break; | 5210 break; |
| 4644 default: | 5211 default: |
| 4645 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, | 5212 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, |
| 4646 encoding, reason, p, size,
exceptionObject, | 5213 encoding, reason, p, size,
exceptionObject, |
| 4647 collstartpos, collendpos,
&newpos); | 5214 collstartpos, collendpos,
&newpos); |
| 4648 if (repunicode == NULL) | 5215 if (repunicode == NULL) |
| 4649 return -1; | 5216 return -1; |
| 5217 if (PyBytes_Check(repunicode)) { |
| 5218 /* Directly copy bytes result to output. */ |
| 5219 Py_ssize_t outsize = PyBytes_Size(*res); |
| 5220 Py_ssize_t requiredsize; |
| 5221 repsize = PyBytes_Size(repunicode); |
| 5222 requiredsize = *respos + repsize; |
| 5223 if (requiredsize > outsize) |
| 5224 /* Make room for all additional bytes. */ |
| 5225 if (charmapencode_resize(res, respos, requiredsize)) { |
| 5226 Py_DECREF(repunicode); |
| 5227 return -1; |
| 5228 } |
| 5229 memcpy(PyBytes_AsString(*res) + *respos, |
| 5230 PyBytes_AsString(repunicode), repsize); |
| 5231 *respos += repsize; |
| 5232 *inpos = newpos; |
| 5233 Py_DECREF(repunicode); |
| 5234 break; |
| 5235 } |
| 4650 /* generate replacement */ | 5236 /* generate replacement */ |
| 4651 repsize = PyUnicode_GET_SIZE(repunicode); | 5237 repsize = PyUnicode_GET_SIZE(repunicode); |
| 4652 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { | 5238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { |
| 4653 x = charmapencode_output(*uni2, mapping, res, respos); | 5239 x = charmapencode_output(*uni2, mapping, res, respos); |
| 4654 if (x==enc_EXCEPTION) { | 5240 if (x==enc_EXCEPTION) { |
| 4655 return -1; | 5241 return -1; |
| 4656 } | 5242 } |
| 4657 else if (x==enc_FAILED) { | 5243 else if (x==enc_FAILED) { |
| 4658 Py_DECREF(repunicode); | 5244 Py_DECREF(repunicode); |
| 4659 raise_encode_exception(exceptionObject, encoding, p, size, colls
tartpos, collendpos, reason); | 5245 raise_encode_exception(exceptionObject, encoding, p, size, colls
tartpos, collendpos, reason); |
| (...skipping 23 matching lines...) Expand all Loading... |
| 4683 * -1=not initialized, 0=unknown, 1=strict, 2=replace, | 5269 * -1=not initialized, 0=unknown, 1=strict, 2=replace, |
| 4684 * 3=ignore, 4=xmlcharrefreplace */ | 5270 * 3=ignore, 4=xmlcharrefreplace */ |
| 4685 int known_errorHandler = -1; | 5271 int known_errorHandler = -1; |
| 4686 | 5272 |
| 4687 /* Default to Latin-1 */ | 5273 /* Default to Latin-1 */ |
| 4688 if (mapping == NULL) | 5274 if (mapping == NULL) |
| 4689 return PyUnicode_EncodeLatin1(p, size, errors); | 5275 return PyUnicode_EncodeLatin1(p, size, errors); |
| 4690 | 5276 |
| 4691 /* allocate enough for a simple encoding without | 5277 /* allocate enough for a simple encoding without |
| 4692 replacements, if we need more, we'll resize */ | 5278 replacements, if we need more, we'll resize */ |
| 4693 res = PyString_FromStringAndSize(NULL, size); | 5279 res = PyBytes_FromStringAndSize(NULL, size); |
| 4694 if (res == NULL) | 5280 if (res == NULL) |
| 4695 goto onError; | 5281 goto onError; |
| 4696 if (size == 0) | 5282 if (size == 0) |
| 4697 return res; | 5283 return res; |
| 4698 | 5284 |
| 4699 while (inpos<size) { | 5285 while (inpos<size) { |
| 4700 /* try to encode it */ | 5286 /* try to encode it */ |
| 4701 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &
respos); | 5287 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &
respos); |
| 4702 if (x==enc_EXCEPTION) /* error */ | 5288 if (x==enc_EXCEPTION) /* error */ |
| 4703 goto onError; | 5289 goto onError; |
| 4704 if (x==enc_FAILED) { /* unencodable character */ | 5290 if (x==enc_FAILED) { /* unencodable character */ |
| 4705 if (charmap_encoding_error(p, size, &inpos, mapping, | 5291 if (charmap_encoding_error(p, size, &inpos, mapping, |
| 4706 &exc, | 5292 &exc, |
| 4707 &known_errorHandler, &errorHandler, error
s, | 5293 &known_errorHandler, &errorHandler, error
s, |
| 4708 &res, &respos)) { | 5294 &res, &respos)) { |
| 4709 goto onError; | 5295 goto onError; |
| 4710 } | 5296 } |
| 4711 } | 5297 } |
| 4712 else | 5298 else |
| 4713 /* done with this character => adjust input position */ | 5299 /* done with this character => adjust input position */ |
| 4714 ++inpos; | 5300 ++inpos; |
| 4715 } | 5301 } |
| 4716 | 5302 |
| 4717 /* Resize if we allocated too much */ | 5303 /* Resize if we allocated too much */ |
| 4718 if (respos<PyString_GET_SIZE(res)) { | 5304 if (respos<PyBytes_GET_SIZE(res)) |
| 4719 if (_PyString_Resize(&res, respos)) | 5305 if (_PyBytes_Resize(&res, respos) < 0) |
| 4720 goto onError; | 5306 goto onError; |
| 4721 } | 5307 |
| 4722 Py_XDECREF(exc); | 5308 Py_XDECREF(exc); |
| 4723 Py_XDECREF(errorHandler); | 5309 Py_XDECREF(errorHandler); |
| 4724 return res; | 5310 return res; |
| 4725 | 5311 |
| 4726 onError: | 5312 onError: |
| 4727 Py_XDECREF(res); | 5313 Py_XDECREF(res); |
| 4728 Py_XDECREF(exc); | 5314 Py_XDECREF(exc); |
| 4729 Py_XDECREF(errorHandler); | 5315 Py_XDECREF(errorHandler); |
| 4730 return NULL; | 5316 return NULL; |
| 4731 } | 5317 } |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4783 build arguments, call the callback and check the arguments, | 5369 build arguments, call the callback and check the arguments, |
| 4784 put the result into newpos and return the replacement string, which | 5370 put the result into newpos and return the replacement string, which |
| 4785 has to be freed by the caller */ | 5371 has to be freed by the caller */ |
| 4786 static PyObject *unicode_translate_call_errorhandler(const char *errors, | 5372 static PyObject *unicode_translate_call_errorhandler(const char *errors, |
| 4787 PyObject **errorHandler, | 5373 PyObject **errorHandler, |
| 4788 const char *reason, | 5374 const char *reason, |
| 4789 const Py_UNICODE *unicode,
Py_ssize_t size, PyObject **exceptionObject, | 5375 const Py_UNICODE *unicode,
Py_ssize_t size, PyObject **exceptionObject, |
| 4790 Py_ssize_t startpos, Py_ssi
ze_t endpos, | 5376 Py_ssize_t startpos, Py_ssi
ze_t endpos, |
| 4791 Py_ssize_t *newpos) | 5377 Py_ssize_t *newpos) |
| 4792 { | 5378 { |
| 4793 static char *argparse = "O!n;translating error handler must return (unicode,
int) tuple"; | 5379 static char *argparse = "O!n;translating error handler must return (str, int
) tuple"; |
| 4794 | 5380 |
| 4795 Py_ssize_t i_newpos; | 5381 Py_ssize_t i_newpos; |
| 4796 PyObject *restuple; | 5382 PyObject *restuple; |
| 4797 PyObject *resunicode; | 5383 PyObject *resunicode; |
| 4798 | 5384 |
| 4799 if (*errorHandler == NULL) { | 5385 if (*errorHandler == NULL) { |
| 4800 *errorHandler = PyCodec_LookupError(errors); | 5386 *errorHandler = PyCodec_LookupError(errors); |
| 4801 if (*errorHandler == NULL) | 5387 if (*errorHandler == NULL) |
| 4802 return NULL; | 5388 return NULL; |
| 4803 } | 5389 } |
| (...skipping 30 matching lines...) Expand all Loading... |
| 4834 Py_DECREF(restuple); | 5420 Py_DECREF(restuple); |
| 4835 return resunicode; | 5421 return resunicode; |
| 4836 } | 5422 } |
| 4837 | 5423 |
| 4838 /* Lookup the character ch in the mapping and put the result in result, | 5424 /* Lookup the character ch in the mapping and put the result in result, |
| 4839 which must be decrefed by the caller. | 5425 which must be decrefed by the caller. |
| 4840 Return 0 on success, -1 on error */ | 5426 Return 0 on success, -1 on error */ |
| 4841 static | 5427 static |
| 4842 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) | 5428 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) |
| 4843 { | 5429 { |
| 4844 PyObject *w = PyInt_FromLong((long)c); | 5430 PyObject *w = PyLong_FromLong((long)c); |
| 4845 PyObject *x; | 5431 PyObject *x; |
| 4846 | 5432 |
| 4847 if (w == NULL) | 5433 if (w == NULL) |
| 4848 return -1; | 5434 return -1; |
| 4849 x = PyObject_GetItem(mapping, w); | 5435 x = PyObject_GetItem(mapping, w); |
| 4850 Py_DECREF(w); | 5436 Py_DECREF(w); |
| 4851 if (x == NULL) { | 5437 if (x == NULL) { |
| 4852 if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 5438 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
| 4853 /* No mapping found means: use 1:1 mapping. */ | 5439 /* No mapping found means: use 1:1 mapping. */ |
| 4854 PyErr_Clear(); | 5440 PyErr_Clear(); |
| 4855 *result = NULL; | 5441 *result = NULL; |
| 4856 return 0; | 5442 return 0; |
| 4857 } else | 5443 } else |
| 4858 return -1; | 5444 return -1; |
| 4859 } | 5445 } |
| 4860 else if (x == Py_None) { | 5446 else if (x == Py_None) { |
| 4861 *result = x; | 5447 *result = x; |
| 4862 return 0; | 5448 return 0; |
| 4863 } | 5449 } |
| 4864 else if (PyInt_Check(x)) { | 5450 else if (PyLong_Check(x)) { |
| 4865 long value = PyInt_AS_LONG(x); | 5451 long value = PyLong_AS_LONG(x); |
| 4866 long max = PyUnicode_GetMax(); | 5452 long max = PyUnicode_GetMax(); |
| 4867 if (value < 0 || value > max) { | 5453 if (value < 0 || value > max) { |
| 4868 PyErr_Format(PyExc_TypeError, | 5454 PyErr_Format(PyExc_TypeError, |
| 4869 "character mapping must be in range(0x%lx)", max+1); | 5455 "character mapping must be in range(0x%x)", max+1); |
| 4870 Py_DECREF(x); | 5456 Py_DECREF(x); |
| 4871 return -1; | 5457 return -1; |
| 4872 } | 5458 } |
| 4873 *result = x; | 5459 *result = x; |
| 4874 return 0; | 5460 return 0; |
| 4875 } | 5461 } |
| 4876 else if (PyUnicode_Check(x)) { | 5462 else if (PyUnicode_Check(x)) { |
| 4877 *result = x; | 5463 *result = x; |
| 4878 return 0; | 5464 return 0; |
| 4879 } | 5465 } |
| 4880 else { | 5466 else { |
| 4881 /* wrong return value */ | 5467 /* wrong return value */ |
| 4882 PyErr_SetString(PyExc_TypeError, | 5468 PyErr_SetString(PyExc_TypeError, |
| 4883 "character mapping must return integer, None or unicode"
); | 5469 "character mapping must return integer, None or str"); |
| 4884 Py_DECREF(x); | 5470 Py_DECREF(x); |
| 4885 return -1; | 5471 return -1; |
| 4886 } | 5472 } |
| 4887 } | 5473 } |
| 4888 /* ensure that *outobj is at least requiredsize characters long, | 5474 /* ensure that *outobj is at least requiredsize characters long, |
| 4889 if not reallocate and adjust various state variables. | 5475 if not reallocate and adjust various state variables. |
| 4890 Return 0 on success, -1 on error */ | 5476 Return 0 on success, -1 on error */ |
| 4891 static | 5477 static |
| 4892 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, | 5478 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, |
| 4893 Py_ssize_t requiredsize) | 5479 Py_ssize_t requiredsize) |
| (...skipping 23 matching lines...) Expand all Loading... |
| 4917 PyObject **res) | 5503 PyObject **res) |
| 4918 { | 5504 { |
| 4919 if (charmaptranslate_lookup(*curinp, mapping, res)) | 5505 if (charmaptranslate_lookup(*curinp, mapping, res)) |
| 4920 return -1; | 5506 return -1; |
| 4921 if (*res==NULL) { | 5507 if (*res==NULL) { |
| 4922 /* not found => default to 1:1 mapping */ | 5508 /* not found => default to 1:1 mapping */ |
| 4923 *(*outp)++ = *curinp; | 5509 *(*outp)++ = *curinp; |
| 4924 } | 5510 } |
| 4925 else if (*res==Py_None) | 5511 else if (*res==Py_None) |
| 4926 ; | 5512 ; |
| 4927 else if (PyInt_Check(*res)) { | 5513 else if (PyLong_Check(*res)) { |
| 4928 /* no overflow check, because we know that the space is enough */ | 5514 /* no overflow check, because we know that the space is enough */ |
| 4929 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); | 5515 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); |
| 4930 } | 5516 } |
| 4931 else if (PyUnicode_Check(*res)) { | 5517 else if (PyUnicode_Check(*res)) { |
| 4932 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); | 5518 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); |
| 4933 if (repsize==1) { | 5519 if (repsize==1) { |
| 4934 /* no overflow check, because we know that the space is enough */ | 5520 /* no overflow check, because we know that the space is enough */ |
| 4935 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); | 5521 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); |
| 4936 } | 5522 } |
| 4937 else if (repsize!=0) { | 5523 else if (repsize!=0) { |
| 4938 /* more than one character */ | 5524 /* more than one character */ |
| 4939 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + | 5525 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + |
| (...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5201 for (p = collstart; p < collend; ++p) | 5787 for (p = collstart; p < collend; ++p) |
| 5202 output += sprintf(output, "&#%d;", (int)*p); | 5788 output += sprintf(output, "&#%d;", (int)*p); |
| 5203 p = collend; | 5789 p = collend; |
| 5204 break; | 5790 break; |
| 5205 default: | 5791 default: |
| 5206 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, | 5792 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, |
| 5207 encoding, reason, s, l
ength, &exc, | 5793 encoding, reason, s, l
ength, &exc, |
| 5208 collstart-s, collend-s
, &newpos); | 5794 collstart-s, collend-s
, &newpos); |
| 5209 if (repunicode == NULL) | 5795 if (repunicode == NULL) |
| 5210 goto onError; | 5796 goto onError; |
| 5797 if (!PyUnicode_Check(repunicode)) { |
| 5798 /* Byte results not supported, since they have no decimal proper
ty. */ |
| 5799 PyErr_SetString(PyExc_TypeError, "error handler should return un
icode"); |
| 5800 Py_DECREF(repunicode); |
| 5801 goto onError; |
| 5802 } |
| 5211 /* generate replacement */ | 5803 /* generate replacement */ |
| 5212 repsize = PyUnicode_GET_SIZE(repunicode); | 5804 repsize = PyUnicode_GET_SIZE(repunicode); |
| 5213 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { | 5805 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { |
| 5214 Py_UNICODE ch = *uni2; | 5806 Py_UNICODE ch = *uni2; |
| 5215 if (Py_UNICODE_ISSPACE(ch)) | 5807 if (Py_UNICODE_ISSPACE(ch)) |
| 5216 *output++ = ' '; | 5808 *output++ = ' '; |
| 5217 else { | 5809 else { |
| 5218 decimal = Py_UNICODE_TODECIMAL(ch); | 5810 decimal = Py_UNICODE_TODECIMAL(ch); |
| 5219 if (decimal >= 0) | 5811 if (decimal >= 0) |
| 5220 *output++ = '0' + decimal; | 5812 *output++ = '0' + decimal; |
| (...skipping 20 matching lines...) Expand all Loading... |
| 5241 onError: | 5833 onError: |
| 5242 Py_XDECREF(exc); | 5834 Py_XDECREF(exc); |
| 5243 Py_XDECREF(errorHandler); | 5835 Py_XDECREF(errorHandler); |
| 5244 return -1; | 5836 return -1; |
| 5245 } | 5837 } |
| 5246 | 5838 |
| 5247 /* --- Helpers ------------------------------------------------------------ */ | 5839 /* --- Helpers ------------------------------------------------------------ */ |
| 5248 | 5840 |
| 5249 #include "stringlib/unicodedefs.h" | 5841 #include "stringlib/unicodedefs.h" |
| 5250 #include "stringlib/fastsearch.h" | 5842 #include "stringlib/fastsearch.h" |
| 5251 | |
| 5252 #include "stringlib/count.h" | 5843 #include "stringlib/count.h" |
| 5844 /* Include _ParseTupleFinds from find.h */ |
| 5845 #define FROM_UNICODE |
| 5253 #include "stringlib/find.h" | 5846 #include "stringlib/find.h" |
| 5254 #include "stringlib/partition.h" | 5847 #include "stringlib/partition.h" |
| 5255 #include "stringlib/split.h" | 5848 |
| 5849 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping |
| 5850 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLoca
le |
| 5851 #include "stringlib/localeutil.h" |
| 5256 | 5852 |
| 5257 /* helper macro to fixup start/end slice values */ | 5853 /* helper macro to fixup start/end slice values */ |
| 5258 #define ADJUST_INDICES(start, end, len) \ | 5854 #define FIX_START_END(obj) \ |
| 5259 if (end > len) \ | 5855 if (start < 0) \ |
| 5260 end = len; \ | 5856 start += (obj)->length; \ |
| 5261 else if (end < 0) { \ | 5857 if (start < 0) \ |
| 5262 end += len; \ | 5858 start = 0; \ |
| 5263 if (end < 0) \ | 5859 if (end > (obj)->length) \ |
| 5264 end = 0; \ | 5860 end = (obj)->length; \ |
| 5265 } \ | 5861 if (end < 0) \ |
| 5266 if (start < 0) { \ | 5862 end += (obj)->length; \ |
| 5267 start += len; \ | 5863 if (end < 0) \ |
| 5268 if (start < 0) \ | 5864 end = 0; |
| 5269 start = 0; \ | |
| 5270 } | |
| 5271 | 5865 |
| 5272 Py_ssize_t PyUnicode_Count(PyObject *str, | 5866 Py_ssize_t PyUnicode_Count(PyObject *str, |
| 5273 PyObject *substr, | 5867 PyObject *substr, |
| 5274 Py_ssize_t start, | 5868 Py_ssize_t start, |
| 5275 Py_ssize_t end) | 5869 Py_ssize_t end) |
| 5276 { | 5870 { |
| 5277 Py_ssize_t result; | 5871 Py_ssize_t result; |
| 5278 PyUnicodeObject* str_obj; | 5872 PyUnicodeObject* str_obj; |
| 5279 PyUnicodeObject* sub_obj; | 5873 PyUnicodeObject* sub_obj; |
| 5280 | 5874 |
| 5281 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); | 5875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); |
| 5282 if (!str_obj) | 5876 if (!str_obj) |
| 5283 return -1; | 5877 return -1; |
| 5284 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); | 5878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); |
| 5285 if (!sub_obj) { | 5879 if (!sub_obj) { |
| 5286 Py_DECREF(str_obj); | 5880 Py_DECREF(str_obj); |
| 5287 return -1; | 5881 return -1; |
| 5288 } | 5882 } |
| 5289 | 5883 |
| 5290 ADJUST_INDICES(start, end, str_obj->length); | 5884 FIX_START_END(str_obj); |
| 5885 |
| 5291 result = stringlib_count( | 5886 result = stringlib_count( |
| 5292 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, | 5887 str_obj->str + start, end - start, sub_obj->str, sub_obj->length |
| 5293 PY_SSIZE_T_MAX | |
| 5294 ); | 5888 ); |
| 5295 | 5889 |
| 5296 Py_DECREF(sub_obj); | 5890 Py_DECREF(sub_obj); |
| 5297 Py_DECREF(str_obj); | 5891 Py_DECREF(str_obj); |
| 5298 | 5892 |
| 5299 return result; | 5893 return result; |
| 5300 } | 5894 } |
| 5301 | 5895 |
| 5302 Py_ssize_t PyUnicode_Find(PyObject *str, | 5896 Py_ssize_t PyUnicode_Find(PyObject *str, |
| 5303 PyObject *sub, | 5897 PyObject *sub, |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5338 static | 5932 static |
| 5339 int tailmatch(PyUnicodeObject *self, | 5933 int tailmatch(PyUnicodeObject *self, |
| 5340 PyUnicodeObject *substring, | 5934 PyUnicodeObject *substring, |
| 5341 Py_ssize_t start, | 5935 Py_ssize_t start, |
| 5342 Py_ssize_t end, | 5936 Py_ssize_t end, |
| 5343 int direction) | 5937 int direction) |
| 5344 { | 5938 { |
| 5345 if (substring->length == 0) | 5939 if (substring->length == 0) |
| 5346 return 1; | 5940 return 1; |
| 5347 | 5941 |
| 5348 ADJUST_INDICES(start, end, self->length); | 5942 FIX_START_END(self); |
| 5943 |
| 5349 end -= substring->length; | 5944 end -= substring->length; |
| 5350 if (end < start) | 5945 if (end < start) |
| 5351 return 0; | 5946 return 0; |
| 5352 | 5947 |
| 5353 if (direction > 0) { | 5948 if (direction > 0) { |
| 5354 if (Py_UNICODE_MATCH(self, end, substring)) | 5949 if (Py_UNICODE_MATCH(self, end, substring)) |
| 5355 return 1; | 5950 return 1; |
| 5356 } else { | 5951 } else { |
| 5357 if (Py_UNICODE_MATCH(self, start, substring)) | 5952 if (Py_UNICODE_MATCH(self, start, substring)) |
| 5358 return 1; | 5953 return 1; |
| (...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5534 previous_is_cased = 1; | 6129 previous_is_cased = 1; |
| 5535 else | 6130 else |
| 5536 previous_is_cased = 0; | 6131 previous_is_cased = 0; |
| 5537 } | 6132 } |
| 5538 return 1; | 6133 return 1; |
| 5539 } | 6134 } |
| 5540 | 6135 |
| 5541 PyObject * | 6136 PyObject * |
| 5542 PyUnicode_Join(PyObject *separator, PyObject *seq) | 6137 PyUnicode_Join(PyObject *separator, PyObject *seq) |
| 5543 { | 6138 { |
| 5544 PyObject *internal_separator = NULL; | |
| 5545 const Py_UNICODE blank = ' '; | 6139 const Py_UNICODE blank = ' '; |
| 5546 const Py_UNICODE *sep = &blank; | 6140 const Py_UNICODE *sep = &blank; |
| 5547 Py_ssize_t seplen = 1; | 6141 Py_ssize_t seplen = 1; |
| 5548 PyUnicodeObject *res = NULL; /* the result */ | 6142 PyUnicodeObject *res = NULL; /* the result */ |
| 5549 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ | |
| 5550 Py_ssize_t res_used; /* # used bytes */ | |
| 5551 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ | 6143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ |
| 5552 PyObject *fseq; /* PySequence_Fast(seq) */ | 6144 PyObject *fseq; /* PySequence_Fast(seq) */ |
| 5553 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ | 6145 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ |
| 6146 PyObject **items; |
| 5554 PyObject *item; | 6147 PyObject *item; |
| 5555 Py_ssize_t i; | 6148 Py_ssize_t sz, i; |
| 5556 | 6149 |
| 5557 fseq = PySequence_Fast(seq, ""); | 6150 fseq = PySequence_Fast(seq, ""); |
| 5558 if (fseq == NULL) { | 6151 if (fseq == NULL) { |
| 5559 return NULL; | 6152 return NULL; |
| 5560 } | 6153 } |
| 5561 | 6154 |
| 5562 /* Grrrr. A codec may be invoked to convert str objects to | 6155 /* NOTE: the following code can't call back into Python code, |
| 5563 * Unicode, and so it's possible to call back into Python code | 6156 * so we are sure that fseq won't be mutated. |
| 5564 * during PyUnicode_FromObject(), and so it's possible for a sick | |
| 5565 * codec to change the size of fseq (if seq is a list). Therefore | |
| 5566 * we have to keep refetching the size -- can't assume seqlen | |
| 5567 * is invariant. | |
| 5568 */ | 6157 */ |
| 6158 |
| 5569 seqlen = PySequence_Fast_GET_SIZE(fseq); | 6159 seqlen = PySequence_Fast_GET_SIZE(fseq); |
| 5570 /* If empty sequence, return u"". */ | 6160 /* If empty sequence, return u"". */ |
| 5571 if (seqlen == 0) { | 6161 if (seqlen == 0) { |
| 5572 res = _PyUnicode_New(0); /* empty sequence; return u"" */ | 6162 res = _PyUnicode_New(0); /* empty sequence; return u"" */ |
| 5573 goto Done; | 6163 goto Done; |
| 5574 } | 6164 } |
| 6165 items = PySequence_Fast_ITEMS(fseq); |
| 5575 /* If singleton sequence with an exact Unicode, return that. */ | 6166 /* If singleton sequence with an exact Unicode, return that. */ |
| 5576 if (seqlen == 1) { | 6167 if (seqlen == 1) { |
| 5577 item = PySequence_Fast_GET_ITEM(fseq, 0); | 6168 item = items[0]; |
| 5578 if (PyUnicode_CheckExact(item)) { | 6169 if (PyUnicode_CheckExact(item)) { |
| 5579 Py_INCREF(item); | 6170 Py_INCREF(item); |
| 5580 res = (PyUnicodeObject *)item; | 6171 res = (PyUnicodeObject *)item; |
| 5581 goto Done; | 6172 goto Done; |
| 5582 } | 6173 } |
| 5583 } | 6174 } |
| 5584 | 6175 else { |
| 5585 /* At least two items to join, or one that isn't exact Unicode. */ | 6176 /* Set up sep and seplen */ |
| 5586 if (seqlen > 1) { | |
| 5587 /* Set up sep and seplen -- they're needed. */ | |
| 5588 if (separator == NULL) { | 6177 if (separator == NULL) { |
| 5589 sep = &blank; | 6178 sep = &blank; |
| 5590 seplen = 1; | 6179 seplen = 1; |
| 5591 } | 6180 } |
| 5592 else { | 6181 else { |
| 5593 internal_separator = PyUnicode_FromObject(separator); | 6182 if (!PyUnicode_Check(separator)) { |
| 5594 if (internal_separator == NULL) | 6183 PyErr_Format(PyExc_TypeError, |
| 6184 "separator: expected str instance," |
| 6185 " %.80s found", |
| 6186 Py_TYPE(separator)->tp_name); |
| 5595 goto onError; | 6187 goto onError; |
| 5596 sep = PyUnicode_AS_UNICODE(internal_separator); | 6188 } |
| 5597 seplen = PyUnicode_GET_SIZE(internal_separator); | 6189 sep = PyUnicode_AS_UNICODE(separator); |
| 5598 /* In case PyUnicode_FromObject() mutated seq. */ | 6190 seplen = PyUnicode_GET_SIZE(separator); |
| 5599 seqlen = PySequence_Fast_GET_SIZE(fseq); | 6191 } |
| 5600 } | 6192 } |
| 5601 } | 6193 |
| 5602 | 6194 /* There are at least two things to join, or else we have a subclass |
| 5603 /* Get space. */ | 6195 * of str in the sequence. |
| 5604 res = _PyUnicode_New(res_alloc); | 6196 * Do a pre-pass to figure out the total amount of space we'll |
| 5605 if (res == NULL) | 6197 * need (sz), and see whether all argument are strings. |
| 5606 goto onError; | 6198 */ |
| 5607 res_p = PyUnicode_AS_UNICODE(res); | 6199 sz = 0; |
| 5608 res_used = 0; | 6200 for (i = 0; i < seqlen; i++) { |
| 5609 | 6201 const Py_ssize_t old_sz = sz; |
| 5610 for (i = 0; i < seqlen; ++i) { | 6202 item = items[i]; |
| 5611 Py_ssize_t itemlen; | 6203 if (!PyUnicode_Check(item)) { |
| 5612 Py_ssize_t new_res_used; | |
| 5613 | |
| 5614 item = PySequence_Fast_GET_ITEM(fseq, i); | |
| 5615 /* Convert item to Unicode. */ | |
| 5616 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { | |
| 5617 PyErr_Format(PyExc_TypeError, | |