diff --git a/Include/longobject.h b/Include/longobject.h --- a/Include/longobject.h +++ b/Include/longobject.h @@ -80,6 +80,7 @@ PyAPI_FUNC(PyObject *) PyLong_FromString(char *, char **, int); #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject *) PyLong_FromUnicode(Py_UNICODE*, Py_ssize_t, int); +PyAPI_FUNC(PyObject *) PyLong_FromUnicodeObject(PyObject *u, int base); #endif #ifndef Py_LIMITED_API diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -68,12 +68,15 @@ properly set, but the default rules below doesn't set it. I'll sort this out some other day -- fredrik@pythonware.com */ -#ifndef Py_UNICODE_SIZE -#error Must define Py_UNICODE_SIZE +#ifndef SIZEOF_WCHAR_T +#error Must define SIZEOF_WCHAR_T #endif -/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode - strings are stored as UCS-2 (with limited support for UTF-16) */ +#define Py_UNICODE_SIZE SIZEOF_WCHAR_T + +/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. + Otherwise, Unicode strings are stored as UCS-2 (with limited support + for UTF-16) */ #if Py_UNICODE_SIZE >= 4 #define Py_UNICODE_WIDE @@ -84,19 +87,14 @@ /* #define HAVE_WCHAR_H */ /* #define HAVE_USABLE_WCHAR_T */ -/* Defaults for various platforms */ -#ifndef PY_UNICODE_TYPE +/* Py_UNICODE was the native Unicode storage format (code unit) used by + Python and represents a single Unicode element in the Unicode type. + With PEP 393, Py_UNICODE is deprecated and replaced with a + typedef to wchar_t.
*/ -/* Windows has a usable wchar_t type (unless we're using UCS-4) */ -# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 -# define HAVE_USABLE_WCHAR_T -# define PY_UNICODE_TYPE wchar_t -# endif - -# if defined(Py_UNICODE_WIDE) -# define PY_UNICODE_TYPE Py_UCS4 -# endif - +#ifndef Py_LIMITED_API +#define PY_UNICODE_TYPE wchar_t +typedef wchar_t Py_UNICODE; #endif /* If the compiler provides a wchar_t type we try to support it @@ -121,200 +119,18 @@ # include #endif -/* - * Use this typedef when you need to represent a UTF-16 surrogate pair - * as single unsigned integer. - */ +/* Py_UCS4 and Py_UCS2 are typedefs for the respective + unicode representations. */ #if SIZEOF_INT >= 4 typedef unsigned int Py_UCS4; #elif SIZEOF_LONG >= 4 typedef unsigned long Py_UCS4; +#else +#error "Could not find a proper typedef for Py_UCS4" #endif -/* Py_UNICODE is the native Unicode storage format (code unit) used by - Python and represents a single Unicode element in the Unicode - type. */ +typedef unsigned short Py_UCS2; -#ifndef Py_LIMITED_API -typedef PY_UNICODE_TYPE Py_UNICODE; -#endif - -/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ - -/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds - produce different external names and thus cause import errors in - case Python interpreters and extensions with mixed compiled in - Unicode width assumptions are combined.
*/ - -#ifndef Py_UNICODE_WIDE - -# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString -# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString -# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject -# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode -# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject -# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString -# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode -# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String -# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString -# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String -# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String -# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String -# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode -# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString -# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar -# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString -# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist -# define PyUnicode_Compare PyUnicodeUCS2_Compare -# define PyUnicode_CompareWithASCIIString PyUnicodeUCS2_CompareWithASCIIString -# define PyUnicode_Concat PyUnicodeUCS2_Concat -# define PyUnicode_Append PyUnicodeUCS2_Append -# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel -# define PyUnicode_Contains PyUnicodeUCS2_Contains -# define PyUnicode_Count PyUnicodeUCS2_Count -# define PyUnicode_Decode PyUnicodeUCS2_Decode -# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII -# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap -# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 -# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault -# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize -# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape -# define 
PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 -# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful -# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 -# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful -# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 -# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful -# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape -# define PyUnicode_Encode PyUnicodeUCS2_Encode -# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII -# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap -# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal -# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 -# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape -# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 -# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 -# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 -# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape -# define PyUnicode_Find PyUnicodeUCS2_Find -# define PyUnicode_Format PyUnicodeUCS2_Format -# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject -# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat -# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV -# define PyUnicode_FromObject PyUnicodeUCS2_FromObject -# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal -# define PyUnicode_FromString PyUnicodeUCS2_FromString -# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize -# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode -# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar -# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter -# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder -# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding -# define PyUnicode_GetMax PyUnicodeUCS2_GetMax -# define PyUnicode_GetSize PyUnicodeUCS2_GetSize -# define 
PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier -# define PyUnicode_Join PyUnicodeUCS2_Join -# define PyUnicode_Partition PyUnicodeUCS2_Partition -# define PyUnicode_RPartition PyUnicodeUCS2_RPartition -# define PyUnicode_RSplit PyUnicodeUCS2_RSplit -# define PyUnicode_Replace PyUnicodeUCS2_Replace -# define PyUnicode_Resize PyUnicodeUCS2_Resize -# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare -# define PyUnicode_Split PyUnicodeUCS2_Split -# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines -# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch -# define PyUnicode_Translate PyUnicodeUCS2_Translate -# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap -# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString -# define _PyUnicode_Fini _PyUnicodeUCS2_Fini -# define _PyUnicode_Init _PyUnicodeUCS2_Init -# define PyUnicode_strdup PyUnicodeUCS2_strdup - -#else - -# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString -# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString -# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject -# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode -# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject -# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString -# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode -# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String -# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString -# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String -# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String -# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String -# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode -# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString -# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar -# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString -# define 
PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist -# define PyUnicode_Compare PyUnicodeUCS4_Compare -# define PyUnicode_CompareWithASCIIString PyUnicodeUCS4_CompareWithASCIIString -# define PyUnicode_Concat PyUnicodeUCS4_Concat -# define PyUnicode_Append PyUnicodeUCS4_Append -# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel -# define PyUnicode_Contains PyUnicodeUCS4_Contains -# define PyUnicode_Count PyUnicodeUCS4_Count -# define PyUnicode_Decode PyUnicodeUCS4_Decode -# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII -# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap -# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 -# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault -# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize -# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape -# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 -# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful -# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 -# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful -# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 -# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful -# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape -# define PyUnicode_Encode PyUnicodeUCS4_Encode -# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII -# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap -# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal -# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 -# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape -# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 -# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 -# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 -# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape -# define PyUnicode_Find 
PyUnicodeUCS4_Find -# define PyUnicode_Format PyUnicodeUCS4_Format -# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject -# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat -# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV -# define PyUnicode_FromObject PyUnicodeUCS4_FromObject -# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal -# define PyUnicode_FromString PyUnicodeUCS4_FromString -# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize -# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode -# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar -# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter -# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder -# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding -# define PyUnicode_GetMax PyUnicodeUCS4_GetMax -# define PyUnicode_GetSize PyUnicodeUCS4_GetSize -# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier -# define PyUnicode_Join PyUnicodeUCS4_Join -# define PyUnicode_Partition PyUnicodeUCS4_Partition -# define PyUnicode_RPartition PyUnicodeUCS4_RPartition -# define PyUnicode_RSplit PyUnicodeUCS4_RSplit -# define PyUnicode_Replace PyUnicodeUCS4_Replace -# define PyUnicode_Resize PyUnicodeUCS4_Resize -# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare -# define PyUnicode_Split PyUnicodeUCS4_Split -# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines -# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch -# define PyUnicode_Translate PyUnicodeUCS4_Translate -# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap -# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString -# define _PyUnicode_Fini _PyUnicodeUCS4_Fini -# define _PyUnicode_Init _PyUnicodeUCS4_Init -# define PyUnicode_strdup PyUnicodeUCS4_strdup - -#endif /* --- Internal Unicode Operations ---------------------------------------- */ @@ -354,7 +170,7 @@ Py_UNICODE_ISDIGIT(ch) || \ Py_UNICODE_ISNUMERIC(ch)) -#define 
Py_UNICODE_COPY(target, source, length) \ +#define Py_UNICODE_COPY(target, source, length) \ Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) #define Py_UNICODE_FILL(target, value, length) \ @@ -366,9 +182,10 @@ valid, and the substring must not be empty. */ #define Py_UNICODE_MATCH(string, offset, substring) \ - ((*((string)->str + (offset)) == *((substring)->str)) && \ - ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ - !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) + ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ + ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ + !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) + #endif /* Py_LIMITED_API */ #ifdef __cplusplus @@ -380,39 +197,216 @@ #ifndef Py_LIMITED_API typedef struct { PyObject_HEAD - Py_ssize_t length; /* Length of raw Unicode data in buffer */ - Py_UNICODE *str; /* Raw Unicode buffer */ + Py_ssize_t length; /* Number of code points in the string */ + void *str; /* Canonical, smallest-form Unicode buffer */ Py_hash_t hash; /* Hash value; -1 if not set */ int state; /* != 0 if interned. In this case the two * references from the dictionary to this object - * are *not* counted in ob_refcnt. */ - PyObject *defenc; /* (Default) Encoded version as Python - string, or NULL; this is used for - implementing the buffer protocol */ + * are *not* counted in ob_refcnt. + * See SSTATE_KIND_* for other bits */ + Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the + * terminating \0. */ + char *utf8; /* UTF-8 representation (null-terminated) */ + Py_ssize_t wstr_length; /* Number of code points in wstr, possible + * surrogates count as two code points. 
*/ + wchar_t *wstr; /* wchar_t representation (null-terminated) */ } PyUnicodeObject; #endif PyAPI_DATA(PyTypeObject) PyUnicode_Type; PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; -#define SSTATE_NOT_INTERNED 0 -#define SSTATE_INTERNED_MORTAL 1 -#define SSTATE_INTERNED_IMMORTAL 2 - #define PyUnicode_Check(op) \ PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) /* Fast access macros */ #ifndef Py_LIMITED_API + +/* Returns the deprecated Py_UNICODE representation's size in code units + (this includes surrogate pairs as 2 units). + If the Py_UNICODE representation is not available, it will be computed + on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ + #define PyUnicode_GET_SIZE(op) \ - (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) + (assert(PyUnicode_Check(op)), \ + (((PyUnicodeObject *)(op))->wstr) ? \ + (((PyUnicodeObject *)(op))->wstr_length) : \ + ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ + (((PyUnicodeObject *)(op))->wstr_length))) + #define PyUnicode_GET_DATA_SIZE(op) \ - (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) + (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) + +/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE + representation on demand. Using this macro is very inefficient now, + try to port your code to use the new PyUnicode_*BYTE_DATA() macros or + use PyUnicode_WRITE() and PyUnicode_READ(). */ + #define PyUnicode_AS_UNICODE(op) \ - (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) + (assert(PyUnicode_Check(op)), \ + (((PyUnicodeObject *)(op))->wstr) ? 
(((PyUnicodeObject *)(op))->wstr) : \ + PyUnicode_AsUnicode((PyObject *)(op))) + #define PyUnicode_AS_DATA(op) \ - (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) + ((const char *)(PyUnicode_AS_UNICODE(op))) + + +/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */ + +/* Values for PyUnicodeObject.state: */ + +/* Interning state. */ +#define SSTATE_INTERN_MASK 0x03 +#define SSTATE_NOT_INTERNED 0x00 +#define SSTATE_INTERNED_MORTAL 0x01 +#define SSTATE_INTERNED_IMMORTAL 0x02 + +/* Kind of the canonical representation. */ +#define SSTATE_KIND_MASK 0x0C +#define SSTATE_KIND_NOT_READY 0x00 +#define SSTATE_KIND_LATIN1 0x04 +#define SSTATE_KIND_UCS2 0x08 +#define SSTATE_KIND_UCS4 0x0C + +/* Compact is with respect to the allocation scheme. Compact unicode objects + only require one memory block while non-compact objects use one block for + the PyUnicodeObject struct and another for its data buffer. */ +#define SSTATE_COMPACT_MASK 0x10 +#define SSTATE_NOT_COMPACT 0x00 +#define SSTATE_IS_COMPACT 0x10 + +/* Return values of the PyUnicode_KIND() macro: */ + +/* String contains only wstr byte characters. This is only possible + when the string was created with a legacy API and PyUnicode_Ready() + has not been called yet. Note that PyUnicode_KIND() calls + PyUnicode_FAST_READY() so PyUnicode_WCHAR_KIND is only possible as an + initialized value not as a result of PyUnicode_KIND(). */ +#define PyUnicode_WCHAR_KIND 0 +#define PyUnicode_1BYTE_KIND 1 +#define PyUnicode_2BYTE_KIND 2 +#define PyUnicode_4BYTE_KIND 3 + + +/* Return the number of bytes the string uses to represent single characters, + this can be 1, 2 or 4. Only valid once the string is ready. */ +#define PyUnicode_CHARACTER_SIZE(op) \ + (1 << (((SSTATE_KIND_MASK & ((PyUnicodeObject *)(op))->state) >> 2) - 1)) + +/* Return pointers to the canonical representation cast to unsigned char, + Py_UCS2, or Py_UCS4 for direct character access.
+ No checks are performed, use PyUnicode_CHARACTER_SIZE or + PyUnicode_KIND() before to ensure these will work correctly. */ + +#define PyUnicode_1BYTE_DATA(op) ((unsigned char *)(((PyUnicodeObject *)(op))->str)) +#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2 *)(((PyUnicodeObject *)(op))->str)) +#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4 *)(((PyUnicodeObject *)(op))->str)) + +/* Return true if the string is compact or 0 if not. + No type checks or Ready calls are performed. */ +#define PyUnicode_IS_COMPACT(op) \ + ((((PyUnicodeObject *)(op))->state & SSTATE_COMPACT_MASK) == SSTATE_IS_COMPACT) + +/* Return one of the PyUnicode_*_KIND values defined above. + This macro calls PyUnicode_FAST_READY() before returning the kind. */ +#define PyUnicode_KIND(op) \ + (assert(PyUnicode_Check(op)), \ + PyUnicode_FAST_READY((PyUnicodeObject *)(op)), \ + ((SSTATE_KIND_MASK & (((PyUnicodeObject *)(op))->state)) >> 2)) + +/* Return a void pointer to the raw unicode buffer. + This macro calls PyUnicode_FAST_READY() before returning the pointer. */ +#define PyUnicode_DATA(op) \ + (assert(PyUnicode_Check(op)), \ + PyUnicode_FAST_READY((PyUnicodeObject *)(op)), \ + ((((PyUnicodeObject *)(op))->str))) + +/* Write into the canonical representation, this macro does not do any sanity + checks and is intended for usage in loops. The caller should cache the + kind and data pointers obtained from other macro calls. + index is the index in the string (starts at 0) and value is the new + code point value which should be written to that location. */ +#define PyUnicode_WRITE(kind, data, index, value) \ + do { \ + const int k_ = (kind); \ + if (k_ == PyUnicode_1BYTE_KIND) \ + ((unsigned char *)(data))[(index)] = (unsigned char)(value); \ + else if (k_ == PyUnicode_2BYTE_KIND) \ + ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ + else \ + ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ + } while (0) + +/* Read a code point from the string's canonical representation. No checks + or ready calls are performed.
*/ +#define PyUnicode_READ(kind, data, index) \ + ((Py_UCS4) \ + ((kind) == PyUnicode_1BYTE_KIND ? \ + ((const unsigned char *)(data))[(index)] : \ + ((kind) == PyUnicode_2BYTE_KIND ? \ + ((const Py_UCS2 *)(data))[(index)] : \ + ((const Py_UCS4 *)(data))[(index)] \ + ) \ + )) + +/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it + calls PyUnicode_KIND() and might call it twice. For single reads, use + PyUnicode_READ_CHAR, for multiple consecutive reads callers should + cache kind and use PyUnicode_READ instead. */ +#define PyUnicode_READ_CHAR(unicode, index) \ + ((Py_UCS4) \ + (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ + ((const unsigned char *)(PyUnicode_DATA((unicode))))[(index)] : \ + (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ + ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ + ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ + ) \ + )) + +/* Returns the length of the unicode string. The caller has to make sure that + the string has it's canonical representation set before calling + this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ +#define PyUnicode_GET_LENGTH(op) \ + (assert(PyUnicode_Check(op)), ((PyUnicodeObject *)(op))->length) + +/* PyUnicode_FAST_READY() does less work than PyUnicode_Ready() in the best + case. If the canonical representation is not yet set, it will still call + PyUnicode_Ready(). + Returns 0 on success and -1 on errors. */ +#define PyUnicode_FAST_READY(op) \ + (assert(PyUnicode_Check(op)), \ + (((PyUnicodeObject *)(op))->str != NULL ? \ + 0 : PyUnicode_Ready((PyUnicodeObject *)(op)))) + +/* Generic helper macro to convert characters of different types. + from_type and to_type have to be valid type names, begin and end + are pointers to the source characters which should be of type + "from_type *". to is a pointer of type "to_type *" and points to the + buffer where the result characters are written to. 
*/ +#define PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ + do { \ + const from_type *iter_; to_type *to_; \ + for (iter_ = (begin), to_ = (to_type *)(to); \ + iter_ < (end); \ + ++iter_, ++to_) { \ + *to_ = *iter_; \ + } \ + } while (0) + +/* Return a maximum character value which is suitable for creating another + string based on op. This is always an approximation but more efficient + than iterating over the string. */ +#define PyUnicode_MAX_CHAR_VALUE(op) \ + (PyUnicode_FAST_READY((op)), \ + (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ + (((PyUnicodeObject *)(op))->str == ((PyUnicodeObject *)(op))->utf8 ? \ + (0x7f) : (0xff) \ + ) : \ + (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ + (0xffff) : (0x10ffff) \ + ))) + #endif /* --- Constants ---------------------------------------------------------- */ @@ -428,6 +422,52 @@ /* --- Plain Py_UNICODE --------------------------------------------------- */ +/* With PEP 393, this is the recommended way to allocate a new unicode object. + This function will allocate the object and its buffer in a single memory + block. Objects created using this function are not resizable. */ +#ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject*) PyUnicode_New( + Py_ssize_t size, /* Number of code points in the new string */ + Py_UCS4 maxchar /* maximum code point value in the string */ + ); +#endif + +/* Initializes the canonical string representation from the deprecated + wstr/Py_UNICODE representation. This function is used to convert + unicode objects which were created using the old API to the new flexible + format introduced with PEP 393. The PyUnicode_FAST_READY() macro can be + more efficient if the string is already ready. */ +#ifndef Py_LIMITED_API +PyAPI_FUNC(int) PyUnicode_Ready( + PyUnicodeObject *unicode /* Unicode object */ + ); +#endif + +/* Copy characters from one unicode object into another, this function performs + character conversion when necessary and falls back to memcpy if possible.
+ No error checking or validation of the arguments is performed. */ +#ifndef Py_LIMITED_API +PyAPI_FUNC(void) PyUnicode_CopyCharacters( + PyUnicodeObject *to, + Py_ssize_t to_start, + const PyUnicodeObject *from, + Py_ssize_t from_start, + Py_ssize_t how_many + ); +#endif + +/* Find the maximum code point and count the number of surrogate pairs so a + correct string length can be computed before converting a string to UCS4. + This function counts single surrogates as a character and not as a pair. */ +#ifndef Py_LIMITED_API +PyAPI_FUNC(int) PyUnicode_FindMaxCharAndNumSurrogatePairs( + const wchar_t *begin, + const wchar_t *end, + Py_UCS4 *maxchar, + Py_ssize_t *num_surrogates + ); +#endif + /* Create a Unicode Object from the Py_UNICODE buffer u of the given size. @@ -452,13 +492,15 @@ ); /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated - UTF-8 encoded bytes */ + UTF-8 encoded bytes. The size is determined with strlen(). */ PyAPI_FUNC(PyObject*) PyUnicode_FromString( const char *u /* UTF-8 encoded string */ ); /* Return a read-only pointer to the Unicode object's internal - Py_UNICODE buffer. */ + Py_UNICODE buffer. + If the wchar_t/Py_UNICODE representation is not yet available, this + function will calculate it. */ #ifndef Py_LIMITED_API PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( @@ -466,6 +508,18 @@ ); #endif +/* Return a read-only pointer to the Unicode object's internal + Py_UNICODE buffer and save the length at size. + If the wchar_t/Py_UNICODE representation is not yet available, this + function will calculate it. */ + +#ifndef Py_LIMITED_API +PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( + PyObject *unicode, /* Unicode object */ + Py_ssize_t *size /* location where to save the length */ + ); +#endif + /* Get the length of the Unicode object. 
*/ PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( @@ -563,7 +617,8 @@ #endif /* Use only if you know it's a string */ -#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) +#define PyUnicode_CHECK_INTERNED(op) \ + (SSTATE_INTERN_MASK & ((PyUnicodeObject *)(op))->state) /* --- wchar_t support for platforms which support it --------------------- */ @@ -658,13 +713,11 @@ /* Return a Python string holding the default encoded value of the Unicode object. - Same as PyUnicode_AsUTF8String() except - the resulting string is cached in the Unicode object for subsequent - usage by this function. The cached version is needed to implement - the character buffer interface and will live (at least) as long as - the Unicode object itself. + This function is exactly the same as PyUnicode_AsUTF8String(). - The refcount of the string is *not* incremented. + *** The behavior of this function changed with PEP 393, the resulting + string is no longer cached in the Unicode object and will leak memory + unless the caller decrefs it. *** *** Exported for internal use by the interpreter only !!! *** @@ -681,23 +734,36 @@ In case of an error, no *size is set. + This function caches the UTF-8 encoded string in the unicodeobject + and subsequent calls will return the same string. The memory is released + when the unicodeobject is deallocated. + + _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to + support the previous internal function with the same behaviour. + *** This API is for interpreter INTERNAL USE ONLY and will likely *** be removed or changed in the future. *** If you need to access the Unicode object as UTF-8 bytes string, *** please use PyUnicode_AsUTF8String() instead.
- */ #ifndef Py_LIMITED_API -PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize( +PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( PyObject *unicode, Py_ssize_t *size); +#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize #endif /* Returns a pointer to the default encoding (UTF-8) of the Unicode object unicode. + Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation + in the unicodeobject. + + _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to + support the previous internal function with the same behaviour. + Use of this API is DEPRECATED since no size information can be extracted from the returned data. @@ -710,7 +776,8 @@ */ #ifndef Py_LIMITED_API -PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode); +PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); +#define _PyUnicode_AsString PyUnicode_AsUTF8 #endif /* Returns "utf-8". */ @@ -1243,6 +1310,17 @@ ); #endif +/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject + as argument instead of a raw buffer and length. This function additionally + transforms spaces to ASCII because this is what the callers in longobject, + floatobject, and complexobject did anyways. 
*/ + +#ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalAndSpaceToASCII( + PyObject *unicode /* Unicode object */ + ); +#endif + /* --- File system encoding ---------------------------------------------- */ /* ParseTuple converter: encode str objects to bytes using diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -833,12 +833,14 @@ class newstyleclass(object): pass check(newstyleclass, s) # unicode - usize = len('\0'.encode('unicode-internal')) - samples = ['', '1'*100] + # each tuple contains a string and its expected character size + samples = [('', 1), ('1'*100, 1), ('\xff'*50, 1), + ('\u0100'*40, 2), ('\uffff'*100, 2), + ('\U00010000'*30, 4), ('\U0010ffff'*100, 4)] # we need to test for both sizes, because we don't know if the string # has been cached - for s in samples: - basicsize = size(h + 'PPPiP') + usize * (len(s) + 1) + for s, usize in samples: + basicsize = size(h + 'PPPiPPPP') + usize * (len(s) + 1) check(s, basicsize) # weakref import weakref diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1448,10 +1448,7 @@ from ctypes import (pythonapi, py_object, c_int, c_long, c_longlong, c_ssize_t, c_uint, c_ulong, c_ulonglong, c_size_t) - if sys.maxunicode == 65535: - name = "PyUnicodeUCS2_FromFormat" - else: - name = "PyUnicodeUCS4_FromFormat" + name = "PyUnicode_FromFormat" _PyUnicode_FromFormat = getattr(pythonapi, name) _PyUnicode_FromFormat.restype = py_object diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -700,12 +700,15 @@ return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; + if (PyUnicode_FAST_READY(str) == -1) { + /* str is our own reference from PyUnicode_FromObject(); drop it. */ + Py_DECREF(str); + return NULL; + } - v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v =
codec_tuple(PyUnicode_AsEncodedString(str, "utf-8", errors), + PyUnicode_GET_LENGTH(str)); Py_DECREF(str); return v; } diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c --- a/Modules/_ctypes/_ctypes.c +++ b/Modules/_ctypes/_ctypes.c @@ -1844,11 +1844,9 @@ return NULL; } if (PyUnicode_Check(proto)) { - PyObject *v = _PyUnicode_AsDefaultEncodedString(proto); - if (!v) + proto_str = PyUnicode_AsUTF8AndSize(proto, &proto_len); + if (!proto_str) goto error; - proto_str = PyBytes_AS_STRING(v); - proto_len = PyBytes_GET_SIZE(v); } else { PyErr_SetString(PyExc_TypeError, "class must define a '_type_' string attribute"); diff --git a/Modules/_dbmmodule.c b/Modules/_dbmmodule.c --- a/Modules/_dbmmodule.c +++ b/Modules/_dbmmodule.c @@ -219,8 +219,11 @@ return -1; } if (PyUnicode_Check(arg)) { - arg = _PyUnicode_AsDefaultEncodedString(arg); - if (arg == NULL) + /* The size out-parameter is a Py_ssize_t; dsize may not be. */ + Py_ssize_t size; + key.dptr = PyUnicode_AsUTF8AndSize(arg, &size); + if (key.dptr == NULL) return -1; + key.dsize = size; } - if (!PyBytes_Check(arg)) { + else if (!PyBytes_Check(arg)) { @@ -229,8 +232,10 @@ arg->ob_type->tp_name); return -1; } - key.dptr = PyBytes_AS_STRING(arg); - key.dsize = PyBytes_GET_SIZE(arg); + else { + key.dptr = PyBytes_AS_STRING(arg); + key.dsize = PyBytes_GET_SIZE(arg); + } val = dbm_fetch(dp->di_dbm, key); return val.dptr != NULL; } diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -396,22 +396,20 @@ PyObject *translated = NULL; Py_UNICODE *out_str; Py_UNICODE *in, *out, *end; - if (Py_REFCNT(output) != 1) { - /* We could try to optimize this so that we only do a copy - when there is something to translate. On the other hand, - most decoders should only output non-shared strings, i.e. - translation is done in place.
*/ - translated = PyUnicode_FromUnicode(NULL, len); - if (translated == NULL) - goto error; - assert(Py_REFCNT(translated) == 1); - memcpy(PyUnicode_AS_UNICODE(translated), - PyUnicode_AS_UNICODE(output), - len * sizeof(Py_UNICODE)); - } - else { - translated = output; - } + /* XXX: Previous in-place decoding here is disabled as it does not + work reliably with new flexible unicode strings when data is + accessed via PyUnicode_AS_UNICODE(). */ + /* We could try to optimize this so that we only do a copy + when there is something to translate. On the other hand, + most decoders should only output non-shared strings, i.e. + translation is done in place. */ + translated = PyUnicode_FromUnicode(NULL, len); + if (translated == NULL) + goto error; + assert(Py_REFCNT(translated) == 1); + memcpy(PyUnicode_AS_UNICODE(translated), + PyUnicode_AS_UNICODE(output), + len * sizeof(Py_UNICODE)); out_str = PyUnicode_AS_UNICODE(translated); in = in_str; out = out_str; @@ -777,9 +775,8 @@ static PyObject * utf8_encode(textio *self, PyObject *text) { - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), - PyBytes_AS_STRING(self->errors)); + return PyUnicode_AsEncodedString(text, "utf-8", + PyBytes_AS_STRING(self->errors)); } static PyObject * @@ -2559,10 +2556,10 @@ } } - if (line == NULL) + if (line == NULL || PyUnicode_FAST_READY(line) == -1) return NULL; - if (PyUnicode_GET_SIZE(line) == 0) { + if (PyUnicode_GET_LENGTH(line) == 0) { /* Reached EOF or would have blocked */ Py_DECREF(line); Py_CLEAR(self->snapshot); diff --git a/Modules/_pickle.c b/Modules/_pickle.c --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1863,9 +1863,7 @@ if (self->bin) { char pdata[5]; - encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj), - PyUnicode_GET_SIZE(obj), - "surrogatepass"); + encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass"); if (encoded == NULL) goto error; diff --git a/Modules/_sre.c b/Modules/_sre.c --- a/Modules/_sre.c +++ 
b/Modules/_sre.c @@ -1679,6 +1679,11 @@ /* Unicode objects do not support the buffer API. So, get the data directly instead. */ if (PyUnicode_Check(string)) { + // ptr = PyUnicode_DATA(string); + // *p_length = PyUnicode_GET_LENGTH(string); + // *p_charsize = PyUnicode_CHARACTER_SIZE(string); + if (PyUnicode_FAST_READY(string) == -1) + return NULL; ptr = (void *)PyUnicode_AS_DATA(string); *p_length = PyUnicode_GET_SIZE(string); *p_charsize = sizeof(Py_UNICODE); diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1102,17 +1102,22 @@ PyUnicodeObject *_u_string = NULL; int result = 0; int i; + int kind; + void *data; /* Yes, supports only 8bit encodings */ _u_string = (PyUnicodeObject *) PyUnicode_Decode(template_buffer, 256, name, "replace"); - if (_u_string == NULL) + if (_u_string == NULL || PyUnicode_FAST_READY(_u_string) == -1) return result; + kind = PyUnicode_KIND(_u_string); + data = PyUnicode_DATA(_u_string); + for (i = 0; i < 256; i++) { /* Stupid to access directly, but fast */ - Py_UNICODE c = _u_string->str[i]; + Py_UCS4 c = PyUnicode_READ(kind, data, i); if (c == Py_UNICODE_REPLACEMENT_CHARACTER) info->map[i] = -1; else diff --git a/Objects/abstract.c b/Objects/abstract.c --- a/Objects/abstract.c +++ b/Objects/abstract.c @@ -1379,9 +1379,7 @@ PyBytes_GET_SIZE(o)); if (PyUnicode_Check(o)) /* The above check is done in PyLong_FromUnicode(). 
*/ - return PyLong_FromUnicode(PyUnicode_AS_UNICODE(o), - PyUnicode_GET_SIZE(o), - 10); + return PyLong_FromUnicodeObject(o, 10); if (!PyObject_AsCharBuffer(o, &buffer, &buffer_len)) return long_from_string(buffer, buffer_len); diff --git a/Objects/codeobject.c b/Objects/codeobject.c --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -8,19 +8,24 @@ /* all_name_chars(s): true iff all chars in s are valid NAME_CHARS */ static int -all_name_chars(Py_UNICODE *s) +all_name_chars(PyObject *o) { static char ok_name_char[256]; static unsigned char *name_chars = (unsigned char *)NAME_CHARS; + PyUnicodeObject *u = (PyUnicodeObject *)o; + const unsigned char *s; + + if (!PyUnicode_Check(o) || PyUnicode_FAST_READY(u) == -1 || + PyUnicode_MAX_CHAR_VALUE(u) >= 128) + return 0; if (ok_name_char[*name_chars] == 0) { unsigned char *p; for (p = name_chars; *p; p++) ok_name_char[*p] = 1; } + s = PyUnicode_1BYTE_DATA(u); while (*s) { - if (*s >= 128) - return 0; if (ok_name_char[*s++] == 0) return 0; } @@ -77,9 +82,7 @@ /* Intern selected string constants */ for (i = PyTuple_GET_SIZE(consts); --i >= 0; ) { PyObject *v = PyTuple_GetItem(consts, i); - if (!PyUnicode_Check(v)) - continue; - if (!all_name_chars(PyUnicode_AS_UNICODE(v))) + if (!all_name_chars(v)) continue; PyUnicode_InternInPlace(&PyTuple_GET_ITEM(consts, i)); } diff --git a/Objects/complexobject.c b/Objects/complexobject.c --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -755,20 +755,10 @@ Py_ssize_t len; if (PyUnicode_Check(v)) { - Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v); - Py_UNICODE *bufptr; - s_buffer = PyUnicode_TransformDecimalToASCII( - PyUnicode_AS_UNICODE(v), buflen); + s_buffer = PyUnicode_TransformDecimalAndSpaceToASCII(v); if (s_buffer == NULL) return NULL; - /* Replace non-ASCII whitespace with ' ' */ - bufptr = PyUnicode_AS_UNICODE(s_buffer); - for (i = 0; i < buflen; i++) { - Py_UNICODE ch = bufptr[i]; - if (ch > 127 && Py_UNICODE_ISSPACE(ch)) - bufptr[i] = ' '; - } - s = 
_PyUnicode_AsStringAndSize(s_buffer, &len); + s = PyUnicode_AsUTF8AndSize(s_buffer, &len); if (s == NULL) goto error; } diff --git a/Objects/floatobject.c b/Objects/floatobject.c --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -174,20 +174,10 @@ PyObject *result = NULL; if (PyUnicode_Check(v)) { - Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v); - Py_UNICODE *bufptr; - s_buffer = PyUnicode_TransformDecimalToASCII( - PyUnicode_AS_UNICODE(v), buflen); + s_buffer = PyUnicode_TransformDecimalAndSpaceToASCII(v); if (s_buffer == NULL) return NULL; - /* Replace non-ASCII whitespace with ' ' */ - bufptr = PyUnicode_AS_UNICODE(s_buffer); - for (i = 0; i < buflen; i++) { - Py_UNICODE ch = bufptr[i]; - if (ch > 127 && Py_UNICODE_ISSPACE(ch)) - bufptr[i] = ' '; - } - s = _PyUnicode_AsStringAndSize(s_buffer, &len); + s = PyUnicode_AsUTF8AndSize(s_buffer, &len); if (s == NULL) { Py_DECREF(s_buffer); return NULL; diff --git a/Objects/longobject.c b/Objects/longobject.c --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2130,23 +2130,26 @@ PyObject * PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base) { + PyObject *v, *unicode = PyUnicode_FromUnicode(u, length); + if (unicode == NULL) + return NULL; + v = PyLong_FromUnicodeObject(unicode, base); + Py_DECREF(unicode); + return v; +} + +PyObject * +PyLong_FromUnicodeObject(PyObject *u, int base) +{ PyObject *result; PyObject *asciidig; char *buffer, *end; - Py_ssize_t i, buflen; - Py_UNICODE *ptr; - - asciidig = PyUnicode_TransformDecimalToASCII(u, length); + Py_ssize_t buflen; + + asciidig = PyUnicode_TransformDecimalAndSpaceToASCII(u); if (asciidig == NULL) return NULL; - /* Replace non-ASCII whitespace with ' ' */ - ptr = PyUnicode_AS_UNICODE(asciidig); - for (i = 0; i < length; i++) { - Py_UNICODE ch = ptr[i]; - if (ch > 127 && Py_UNICODE_ISSPACE(ch)) - ptr[i] = ' '; - } - buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen); + buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen); if (buffer == NULL) { 
Py_DECREF(asciidig); return NULL; @@ -4136,9 +4139,7 @@ } if (PyUnicode_Check(x)) - return PyLong_FromUnicode(PyUnicode_AS_UNICODE(x), - PyUnicode_GET_SIZE(x), - (int)base); + return PyLong_FromUnicodeObject(x, (int)base); else if (PyByteArray_Check(x) || PyBytes_Check(x)) { /* Since PyLong_FromString doesn't have a length parameter, * check here for possible NULs in the string. */ diff --git a/Objects/object.c b/Objects/object.c --- a/Objects/object.c +++ b/Objects/object.c @@ -295,9 +295,7 @@ } else if (PyUnicode_Check(s)) { PyObject *t; - t = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(s), - PyUnicode_GET_SIZE(s), - "backslashreplace"); + t = PyUnicode_AsEncodedString(s, "utf-8", "backslashreplace"); if (t == NULL) ret = 0; else { diff --git a/Objects/stringlib/eq.h b/Objects/stringlib/eq.h --- a/Objects/stringlib/eq.h +++ b/Objects/stringlib/eq.h @@ -9,13 +9,25 @@ register PyUnicodeObject *a = (PyUnicodeObject *)aa; register PyUnicodeObject *b = (PyUnicodeObject *)bb; + if (PyUnicode_FAST_READY(a) == -1 || PyUnicode_FAST_READY(b) == -1) { + assert(0 && "unicode_eq ready fail"); + return 0; + } + if (a->length != b->length) return 0; if (a->length == 0) return 1; - if (a->str[0] != b->str[0]) + if ((SSTATE_KIND_MASK & a->state) != (SSTATE_KIND_MASK & b->state)) return 0; - if (a->length == 1) + /* Just comparing the first byte is enough to see if a and b differ. + * If they are 2 byte or 4 byte character most differences will happen in + * the lower bytes anyways. 
+ */ + if (PyUnicode_1BYTE_DATA(a)[0] != PyUnicode_1BYTE_DATA(b)[0]) + return 0; + if (PyUnicode_KIND(a) == PyUnicode_1BYTE_KIND && a->length == 1) return 1; - return memcmp(a->str, b->str, a->length * sizeof(Py_UNICODE)) == 0; + return memcmp(a->str, b->str, + a->length * PyUnicode_CHARACTER_SIZE(a)) == 0; } diff --git a/Objects/typeobject.c b/Objects/typeobject.c --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -23,7 +23,8 @@ ((PyUnicodeObject *)(name))->hash) #define MCACHE_CACHEABLE_NAME(name) \ PyUnicode_CheckExact(name) && \ - PyUnicode_GET_SIZE(name) <= MCACHE_MAX_ATTR_SIZE + PyUnicode_FAST_READY(name) != -1 && \ + PyUnicode_GET_LENGTH(name) <= MCACHE_MAX_ATTR_SIZE struct method_cache_entry { unsigned int version; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -100,10 +100,6 @@ */ static PyObject *interned; -/* Free list for Unicode objects */ -static PyUnicodeObject *free_list; -static int numfree; - /* The empty Unicode object is shared to improve performance. */ static PyUnicodeObject *unicode_empty; @@ -257,14 +253,29 @@ /* --- Unicode Object ----------------------------------------------------- */ +static PyObject * +substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len); + +static PyObject * +fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); + static int unicode_resize(register PyUnicodeObject *unicode, - Py_ssize_t length) + Py_ssize_t length) { void *oldstr; + /* Resizing is only supported for old unicode objects. */ + assert(!PyUnicode_IS_COMPACT(unicode)); + assert(unicode->wstr != NULL); + assert(unicode->str == NULL); + + /* ... and only if they have not been readied yet, because + callees usually rely on the wstr representation when resizing. */ + assert(unicode->str == NULL); + /* Shortcut if there's nothing much to do. 
*/ - if (unicode->length == length) + if (unicode->wstr_length == length) goto reset; /* Resizing shared object (unicode_empty or single character @@ -272,9 +283,9 @@ instead ! */ if (unicode == unicode_empty || - (unicode->length == 1 && - unicode->str[0] < 256U && - unicode_latin1[unicode->str[0]] == unicode)) { + (unicode->wstr_length == 1 && + unicode->wstr[0] < 256U && + unicode_latin1[unicode->wstr[0]] == unicode)) { PyErr_SetString(PyExc_SystemError, "can't resize shared str objects"); return -1; @@ -285,21 +296,28 @@ safe to look at str[length] (without making any assumptions about what it contains). */ - oldstr = unicode->str; - unicode->str = PyObject_REALLOC(unicode->str, - sizeof(Py_UNICODE) * (length + 1)); - if (!unicode->str) { - unicode->str = (Py_UNICODE *)oldstr; + oldstr = unicode->wstr; + unicode->wstr = PyObject_REALLOC(unicode->wstr, + sizeof(Py_UNICODE) * (length + 1)); + if (!unicode->wstr) { + unicode->wstr = (Py_UNICODE *)oldstr; PyErr_NoMemory(); return -1; } - unicode->str[length] = 0; - unicode->length = length; + unicode->wstr[length] = 0; + unicode->wstr_length = length; reset: - /* Reset the object caches */ - if (unicode->defenc) { - Py_CLEAR(unicode->defenc); + if (unicode->str != NULL) { + PyObject_FREE(unicode->str); + if (unicode->utf8 && unicode->utf8 != unicode->str) { + PyObject_FREE(unicode->utf8); + } + unicode->utf8 = NULL; + unicode->utf8_length = 0; + unicode->str = NULL; + unicode->length = 0; + unicode->state = unicode->state & ~SSTATE_KIND_MASK; } unicode->hash = -1; @@ -315,10 +333,15 @@ */ +#ifdef Py_DEBUG +int unicode_old_new_calls = 0; +#endif + static PyUnicodeObject * _PyUnicode_New(Py_ssize_t length) { register PyUnicodeObject *unicode; + size_t new_size; /* Optimization for empty strings */ if (length == 0 && unicode_empty != NULL) { @@ -330,40 +353,26 @@ if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { return (PyUnicodeObject *)PyErr_NoMemory(); } - - /* Unicode freelist & memory allocation */ - if 
(free_list) { - unicode = free_list; - free_list = *(PyUnicodeObject **)unicode; - numfree--; - if (unicode->str) { - /* Keep-Alive optimization: we only upsize the buffer, - never downsize it. */ - if ((unicode->length < length) && - unicode_resize(unicode, length) < 0) { - PyObject_DEL(unicode->str); - unicode->str = NULL; - } - } - else { - size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); - unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); - } - PyObject_INIT(unicode, &PyUnicode_Type); - } - else { - size_t new_size; - unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); - if (unicode == NULL) - return NULL; - new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); - unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); - } - - if (!unicode->str) { + if (length < 0) { + PyErr_SetString(PyExc_SystemError, + "Negative size passed to _PyUnicode_New"); + return NULL; + } + +#ifdef Py_DEBUG + ++unicode_old_new_calls; +#endif + + unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); + if (unicode == NULL) + return NULL; + new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); + unicode->wstr = (Py_UNICODE*) PyObject_MALLOC(new_size); + if (!unicode->wstr) { PyErr_NoMemory(); goto onError; } + /* Initialize the first element to guard against cases where * the caller fails before initializing str -- unicode_resize() * reads str[0], and the Keep-Alive optimization can keep memory @@ -371,12 +380,15 @@ * We don't want unicode_resize to read uninitialized memory in * that case. 
*/ - unicode->str[0] = 0; - unicode->str[length] = 0; - unicode->length = length; + unicode->wstr[0] = 0; + unicode->wstr[length] = 0; + unicode->wstr_length = length; unicode->hash = -1; unicode->state = 0; - unicode->defenc = NULL; + unicode->str = NULL; + unicode->length = 0; + unicode->utf8 = NULL; + unicode->utf8_length = 0; return unicode; onError: @@ -387,6 +399,474 @@ return NULL; } +#ifdef Py_DEBUG +int unicode_new_new_calls = 0; +#endif + +PyObject * +PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) +{ + PyObject *obj; + PyUnicodeObject *unicode; + int kind; + int is_sharing = 0; + Py_ssize_t buffer_size; + Py_ssize_t struct_size; + + /* Optimization for empty strings */ + if (size == 0 && unicode_empty != NULL) { + Py_INCREF(unicode_empty); + return (PyObject *)unicode_empty; + } + + /* Ensure we won't overflow the size. */ + if (size > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { + return PyErr_NoMemory(); + } + if (size < 0) { + PyErr_SetString(PyExc_SystemError, + "Negative size passed to PyUnicode_New"); + return NULL; + } + +#ifdef Py_DEBUG + ++unicode_new_new_calls; +#endif + + if (maxchar < 128) { + kind = SSTATE_KIND_LATIN1; + buffer_size = size + 1; + is_sharing = 1; + } + else if (maxchar < 256) { + kind = SSTATE_KIND_LATIN1; + buffer_size = size + 1; + } + else if (maxchar < 65536) { + kind = SSTATE_KIND_UCS2; + buffer_size = 2 * (size + 1); + if (sizeof(wchar_t) == 2) + is_sharing = 1; + } + else { + kind = SSTATE_KIND_UCS4; + buffer_size = 4 * (size + 1); + if (sizeof(wchar_t) == 4) + is_sharing = 1; + } + + /* Duplicated allocation code from _PyObject_New() instead of a call to + * PyObject_New() so we are able to allocate space for the object and + * it's data buffer. 
+ */ + struct_size = _PyObject_SIZE(&PyUnicode_Type); + obj = (PyObject *) PyObject_MALLOC(struct_size + buffer_size); + if (obj == NULL) + return PyErr_NoMemory(); + obj = PyObject_INIT(obj, &PyUnicode_Type); + if (obj == NULL) + return NULL; + + unicode = (PyUnicodeObject *)obj; + unicode->length = size; + unicode->str = (void *)(((char *)obj) + struct_size); + unicode->hash = -1; + unicode->state = SSTATE_IS_COMPACT | kind; + + if (kind == SSTATE_KIND_LATIN1) { + unicode->wstr_length = 0; + unicode->wstr = NULL; + PyUnicode_1BYTE_DATA(unicode)[0] = 0; + PyUnicode_1BYTE_DATA(unicode)[size] = 0; + if (is_sharing) { + unicode->utf8_length = size; + unicode->utf8 = (char *)unicode->str; + } + else { + unicode->utf8_length = 0; + unicode->utf8 = NULL; + } + } + else { + unicode->utf8 = NULL; + if (kind == SSTATE_KIND_UCS2) { + PyUnicode_2BYTE_DATA(unicode)[0] = 0; + PyUnicode_2BYTE_DATA(unicode)[size] = 0; + } + else { /* kind == SSTATE_KIND_UCS4 */ + PyUnicode_4BYTE_DATA(unicode)[0] = 0; + PyUnicode_4BYTE_DATA(unicode)[size] = 0; + } + if (is_sharing) { + unicode->wstr_length = size; + unicode->wstr = (wchar_t *)unicode->str; + } + else { + unicode->wstr_length = 0; + unicode->wstr = NULL; + } + } + return obj; +} + +/* Helper function to convert a wchar_t representation to UCS4, this will + decode surrogate pairs, the other conversions are implemented as macros + for efficiency. + This function assumes that unicode can hold one more code point than wstr + characters for a terminating null. 
*/ +int +unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, + PyUnicodeObject *unicode) +{ + assert(unicode && PyUnicode_Check(unicode)); + assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); + + const wchar_t *iter; + wchar_t surrogate = 0; + Py_UCS4 *ucs4_out = PyUnicode_4BYTE_DATA(unicode); + + for (iter = begin; iter < end; ++iter) { + assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + + PyUnicode_GET_LENGTH(unicode))); + if (surrogate) { + /* expecting a second surrogate */ + if (*iter >= 0xDC00 && *iter <= 0xDFFF) { + *ucs4_out++ = (((surrogate & 0x3FF)<<10) + | (*iter & 0x3FF)) + 0x10000; + } + else { + /* did not find a valid second surrogate, thus just writing + the first... */ + *ucs4_out++ = surrogate; + *ucs4_out++ = *iter; + } + surrogate = 0; + } + else if (*iter >= 0xD800 && *iter <= 0xDBFF) { + /* first surrogate */ + surrogate = *iter; + } + else { + /* unexpected second surrogates are transparently written out + here. */ + *ucs4_out++ = *iter; + } + } + if (surrogate != 0) { + /* caught a single first surrogate at the end of the string. 
*/ + assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + + PyUnicode_GET_LENGTH(unicode) - 1)); + *ucs4_out++ = surrogate; + } + assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + + PyUnicode_GET_LENGTH(unicode))); + + return 0; +} + +void +PyUnicode_CopyCharacters(PyUnicodeObject *to, Py_ssize_t to_start, + const PyUnicodeObject *from, Py_ssize_t from_start, + Py_ssize_t how_many) +{ + const int from_kind = PyUnicode_KIND(from); + const int to_kind = PyUnicode_KIND(to); + + assert(PyUnicode_Check(from)); + assert(PyUnicode_Check(to)); + + if (from_kind == to_kind) { + const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(to); + Py_MEMCPY(PyUnicode_1BYTE_DATA(to) + (to_start * char_size), + PyUnicode_1BYTE_DATA(from) + (from_start * char_size), + how_many * char_size); + } + else { + switch (from_kind) { + case PyUnicode_1BYTE_KIND: + switch (to_kind) { + case PyUnicode_2BYTE_KIND: + PyUnicode_CONVERT_BYTES( + unsigned char, Py_UCS2, + PyUnicode_1BYTE_DATA(from) + from_start, + PyUnicode_1BYTE_DATA(from) + from_start + how_many, + PyUnicode_2BYTE_DATA(to) + to_start + ); + break; + case PyUnicode_4BYTE_KIND: + PyUnicode_CONVERT_BYTES( + unsigned char, Py_UCS4, + PyUnicode_1BYTE_DATA(from) + from_start, + PyUnicode_1BYTE_DATA(from) + from_start + how_many, + PyUnicode_4BYTE_DATA(to) + to_start + ); + break; + case PyUnicode_1BYTE_KIND: + default: + assert(0 && "Impossible switch state 1 to 1 in " + "PyUnicode_CopyCharacters."); + break; + } + break; + case PyUnicode_2BYTE_KIND: + switch (to_kind) { + case PyUnicode_1BYTE_KIND: + PyUnicode_CONVERT_BYTES( + Py_UCS2, unsigned char, + PyUnicode_2BYTE_DATA(from) + from_start, + PyUnicode_2BYTE_DATA(from) + from_start + how_many, + PyUnicode_1BYTE_DATA(to) + to_start + ); + break; + case PyUnicode_4BYTE_KIND: + PyUnicode_CONVERT_BYTES( + Py_UCS2, Py_UCS4, + PyUnicode_2BYTE_DATA(from) + from_start, + PyUnicode_2BYTE_DATA(from) + from_start + how_many, + PyUnicode_4BYTE_DATA(to) + to_start + ); + break; + case 
PyUnicode_2BYTE_KIND: + default: + assert(0 && "Impossible switch state 2 to 2 in " + "PyUnicode_CopyCharacters."); + break; + } + break; + case PyUnicode_4BYTE_KIND: + switch (to_kind) { + case PyUnicode_1BYTE_KIND: + PyUnicode_CONVERT_BYTES( + Py_UCS4, unsigned char, + PyUnicode_4BYTE_DATA(from) + from_start, + PyUnicode_4BYTE_DATA(from) + from_start + how_many, + PyUnicode_1BYTE_DATA(to) + to_start + ); + break; + case PyUnicode_2BYTE_KIND: + PyUnicode_CONVERT_BYTES( + Py_UCS4, Py_UCS2, + PyUnicode_4BYTE_DATA(from) + from_start, + PyUnicode_4BYTE_DATA(from) + from_start + how_many, + PyUnicode_2BYTE_DATA(to) + to_start + ); + break; + case PyUnicode_4BYTE_KIND: + default: + assert(0 && "Impossible switch state 4 to 4 in " + "PyUnicode_CopyCharacters."); + break; + } + break; + default: + assert(0 && "Unknown string kind in PyUnicode_KIND"); + } + } +} + +int +PyUnicode_FindMaxCharAndNumSurrogatePairs(const wchar_t *begin, + const wchar_t *end, + Py_UCS4 *maxchar, + Py_ssize_t *num_surrogates) +{ + const wchar_t *iter; + wchar_t surrogate = 0; + Py_UCS4 surrogate_val; + + if (num_surrogates == NULL || maxchar == NULL) { + PyErr_SetString(PyExc_SystemError, + "unexpected NULL arguments to " + "PyUnicode_FindMaxCharAndNumSurrogatePairs"); + return -1; + } + + *num_surrogates = 0; + *maxchar = 0; + + for (iter = begin; iter < end; ++iter) { + if (*iter > *maxchar) + *maxchar = *iter; + + if (surrogate) { + /* expecting a second surrogate, if the next character is in the + proper range, reconstruct surrogate pair, increase maxchar + and count up surrogate. If no follow up surrogate came, + the above if ensured already that maxchar was updated. 
*/ + if (*iter >= 0xDC00 && *iter <= 0xDFFF) { + surrogate_val = (((surrogate & 0x3FF)<<10) + | (*iter & 0x3FF)) + 0x10000; + ++(*num_surrogates); + if (surrogate_val > *maxchar) + *maxchar = surrogate_val; + } + surrogate = 0; + } + else if (*iter >= 0xD800 && *iter <= 0xDBFF) { + /* first surrogate */ + surrogate = *iter; + } + } + return 0; +} + +#ifdef Py_DEBUG +int unicode_ready_calls = 0; +#endif + +int +PyUnicode_Ready(PyUnicodeObject *unicode) +{ + wchar_t *end; + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; + Py_ssize_t length_wo_surrogates; + + assert(PyUnicode_Check(unicode)); + + if (unicode->str == NULL) { + /* PyUnicode_Ready() is only intended for old-style API usage where + * strings were created using _PyObject_New() and where no canonical + * representation (the str field) has been set yet aka strings + * which are not yet ready. + */ + assert(unicode->wstr != NULL); + assert((unicode->state & SSTATE_KIND_MASK) == SSTATE_KIND_NOT_READY); + assert(!PyUnicode_IS_COMPACT(unicode)); + /* Actually, it should neither be interned nor be anything else: */ + assert(unicode->state == 0); + assert(unicode->utf8 == NULL); + +#ifdef Py_DEBUG + ++unicode_ready_calls; +#endif + + end = unicode->wstr + unicode->wstr_length; + if (PyUnicode_FindMaxCharAndNumSurrogatePairs(unicode->wstr, end, + &maxchar, + &num_surrogates) == -1) { + assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed"); + return -1; + } + + if (maxchar < 256) { + unicode->str = PyObject_MALLOC(unicode->wstr_length + 1); + if (!unicode->str) { + PyErr_NoMemory(); + return -1; + } + PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, + unicode->wstr, end, + PyUnicode_1BYTE_DATA(unicode)); + PyUnicode_1BYTE_DATA(unicode)[unicode->wstr_length] = '\0'; + unicode->length = unicode->wstr_length; + unicode->state = SSTATE_KIND_LATIN1; + if (maxchar < 128) { + unicode->utf8 = unicode->str; + unicode->utf8_length = unicode->wstr_length; + } + else { + unicode->utf8 = NULL; + unicode->utf8_length = 
0; + } + goto free_wstr; + } + /* In this case we might have to convert down from 4-byte native + wchar_t to 2-byte unicode. */ + else if (maxchar < 65536) { + assert(num_surrogates == 0 && + "FindMaxCharAndNumSurrogatePairs() messed up"); + + if (sizeof(wchar_t) == 2) { + /* We can share representations and are done. */ + unicode->str = unicode->wstr; + PyUnicode_2BYTE_DATA(unicode)[unicode->wstr_length] = '\0'; + unicode->length = unicode->wstr_length; + unicode->state = SSTATE_KIND_UCS2; + unicode->utf8 = NULL; + unicode->utf8_length = 0; + return 0; + } + else { + assert(sizeof(wchar_t) == 4); + + unicode->str = PyObject_MALLOC(2 * (unicode->wstr_length + 1)); + if (!unicode->str) { + PyErr_NoMemory(); + return -1; + } + PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, + unicode->wstr, end, + PyUnicode_2BYTE_DATA(unicode)); + PyUnicode_2BYTE_DATA(unicode)[unicode->wstr_length] = '\0'; + unicode->length = unicode->wstr_length; + unicode->state = SSTATE_KIND_UCS2; + unicode->utf8 = NULL; + unicode->utf8_length = 0; + goto free_wstr; + } + } + /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */ + else { + if (sizeof(wchar_t) == 2 || num_surrogates > 0) { + /* in case the native representation is 2-byte or in case there + were surrogates, we need to allocate a new normalized + 4-byte version. 
*/ + length_wo_surrogates = unicode->wstr_length - num_surrogates; + unicode->str = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); + if (!unicode->str) { + PyErr_NoMemory(); + return -1; + } + unicode->length = length_wo_surrogates; + unicode->state = SSTATE_KIND_UCS4; + unicode->utf8 = NULL; + unicode->utf8_length = 0; + if (unicode_convert_wchar_to_ucs4(unicode->wstr, end, + unicode) < 0) { + assert(0 && "ConvertWideCharToUCS4 failed"); + return -1; + } + PyObject_FREE(unicode->wstr); + if (sizeof(wchar_t) == 2) { + unicode->wstr = NULL; + unicode->wstr_length = 0; + } + else { + assert(sizeof(wchar_t) == 4); + unicode->wstr = unicode->str; + unicode->wstr_length = unicode->length; + } + } + else { + assert(sizeof(wchar_t) == 4); + assert(num_surrogates == 0); + + unicode->str = unicode->wstr; + unicode->length = unicode->wstr_length; + unicode->utf8 = NULL; + unicode->utf8_length = 0; + unicode->state = SSTATE_KIND_UCS4; + } + PyUnicode_4BYTE_DATA(unicode)[unicode->length] = '\0'; + return 0; + } + } + else { + /* if unicode->str is not NULL */ + assert((unicode->state & SSTATE_KIND_MASK) != SSTATE_KIND_NOT_READY); + return 0; + } + assert(0 && "This should be unreachable."); + return 0; + + free_wstr: + PyObject_FREE(unicode->wstr); + unicode->wstr = NULL; + unicode->wstr_length = 0; + return 0; +} + static void unicode_dealloc(register PyUnicodeObject *unicode) { @@ -409,25 +889,17 @@ Py_FatalError("Inconsistent interned string state."); } - if (PyUnicode_CheckExact(unicode) && - numfree < PyUnicode_MAXFREELIST) { - /* Keep-Alive optimization */ - if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { + if (unicode->wstr && unicode->wstr != unicode->str) + PyObject_DEL(unicode->wstr); + if (unicode->utf8 && unicode->utf8 != unicode->str) + PyObject_DEL(unicode->utf8); + + if (PyUnicode_IS_COMPACT(unicode)) { + Py_TYPE(unicode)->tp_free((PyObject *)unicode); + } + else { + if (unicode->str) PyObject_DEL(unicode->str); - unicode->str = NULL; - unicode->length = 0; - } 
- if (unicode->defenc) { - Py_CLEAR(unicode->defenc); - } - /* Add to free list */ - *(PyUnicodeObject **)unicode = free_list; - free_list = unicode; - numfree++; - } - else { - PyObject_DEL(unicode->str); - Py_XDECREF(unicode->defenc); Py_TYPE(unicode)->tp_free((PyObject *)unicode); } } @@ -443,21 +915,26 @@ return -1; } v = *unicode; - if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { + if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 || + v->wstr == NULL) { PyErr_BadInternalCall(); return -1; } /* Resizing unicode_empty and single character objects is not - possible since these are being shared. We simply return a fresh - copy with the same Unicode content. */ - if (v->length != length && - (v == unicode_empty || v->length == 1)) { + possible since these are being shared. + The same goes for new-representation unicode objects or objects which + have already been readied. + For these, we simply return a fresh copy with the same Unicode content. + */ + if ((v->wstr_length != length && + (v == unicode_empty || v->wstr_length == 1)) || + PyUnicode_IS_COMPACT(v) || v->str) { PyUnicodeObject *w = _PyUnicode_New(length); if (w == NULL) return -1; - Py_UNICODE_COPY(w->str, v->str, - length < v->length ? length : v->length); + Py_UNICODE_COPY(w->wstr, v->wstr, + length < v->wstr_length ? length : v->wstr_length); Py_DECREF(*unicode); *unicode = w; return 0; @@ -478,6 +955,8 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) { PyUnicodeObject *unicode; + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; /* If the Unicode data is known at construction time, we can apply some optimizations which share commonly used objects. 
*/ @@ -494,25 +973,56 @@ if (size == 1 && *u < 256) { unicode = unicode_latin1[*u]; if (!unicode) { - unicode = _PyUnicode_New(1); + unicode = (PyUnicodeObject *) PyUnicode_New(1, *u); if (!unicode) return NULL; - unicode->str[0] = *u; + PyUnicode_1BYTE_DATA(unicode)[0] = *u; unicode_latin1[*u] = unicode; } Py_INCREF(unicode); return (PyObject *)unicode; } + + /* If not empty and not single character, copy the Unicode data + into the new object */ + + if (PyUnicode_FindMaxCharAndNumSurrogatePairs(u, u + size, &maxchar, + &num_surrogates) == -1) + return NULL; + + unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, + maxchar); + if (!unicode) + return NULL; + + switch (PyUnicode_KIND(unicode)) { + case PyUnicode_1BYTE_KIND: + PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, + u, u + size, PyUnicode_1BYTE_DATA(unicode)); + break; + case PyUnicode_2BYTE_KIND: + PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, + u, u + size, PyUnicode_2BYTE_DATA(unicode)); + break; + case PyUnicode_4BYTE_KIND: + /* This is the only case which has to process surrogates, thus + a simple copy loop is not enough and we need a function. 
*/ + if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) { + Py_DECREF(unicode); + return NULL; + } + break; + default: + assert(0 && "Impossible state"); + } + + return (PyObject *)unicode; } unicode = _PyUnicode_New(size); if (!unicode) return NULL; - /* Copy the Unicode data into the new object */ - if (u != NULL) - Py_UNICODE_COPY(unicode->str, u, size); - return (PyObject *)unicode; } @@ -544,10 +1054,10 @@ if (size == 1 && Py_CHARMASK(*u) < 128) { unicode = unicode_latin1[Py_CHARMASK(*u)]; if (!unicode) { - unicode = _PyUnicode_New(1); + unicode = (PyUnicodeObject *)PyUnicode_New(1, Py_CHARMASK(*u)); if (!unicode) return NULL; - unicode->str[0] = Py_CHARMASK(*u); + PyUnicode_1BYTE_DATA(unicode)[0] = Py_CHARMASK(*u); unicode_latin1[Py_CHARMASK(*u)] = unicode; } Py_INCREF(unicode); @@ -641,8 +1151,6 @@ PyObject * PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) { - PyUnicodeObject *unicode; - if (w == NULL) { if (size == 0) return PyUnicode_FromStringAndSize(NULL, 0); @@ -654,24 +1162,7 @@ size = wcslen(w); } - unicode = _PyUnicode_New(size); - if (!unicode) - return NULL; - - /* Copy the wchar_t data into the new object */ -#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T - memcpy(unicode->str, w, size * sizeof(wchar_t)); -#else - { - register Py_UNICODE *u; - register Py_ssize_t i; - u = PyUnicode_AS_UNICODE(unicode); - for (i = size; i > 0; i--) - *u++ = *w++; - } -#endif - - return (PyObject *)unicode; + return PyUnicode_FromUnicode(w, size); } #endif /* CONVERT_WCHAR_TO_SURROGATES */ @@ -779,10 +1270,6 @@ return f; } -#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} - -/* size of fixed-size buffer for formatting single arguments */ -#define ITEM_BUFFER_LEN 21 /* maximum number of characters required for output of %ld. 21 characters allows for 64-bit integers (in decimal) and an optional sign. */ #define MAX_LONG_CHARS 21 @@ -791,6 +1278,8 @@ plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) +#define MAX(a, b) (((a) >= (b)) ? (a) : (b)) + PyObject * PyUnicode_FromFormatV(const char *format, va_list vargs) { @@ -803,80 +1292,35 @@ int precision = 0; int zeropad; const char* f; - Py_UNICODE *s; - PyObject *string; + PyUnicodeObject *string; /* used by sprintf */ - char buffer[ITEM_BUFFER_LEN+1]; - /* use abuffer instead of buffer, if we need more space - * (which can happen if there's a format specifier with width). */ - char *abuffer = NULL; - char *realbuffer; - Py_ssize_t abuffersize = 0; char fmt[61]; /* should be enough for %0width.precisionlld */ - const char *copy; + Py_UCS4 maxchar = 127; /* result is ASCII by default */ + Py_UCS4 argmaxchar; + Py_ssize_t numbersize = 0; + char *numberresults = NULL; + char *numberresult = NULL; + Py_ssize_t i; + int kind; + void *data; Py_VA_COPY(count, vargs); /* step 1: count the number of %S/%R/%A/%s format specifications * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the - * result in an array) */ - for (f = format; *f; f++) { - if (*f == '%') { - /* skip width or width.precision (eg. 
"1.2" of "%1.2f") */ - f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); - if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') - ++callcount; - } - else if (128 <= (unsigned char)*f) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*f); - return NULL; - } - } - /* step 2: allocate memory for the results of - * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ - if (callcount) { - callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); - if (!callresults) { - PyErr_NoMemory(); - return NULL; - } - callresult = callresults; - } - /* step 3: figure out how large a buffer we need */ + * result in an array) + * also esimate a upper bound for all the number formats in the string, + * numbers will be formated in step 3 and be keept in a '\0'-separated + * buffer before putting everything together. */ for (f = format; *f; f++) { if (*f == '%') { -#ifdef HAVE_LONG_LONG int longlongflag; -#endif - const char* p; - - p = f; - f = parse_format_flags(f, &width, NULL, - NULL, &longlongflag, NULL); - - switch (*f) { - case 'c': - { -#ifndef Py_UNICODE_WIDE - int ordinal = va_arg(count, int); - if (ordinal > 0xffff) - n += 2; - else - n++; -#else - (void)va_arg(count, int); - n++; -#endif - break; - } - case '%': - n++; - break; - case 'd': case 'u': case 'i': case 'x': - (void) va_arg(count, int); + /* skip width or width.precision (eg. "1.2" of "%1.2f") */ + f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); + if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') + ++callcount; + + else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { #ifdef HAVE_LONG_LONG if (longlongflag) { if (width < MAX_LONG_LONG_CHARS) @@ -890,10 +1334,142 @@ need more (which we allocate later). */ if (width < MAX_LONG_CHARS) width = MAX_LONG_CHARS; - n += width; - /* XXX should allow for large precision here too. 
*/ - if (abuffersize < width) - abuffersize = width; + + /* account for the size + '\0' to separate numbers + inside of the numberresults buffer */ + numbersize += (width + 1); + } + } + else if ((unsigned char)*f > 127) { + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)*f); + return NULL; + } + } + /* step 2: allocate memory for the results of + * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ + if (callcount) { + callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); + if (!callresults) { + PyErr_NoMemory(); + return NULL; + } + callresult = callresults; + } + /* step 2.5: allocate memory for the results of formating numbers */ + if (numbersize) { + numberresults = PyObject_Malloc(numbersize); + if (!numberresults) { + PyErr_NoMemory(); + goto fail; + } + numberresult = numberresults; + } + + /* step 3: format numbers and figure out how large a buffer we need */ + for (f = format; *f; f++) { + if (*f == '%') { + const char* p; + int longflag; + int longlongflag; + int size_tflag; + int numprinted; + + p = f; + zeropad = (f[1] == '0'); + f = parse_format_flags(f, &width, &precision, + &longflag, &longlongflag, &size_tflag); + switch (*f) { + case 'c': + { + int ordinal = va_arg(count, int); + maxchar = MAX(maxchar, ordinal); + n++; + break; + } + case '%': + n++; + break; + case 'i': + case 'd': + makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, + width, precision, *f); + if (longflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, PY_LONG_LONG)); +#endif + else if (size_tflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, Py_ssize_t)); + else + numprinted = sprintf(numberresult, fmt, + va_arg(count, int)); + n += numprinted; + /* advance by +1 to skip over the '\0' */ + numberresult += 
(numprinted + 1); + assert(*(numberresult - 1) == '\0'); + assert(*(numberresult - 2) != '\0'); + assert(numprinted >= 0); + assert(numberresult <= numberresults + numbersize); + break; + case 'u': + makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, + width, precision, 'u'); + if (longflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, unsigned long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, unsigned PY_LONG_LONG)); +#endif + else if (size_tflag) + numprinted = sprintf(numberresult, fmt, + va_arg(count, size_t)); + else + numprinted = sprintf(numberresult, fmt, + va_arg(count, unsigned int)); + n += numprinted; + numberresult += (numprinted + 1); + assert(*(numberresult - 1) == '\0'); + assert(*(numberresult - 2) != '\0'); + assert(numprinted >= 0); + assert(numberresult <= numberresults + numbersize); + break; + case 'x': + makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); + numprinted = sprintf(numberresult, fmt, va_arg(count, int)); + n += numprinted; + numberresult += (numprinted + 1); + assert(*(numberresult - 1) == '\0'); + assert(*(numberresult - 2) != '\0'); + assert(numprinted >= 0); + assert(numberresult <= numberresults + numbersize); + break; + case 'p': + numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); + /* %p is ill-defined: ensure leading 0x. 
*/ + if (numberresult[1] == 'X') + numberresult[1] = 'x'; + else if (numberresult[1] != 'x') { + memmove(numberresult + 2, numberresult, + strlen(numberresult) + 1); + numberresult[0] = '0'; + numberresult[1] = 'x'; + numprinted += 2; + } + n += numprinted; + numberresult += (numprinted + 1); + assert(*(numberresult - 1) == '\0'); + assert(*(numberresult - 2) != '\0'); + assert(numprinted >= 0); + assert(numberresult <= numberresults + numbersize); break; case 's': { @@ -902,7 +1478,11 @@ PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); if (!str) goto fail; - n += PyUnicode_GET_SIZE(str); + /* since PyUnicode_DecodeUTF8 returns already flexible + unicode objects, there is no need to call ready on them */ + argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(str); /* Remember the str and switch to the next slot */ *callresult++ = str; break; @@ -911,7 +1491,11 @@ { PyObject *obj = va_arg(count, PyObject *); assert(obj && PyUnicode_Check(obj)); - n += PyUnicode_GET_SIZE(obj); + if (PyUnicode_FAST_READY(obj) == -1) + goto fail; + argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(obj); break; } case 'V': @@ -922,14 +1506,20 @@ assert(obj || str); assert(!obj || PyUnicode_Check(obj)); if (obj) { - n += PyUnicode_GET_SIZE(obj); + if (PyUnicode_FAST_READY(obj) == -1) + goto fail; + argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(obj); *callresult++ = NULL; } else { str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); if (!str_obj) goto fail; - n += PyUnicode_GET_SIZE(str_obj); + argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(str_obj); *callresult++ = str_obj; } break; @@ -940,9 +1530,11 @@ PyObject *str; assert(obj); str = PyObject_Str(obj); - if (!str) + if (!str || PyUnicode_FAST_READY(str) == -1) goto 
fail; - n += PyUnicode_GET_SIZE(str); + argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(str); /* Remember the str and switch to the next slot */ *callresult++ = str; break; @@ -953,9 +1545,11 @@ PyObject *repr; assert(obj); repr = PyObject_Repr(obj); - if (!repr) + if (!repr || PyUnicode_FAST_READY(repr) == -1) goto fail; - n += PyUnicode_GET_SIZE(repr); + argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(repr); /* Remember the repr and switch to the next slot */ *callresult++ = repr; break; @@ -966,22 +1560,15 @@ PyObject *ascii; assert(obj); ascii = PyObject_ASCII(obj); - if (!ascii) + if (!ascii || PyUnicode_FAST_READY(ascii) == -1) goto fail; - n += PyUnicode_GET_SIZE(ascii); + argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); + maxchar = MAX(maxchar, argmaxchar); + n += PyUnicode_GET_LENGTH(ascii); /* Remember the repr and switch to the next slot */ *callresult++ = ascii; break; } - case 'p': - (void) va_arg(count, int); - /* maximum 64-bit pointer representation: - * 0xffffffffffffffff - * so 19 characters is enough. - * XXX I count 18 -- what's the extra for? - */ - n += 19; - break; default: /* if we stumble upon an unknown formatting code, copy the rest of @@ -996,98 +1583,64 @@ n++; } expand: - if (abuffersize > ITEM_BUFFER_LEN) { - /* add 1 for sprintf's trailing null byte */ - abuffer = PyObject_Malloc(abuffersize + 1); - if (!abuffer) { - PyErr_NoMemory(); - goto fail; - } - realbuffer = abuffer; - } - else - realbuffer = buffer; /* step 4: fill the buffer */ - /* Since we've analyzed how much space we need for the worst case, + /* Since we've analyzed how much space we need, we don't have to resize the string. There can be no errors beyond this point. 
*/ - string = PyUnicode_FromUnicode(NULL, n); + string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); if (!string) goto fail; - - s = PyUnicode_AS_UNICODE(string); + kind = PyUnicode_KIND(string); + data = PyUnicode_DATA(string); callresult = callresults; - - for (f = format; *f; f++) { + numberresult = numberresults; + + for (i = 0, f = format; *f; f++) { if (*f == '%') { const char* p; - int longflag; - int longlongflag; - int size_tflag; p = f; - zeropad = (f[1] == '0'); - f = parse_format_flags(f, &width, &precision, - &longflag, &longlongflag, &size_tflag); + f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); + /* checking for == because the last argument could be a empty + string, which causes i to point to end, the assert at the end of + the loop */ + assert(i <= PyUnicode_GET_LENGTH(string)); switch (*f) { case 'c': { - int ordinal = va_arg(vargs, int); -#ifndef Py_UNICODE_WIDE - if (ordinal > 0xffff) { - ordinal -= 0x10000; - *s++ = 0xD800 | (ordinal >> 10); - *s++ = 0xDC00 | (ordinal & 0x3FF); - } else -#endif - *s++ = ordinal; + const int ordinal = va_arg(vargs, int); + PyUnicode_WRITE(kind, data, i++, ordinal); break; } case 'i': case 'd': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, *f); - if (longflag) - sprintf(realbuffer, fmt, va_arg(vargs, long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); -#endif - else if (size_tflag) - sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); + case 'u': + case 'x': + case 'p': + /* unused, since we already have the result */ + if (*f == 'p') + (void) va_arg(vargs, void *); else - sprintf(realbuffer, fmt, va_arg(vargs, int)); - appendstring(realbuffer); - break; - case 'u': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, 'u'); - if (longflag) - sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - sprintf(realbuffer, fmt, 
va_arg(vargs, - unsigned PY_LONG_LONG)); -#endif - else if (size_tflag) - sprintf(realbuffer, fmt, va_arg(vargs, size_t)); - else - sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); - appendstring(realbuffer); - break; - case 'x': - makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); - sprintf(realbuffer, fmt, va_arg(vargs, int)); - appendstring(realbuffer); + (void) va_arg(vargs, int); + /* extract the result from numberresults and append. */ + for (; *numberresult; ++i, ++numberresult) + PyUnicode_WRITE(kind, data, i, *numberresult); + /* skip over the separating '\0' */ + assert(*numberresult == '\0'); + numberresult++; + assert(numberresult <= numberresults + numbersize); break; case 's': { /* unused, since we already have the result */ (void) va_arg(vargs, char *); - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), - PyUnicode_GET_SIZE(*callresult)); - s += PyUnicode_GET_SIZE(*callresult); + const Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); + assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); + PyUnicode_CopyCharacters(string, i, + (PyUnicodeObject *)*callresult, 0, + size); + i += size; /* We're done with the unicode()/repr() => forget it */ Py_DECREF(*callresult); /* switch to next unicode()/repr() result */ @@ -1097,23 +1650,34 @@ case 'U': { PyObject *obj = va_arg(vargs, PyObject *); - Py_ssize_t size = PyUnicode_GET_SIZE(obj); - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); - s += size; + assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); + const Py_ssize_t size = PyUnicode_GET_LENGTH(obj); + PyUnicode_CopyCharacters(string, i, + (PyUnicodeObject *)obj, 0, + size); + i += size; break; } case 'V': { + Py_ssize_t size; PyObject *obj = va_arg(vargs, PyObject *); va_arg(vargs, const char *); if (obj) { - Py_ssize_t size = PyUnicode_GET_SIZE(obj); - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); - s += size; + size = PyUnicode_GET_LENGTH(obj); + assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); + 
PyUnicode_CopyCharacters(string, i, + (PyUnicodeObject *)obj, 0, + size); + i += size; } else { - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), - PyUnicode_GET_SIZE(*callresult)); - s += PyUnicode_GET_SIZE(*callresult); + size = PyUnicode_GET_LENGTH(*callresult); + assert(PyUnicode_KIND(*callresult) <= + PyUnicode_KIND(string)); + PyUnicode_CopyCharacters(string, i, + (PyUnicodeObject *)*callresult, + 0, size); + i += size; Py_DECREF(*callresult); } ++callresult; @@ -1123,52 +1687,42 @@ case 'R': case 'A': { - Py_UNICODE *ucopy; - Py_ssize_t usize; - Py_ssize_t upos; /* unused, since we already have the result */ (void) va_arg(vargs, PyObject *); - ucopy = PyUnicode_AS_UNICODE(*callresult); - usize = PyUnicode_GET_SIZE(*callresult); - for (upos = 0; upos forget it */ Py_DECREF(*callresult); /* switch to next unicode()/repr() result */ ++callresult; break; } - case 'p': - sprintf(buffer, "%p", va_arg(vargs, void*)); - /* %p is ill-defined: ensure leading 0x. */ - if (buffer[1] == 'X') - buffer[1] = 'x'; - else if (buffer[1] != 'x') { - memmove(buffer+2, buffer, strlen(buffer)+1); - buffer[0] = '0'; - buffer[1] = 'x'; - } - appendstring(buffer); - break; case '%': - *s++ = '%'; + PyUnicode_WRITE(kind, data, i++, '%'); break; default: - appendstring(p); + for (; *p; ++p, ++i) + PyUnicode_WRITE(kind, data, i, *p); + assert(i == PyUnicode_GET_LENGTH(string)); goto end; } } - else - *s++ = *f; - } + else { + assert(i < PyUnicode_GET_LENGTH(string)); + PyUnicode_WRITE(kind, data, i++, *f); + } + } + assert(i == PyUnicode_GET_LENGTH(string)); end: if (callresults) PyObject_Free(callresults); - if (abuffer) - PyObject_Free(abuffer); - PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); - return string; + if (numberresults) + PyObject_Free(numberresults); + return (PyObject *)string; fail: if (callresults) { PyObject **callresult2 = callresults; @@ -1178,12 +1732,12 @@ } PyObject_Free(callresults); } - if (abuffer) - PyObject_Free(abuffer); + if 
(numberresults) + PyObject_Free(numberresults); return NULL; } -#undef appendstring +#undef MAX PyObject * PyUnicode_FromFormat(const char *format, ...) @@ -1215,99 +1769,23 @@ wchar_t *w, Py_ssize_t size) { -#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T Py_ssize_t res; + wchar_t *wstr; + + wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); + if (wstr == NULL) + return -1; + if (w != NULL) { - res = PyUnicode_GET_SIZE(unicode); if (size > res) size = res + 1; else res = size; - memcpy(w, unicode->str, size * sizeof(wchar_t)); + Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); return res; } else - return PyUnicode_GET_SIZE(unicode) + 1; -#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 - register const Py_UNICODE *u; - const Py_UNICODE *uend; - const wchar_t *worig, *wend; - Py_ssize_t nchar; - - u = PyUnicode_AS_UNICODE(unicode); - uend = u + PyUnicode_GET_SIZE(unicode); - if (w != NULL) { - worig = w; - wend = w + size; - while (u != uend && w != wend) { - if (0xD800 <= u[0] && u[0] <= 0xDBFF - && 0xDC00 <= u[1] && u[1] <= 0xDFFF) - { - *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; - u += 2; - } - else { - *w = *u; - u++; - } - w++; - } - if (w != wend) - *w = L'\0'; - return w - worig; - } - else { - nchar = 1; /* nul character at the end */ - while (u != uend) { - if (0xD800 <= u[0] && u[0] <= 0xDBFF - && 0xDC00 <= u[1] && u[1] <= 0xDFFF) - u += 2; - else - u++; - nchar++; - } - } - return nchar; -#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 - register Py_UNICODE *u, *uend, ordinal; - register Py_ssize_t i; - wchar_t *worig, *wend; - Py_ssize_t nchar; - - u = PyUnicode_AS_UNICODE(unicode); - uend = u + PyUnicode_GET_SIZE(u); - if (w != NULL) { - worig = w; - wend = w + size; - while (u != uend && w != wend) { - ordinal = *u; - if (ordinal > 0xffff) { - ordinal -= 0x10000; - *w++ = 0xD800 | (ordinal >> 10); - *w++ = 0xDC00 | (ordinal & 0x3FF); - } - else - *w++ = ordinal; - u++; - } - if (w != wend) - *w = 0; - return w - worig; - } - else { - 
nchar = 1; /* nul character */ - while (u != uend) { - if (*u > 0xffff) - nchar += 2; - else - nchar++; - u++; - } - return nchar; - } -#else -# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" -#endif + return res + 1; } Py_ssize_t @@ -1335,6 +1813,8 @@ } buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); + if (buflen == -1) + return NULL; if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { PyErr_NoMemory(); return NULL; @@ -1346,6 +1826,8 @@ return NULL; } buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); + if (buflen == -1) + return NULL; if (size != NULL) *size = buflen; return buffer; @@ -1389,8 +1871,9 @@ if (PyUnicode_Check(obj)) { /* For a Unicode subtype that's not a Unicode object, return a true Unicode object with the same data. */ - return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), - PyUnicode_GET_SIZE(obj)); + if (PyUnicode_FAST_READY(obj) == -1) + return NULL; + return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj)); } PyErr_Format(PyExc_TypeError, "Can't convert '%.100s' object to str implicitly", @@ -1641,6 +2124,9 @@ return NULL; } +static PyObject * +_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors); + PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { @@ -1649,9 +2135,7 @@ PyUnicode_GET_SIZE(unicode), NULL); #elif defined(__APPLE__) - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - "surrogateescape"); + return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); #else PyInterpreterState *interp = PyThreadState_GET()->interp; /* Bootstrap check: if the filesystem codec is implemented in Python, we @@ -1721,11 +2205,9 @@ if (encoding == NULL) { if (errors == NULL || strcmp(errors, "strict") == 0) - return PyUnicode_AsUTF8String(unicode); + return _PyUnicode_AsUTF8String(unicode, NULL); else - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + return 
_PyUnicode_AsUTF8String(unicode, errors); } /* Shortcuts for common default encodings */ @@ -1734,11 +2216,9 @@ (strcmp(lower, "utf8") == 0)) { if (errors == NULL || strcmp(errors, "strict") == 0) - return PyUnicode_AsUTF8String(unicode); + return _PyUnicode_AsUTF8String(unicode, NULL); else - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + return _PyUnicode_AsUTF8String(unicode, errors); } else if ((strcmp(lower, "latin-1") == 0) || (strcmp(lower, "latin1") == 0) || @@ -1827,16 +2307,10 @@ PyObject * _PyUnicode_AsDefaultEncodedString(PyObject *unicode) { - PyObject *v = ((PyUnicodeObject *)unicode)->defenc; - if (v) - return v; - v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); - if (!v) - return NULL; - ((PyUnicodeObject *)unicode)->defenc = v; - return v; + /* XXX: This function currently leaks as callers assume a borrowed + reference which cannot be cached in the new (as of PEP 393) unicode + struct anymore. 
*/ + return PyUnicode_AsUTF8String(unicode); } PyObject* @@ -1973,48 +2447,192 @@ char* -_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) +PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) { PyObject *bytes; + PyUnicodeObject *u = (PyUnicodeObject *)unicode; + if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; } - bytes = _PyUnicode_AsDefaultEncodedString(unicode); - if (bytes == NULL) - return NULL; - if (psize != NULL) - *psize = PyBytes_GET_SIZE(bytes); - return PyBytes_AS_STRING(bytes); + if (PyUnicode_FAST_READY(u) == -1) + return NULL; + + if (u->utf8 == NULL) { + bytes = _PyUnicode_AsUTF8String(unicode, "strict"); + if (bytes == NULL) + return NULL; + u->utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); + if (u->utf8 == NULL) { + Py_DECREF(bytes); + return NULL; + } + u->utf8_length = PyBytes_GET_SIZE(bytes); + Py_MEMCPY(u->utf8, PyBytes_AS_STRING(bytes), u->utf8_length + 1); + Py_DECREF(bytes); + } + + if (psize) + *psize = u->utf8_length; + return u->utf8; } char* -_PyUnicode_AsString(PyObject *unicode) -{ - return _PyUnicode_AsStringAndSize(unicode, NULL); +PyUnicode_AsUTF8(PyObject *unicode) +{ + return PyUnicode_AsUTF8AndSize(unicode, NULL); +} + +#ifdef Py_DEBUG +int unicode_as_unicode_calls = 0; +#endif + + +Py_UNICODE * +PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) +{ + PyUnicodeObject *u; + const unsigned char *one_byte; + const Py_UCS2 *two_bytes; + const Py_UCS4 *four_bytes; + const Py_UCS4 *ucs4_end; + wchar_t *w; + wchar_t *wchar_end; + Py_ssize_t num_surrogates; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + u = (PyUnicodeObject*)unicode; + if (u->wstr == NULL) { + assert((u->state & SSTATE_KIND_MASK) != 0); + +#ifdef Py_DEBUG + ++unicode_as_unicode_calls; +#endif + + if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { + if (sizeof(wchar_t) == 2) { + four_bytes = PyUnicode_4BYTE_DATA(u); + ucs4_end = four_bytes + u->length; + num_surrogates = 0; + + 
for (; four_bytes < ucs4_end; ++four_bytes) { + if (*four_bytes > 0xFFFF) + ++num_surrogates; + } + + u->wstr = (wchar_t *) PyObject_MALLOC( + sizeof(wchar_t) * (u->length + 1 + num_surrogates)); + if (!u->wstr) { + PyErr_NoMemory(); + return NULL; + } + u->wstr_length = u->length + num_surrogates; + + w = u->wstr; + wchar_end = w + u->wstr_length; + four_bytes = PyUnicode_4BYTE_DATA(u); + for (; four_bytes < ucs4_end; ++four_bytes, ++w) { + if (*four_bytes > 0xFFFF) { + /* encode surrogate pair in this case */ + *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); + *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); + } + else + *w = *four_bytes; + + if (w > wchar_end) { + assert(0 && "Miscalculated string end"); + } + } + *w = 0; + } + else { + /* if sizeof(wchar_t) == 4 */ + Py_FatalError("Impossible unicode object state, wstr and str " + "should share memory already."); + return NULL; + } + } + else { + u->wstr = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * + (u->length + 1)); + if (!u->wstr) { + PyErr_NoMemory(); + return NULL; + } + u->wstr_length = u->length; + w = u->wstr; + wchar_end = w + u->length; + + if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { + one_byte = PyUnicode_1BYTE_DATA(u); + for (; w < wchar_end; ++one_byte, ++w) + *w = *one_byte; + /* null-terminate the wstr */ + *w = 0; + } + else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { + if (sizeof(wchar_t) == 4) { + two_bytes = PyUnicode_2BYTE_DATA(u); + for (; w < wchar_end; ++two_bytes, ++w) + *w = *two_bytes; + /* null-terminate the wstr */ + *w = 0; + } + else { + /* sizeof(wchar_t) == 2 */ + PyObject_FREE(u->wstr); + u->wstr = NULL; + Py_FatalError("Impossible unicode object state, wstr " + "and str should share memory already."); + return NULL; + } + } + else { + assert(0 && "This should never happen."); + } + } + } + if (size != NULL) + *size = u->wstr_length; + return u->wstr; } Py_UNICODE * PyUnicode_AsUnicode(PyObject *unicode) { + return PyUnicode_AsUnicodeAndSize(unicode, NULL); +} + + 
+Py_ssize_t +PyUnicode_GetSize(PyObject *unicode) +{ + PyUnicodeObject *u; + if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); goto onError; } - return PyUnicode_AS_UNICODE(unicode); - - onError: - return NULL; -} - -Py_ssize_t -PyUnicode_GetSize(PyObject *unicode) -{ - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - goto onError; - } - return PyUnicode_GET_SIZE(unicode); + + u = (PyUnicodeObject *)unicode; + if (u->str == NULL) { + /* The check for NULL on the canonical representation is done to + make PyUnicode_GetSize() compatible with interleaved + PyUnicode_GetSize() and PyUnicode_GET_SIZE() calls which must + not trigger PyUnicode_Ready() on unfinished strings. */ + return PyUnicode_GET_SIZE(u); + } + else { + if (PyUnicode_FAST_READY(u) == -1) + goto onError; + + return PyUnicode_GET_LENGTH(u); + } onError: return -1; @@ -2280,8 +2898,8 @@ *consumed = 0; return (PyObject *)unicode; } - - p = unicode->str; + + p = PyUnicode_AS_UNICODE(unicode); shiftOutStart = p; e = s + size; @@ -2578,96 +3196,172 @@ /* Mask to check or force alignment of a pointer to C 'long' boundaries */ #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) -/* Mask to quickly check whether a C 'long' contains a - non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define ASCII_CHAR_MASK 0x8080808080808080L -#elif (SIZEOF_LONG == 4) -# define ASCII_CHAR_MASK 0x80808080L -#else -# error C 'long' size should be either 4 or 8! -#endif +/* Scans a UTF-8 string and returns the maximum character to be expected, + the size of the decoded unicode string and if any major errors were + encountered. + + This function does check basic UTF-8 sanity, it does however NOT CHECK + if the string contains surrogates, and if all continuation bytes are + within the correct ranges, these checks are performed in + PyUnicode_DecodeUTF8Stateful. + + If it sets has_errors to 1, it means the value of unicode_size and max_char + will be bogus and you should not rely on useful information in them. 
+ */ +static Py_UCS4 +utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, + Py_ssize_t *unicode_size, int *has_errors) +{ + Py_ssize_t n; + Py_ssize_t char_count = 0; + Py_UCS4 max_char = 127; + Py_UCS4 upper_bound; + const unsigned char *p = (const unsigned char *)s; + const unsigned char *end = p + string_size; + int err = 0; + + for (; p < end && !err; ++p, ++char_count) { + /* Only check value if it's not a ASCII char... */ + if (*p >= 0x80) { + n = utf8_code_length[*p]; + switch (n) { + /* invalid start byte */ + case 0: + err = 1; + break; + case 2: + /* Code points between 0x00FF and 0x07FF inclusive. + Approximate the upper bound of the code point, + if this flips over 255 we can be sure it will be more + than 255 and the string will need 2 bytes per code coint, + if it stays under or equal to 255, we can be sure 1 byte + is enough. + ((*p & 0b00011111) << 6) | 0b00111111 */ + upper_bound = ((*p & 0x1F) << 6) | 0x3F; + if (max_char < upper_bound) + max_char = upper_bound; + /* Ensure we track at least that we left ASCII space. */ + if (max_char < 128) + max_char = 128; + break; + case 3: + /* Between 0x0FFF and 0xFFFF inclusive, so values are + always > 255 and <= 65535 and will always need 2 bytes. */ + if (max_char < 65535) + max_char = 65535; + break; + case 4: + /* Code point will be above 0xFFFF for sure in this case. 
*/ + max_char = 65537; + break; + /* Internal error, this should be caught by the first if */ + case 1: + default: + assert(0 && "Impossible case in utf8_max_char_and_size"); + err = 1; + } + /* Instead of number of overall bytes for this code point, + n containts the number of following bytes: */ + --n; + /* Check if the follow up chars are all valid continuation bytes */ + if (n >= 1 && (p + n) < end) { + const unsigned char *cont; + for (cont = p + 1; cont < (p + n); ++cont) { + if ((*cont & 0xc0) != 0x80) { + err = 1; + break; + } + } + p += n; + } + else + err = 1; + } + } + + if (unicode_size) + *unicode_size = char_count; + if (has_errors) + *has_errors = err; + return max_char; +} + +/* Similar to PyUnicode_WRITE but can also write into wstr field + of the legacy unicode representation */ +#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ + do { \ + const int k_ = (kind); \ + if (k_ == 0) \ + ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ + else if (k_ == PyUnicode_1BYTE_KIND) \ + ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ + else if (k_ == PyUnicode_2BYTE_KIND) \ + ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ + else \ + ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ + } while (0) PyObject * PyUnicode_DecodeUTF8Stateful(const char *s, - Py_ssize_t size, - const char *errors, - Py_ssize_t *consumed) + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; int n; int k; Py_ssize_t startinpos; Py_ssize_t endinpos; - Py_ssize_t outpos; - const char *e, *aligned_end; + const char *e; PyUnicodeObject *unicode; - Py_UNICODE *p; const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; - - /* Note: size will always be longer than the resulting Unicode - character count */ - unicode = _PyUnicode_New(size); - if (!unicode) - return NULL; + Py_UCS4 maxchar = 0; + Py_ssize_t unicode_size; + Py_ssize_t i; + int kind; + void *data; + int has_errors; + Py_UNICODE 
*error_outptr; + if (size == 0) { if (consumed) *consumed = 0; - return (PyObject *)unicode; - } - + return (PyObject *)PyUnicode_New(0, 0); + } + maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, + &has_errors); + if (has_errors) { + unicode = _PyUnicode_New(size); + if (!unicode) + return NULL; + kind = 0; + data = PyUnicode_AS_UNICODE(unicode); + assert(data != NULL); + } + else { + unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); + if (!unicode) + return NULL; + /* When the string is ASCII only, just use memcpy and return. */ + if (maxchar < 128) { + assert(unicode_size == size); + Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); + return (PyObject *)unicode; + } + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + } /* Unpack UTF-8 encoded data */ - p = unicode->str; + i = 0; e = s + size; - aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); while (s < e) { Py_UCS4 ch = (unsigned char)*s; if (ch < 0x80) { - /* Fast path for runs of ASCII characters. Given that common UTF-8 - input will consist of an overwhelming majority of ASCII - characters, we try to optimize for this case by checking - as many characters as a C 'long' can contain. - First, check if we can do an aligned read, as most CPUs have - a penalty for unaligned reads. - */ - if (!((size_t) s & LONG_PTR_MASK)) { - /* Help register allocation */ - register const char *_s = s; - register Py_UNICODE *_p = p; - while (_s < aligned_end) { - /* Read a whole long at a time (either 4 or 8 bytes), - and do a fast unrolled copy if it only contains ASCII - characters. 
*/ - unsigned long data = *(unsigned long *) _s; - if (data & ASCII_CHAR_MASK) - break; - _p[0] = (unsigned char) _s[0]; - _p[1] = (unsigned char) _s[1]; - _p[2] = (unsigned char) _s[2]; - _p[3] = (unsigned char) _s[3]; -#if (SIZEOF_LONG == 8) - _p[4] = (unsigned char) _s[4]; - _p[5] = (unsigned char) _s[5]; - _p[6] = (unsigned char) _s[6]; - _p[7] = (unsigned char) _s[7]; -#endif - _s += SIZEOF_LONG; - _p += SIZEOF_LONG; - } - s = _s; - p = _p; - if (s == e) - break; - ch = (unsigned char)*s; - } - } - - if (ch < 0x80) { - *p++ = (Py_UNICODE)ch; + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); s++; continue; } @@ -2710,7 +3404,7 @@ } ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); assert ((ch > 0x007F) && (ch <= 0x07FF)); - *p++ = (Py_UNICODE)ch; + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); break; case 3: @@ -2739,7 +3433,7 @@ } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); assert ((ch > 0x07FF) && (ch <= 0xFFFF)); - *p++ = (Py_UNICODE)ch; + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); break; case 4: @@ -2764,41 +3458,71 @@ ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); -#ifdef Py_UNICODE_WIDE - *p++ = (Py_UNICODE)ch; -#else - /* compute and append the two surrogates: */ - - /* translate from 10000..10FFFF to 0..FFFF */ - ch -= 0x10000; - - /* high surrogate = top 10 bits added to D800 */ - *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); - - /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); -#endif + /* If the string is flexible or we have native UCS-4, write + directly.. 
*/ + if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); + + else { + /* compute and append the two surrogates: */ + + /* translate from 10000..10FFFF to 0..FFFF */ + ch -= 0x10000; + + /* high surrogate = top 10 bits added to D800 */ + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, + (Py_UNICODE)(0xD800 + (ch >> 10))); + + /* low surrogate = bottom 10 bits added to DC00 */ + WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, + (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); + } break; } s += n; continue; utf8Error: - outpos = p-PyUnicode_AS_UNICODE(unicode); + /* If this is not yet a resizable string, make it one.. */ + if (kind != PyUnicode_WCHAR_KIND) { + Py_UNICODE *u; + PyUnicodeObject *new_unicode = _PyUnicode_New(size); + if (!new_unicode) + goto onError; + u = PyUnicode_AsUnicode((PyObject *)unicode); + if (!u) + goto onError; + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); + Py_DECREF(unicode); + unicode = new_unicode; + kind = 0; + data = PyUnicode_AS_UNICODE(new_unicode); + assert(data != NULL); + } + error_outptr = PyUnicode_AS_UNICODE(unicode) + i; if (unicode_decode_call_errorhandler( errors, &errorHandler, "utf8", errmsg, &starts, &e, &startinpos, &endinpos, &exc, &s, - &unicode, &outpos, &p)) + &unicode, &i, &error_outptr)) goto onError; - aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); - } + /* Update data because unicode_decode_call_errorhandler might have + re-created or resized the unicode object. */ + data = PyUnicode_AS_UNICODE(unicode); + } + /* Ensure the unicode_size calculation above was correct: */ + assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); + if (consumed) *consumed = s-starts; - /* Adjust length */ - if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) - goto onError; + /* Adjust length and ready string when it contained errors and + is of the old resizable kind. 
*/ + if (kind == PyUnicode_WCHAR_KIND) { + if (_PyUnicode_Resize(&unicode, i) < 0 || + PyUnicode_Ready(unicode) == -1) + goto onError; + } Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -2811,7 +3535,7 @@ return NULL; } -#undef ASCII_CHAR_MASK +#undef WRITE_FLEXIBLE_OR_WSTR #ifdef __APPLE__ @@ -2928,15 +3652,15 @@ #endif /* __APPLE__ */ -/* Allocation strategy: if the string is short, convert into a stack buffer +/* Primary internal function which creates utf8 encoded bytes objects. + + Allocation strategy: if the string is short, convert into a stack buffer and allocate exactly as much space needed at the end. Else allocate the maximum possible needed (4 result bytes per Unicode character), and return the excess memory at the end. */ -PyObject * -PyUnicode_EncodeUTF8(const Py_UNICODE *s, - Py_ssize_t size, - const char *errors) +static PyObject * +_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) { #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ @@ -2948,8 +3672,26 @@ char stackbuf[MAX_SHORT_UNICHARS * 4]; PyObject *errorHandler = NULL; PyObject *exc = NULL; - - assert(s != NULL); + int kind; + void *data; + Py_ssize_t size; + PyUnicodeObject *unicode = (PyUnicodeObject *)obj; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + if (PyUnicode_FAST_READY(unicode) == -1) + return NULL; + + if (unicode->utf8) + return PyBytes_FromStringAndSize(unicode->utf8, unicode->utf8_length); + + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + size = PyUnicode_GET_LENGTH(unicode); + assert(size >= 0); if (size <= MAX_SHORT_UNICHARS) { @@ -2973,7 +3715,7 @@ } for (i = 0; i < size;) { - Py_UCS4 ch = s[i++]; + Py_UCS4 ch = PyUnicode_READ(kind, data, i++); if (ch < 0x80) /* Encode ASCII */ @@ -2984,83 +3726,70 @@ *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); } else if (0xD800 <= ch && ch <= 0xDFFF) { -#ifndef Py_UNICODE_WIDE - /* Special case: check for high and low surrogate */ - 
if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { - Py_UCS4 ch2 = s[i]; - /* Combine the two surrogates to form a UCS4 value */ - ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; - i++; - - /* Encode UCS4 Unicode ordinals */ - *p++ = (char)(0xf0 | (ch >> 18)); - *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } else { -#endif - Py_ssize_t newpos; - PyObject *rep; - Py_ssize_t repsize, k; - rep = unicode_encode_call_errorhandler - (errors, &errorHandler, "utf-8", "surrogates not allowed", - s, size, &exc, i-1, i, &newpos); - if (!rep) + Py_ssize_t newpos; + PyObject *rep; + Py_ssize_t repsize, k; + // TODO: Handle the case when sizeof(Py_UNICODE) is 2 + // and the index in unicode (i) does not match as in the + // canonical representation's size is different than unicode size. + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, "utf-8", "surrogates not allowed", + PyUnicode_AS_UNICODE(unicode), size, &exc, i-1, i, &newpos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) + repsize = PyBytes_GET_SIZE(rep); + else + repsize = PyUnicode_GET_SIZE(rep); + + if (repsize > 4) { + Py_ssize_t offset; + + if (result == NULL) + offset = p - stackbuf; + else + offset = p - PyBytes_AS_STRING(result); + + if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { + /* integer overflow */ + PyErr_NoMemory(); goto error; - - if (PyBytes_Check(rep)) - repsize = PyBytes_GET_SIZE(rep); - else - repsize = PyUnicode_GET_SIZE(rep); - - if (repsize > 4) { - Py_ssize_t offset; - + } + nallocated += repsize - 4; + if (result != NULL) { + if (_PyBytes_Resize(&result, nallocated) < 0) + goto error; + } else { + result = PyBytes_FromStringAndSize(NULL, nallocated); if (result == NULL) - offset = p - stackbuf; - else - offset = p - PyBytes_AS_STRING(result); - - if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { - /* integer overflow */ - PyErr_NoMemory(); + goto error; + 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); + } + p = PyBytes_AS_STRING(result) + offset; + } + + if (PyBytes_Check(rep)) { + char *prep = PyBytes_AS_STRING(rep); + for(k = repsize; k > 0; k--) + *p++ = *prep++; + } else /* rep is unicode */ { + Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); + Py_UNICODE c; + + for(k=0; k 0; k--) - *p++ = *prep++; - } else /* rep is unicode */ { - Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); - Py_UNICODE c; - - for(k=0; k> 12)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); @@ -3086,6 +3815,7 @@ assert(nneeded <= nallocated); _PyBytes_Resize(&result, nneeded); } + Py_XDECREF(errorHandler); Py_XDECREF(exc); return result; @@ -3099,18 +3829,24 @@ } PyObject * +PyUnicode_EncodeUTF8(const Py_UNICODE *s, + Py_ssize_t size, + const char *errors) +{ + PyObject *v, *unicode; + + unicode = PyUnicode_FromUnicode(s, size); + if (unicode == NULL) + return NULL; + v = _PyUnicode_AsUTF8String(unicode, errors); + Py_DECREF(unicode); + return v; +} + +PyObject * PyUnicode_AsUTF8String(PyObject *unicode) { - PyObject *utf8; - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - utf8 = _PyUnicode_AsDefaultEncodedString(unicode); - if (utf8 == NULL) - return NULL; - Py_INCREF(utf8); - return utf8; + return _PyUnicode_AsUTF8String(unicode, NULL); } /* --- UTF-32 Codec ------------------------------------------------------- */ @@ -3222,7 +3958,7 @@ return (PyObject *)unicode; /* Unpack UTF-32 encoded data */ - p = unicode->str; + p = PyUnicode_AS_UNICODE(unicode); while (q < e) { Py_UCS4 ch; @@ -3275,7 +4011,7 @@ *consumed = (const char *)q-starts; /* Adjust length */ - if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) + if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) goto onError; Py_XDECREF(errorHandler); @@ -3452,7 +4188,7 @@ return (PyObject *)unicode; /* Unpack UTF-16 encoded data */ - p = unicode->str; + p = PyUnicode_AS_UNICODE(unicode); q = (unsigned char *)s; e = q + size - 1; @@ 
-3669,7 +4405,7 @@ *consumed = (const char *)q-starts; /* Adjust length */ - if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) + if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) goto onError; Py_XDECREF(errorHandler); @@ -3782,6 +4518,76 @@ /* --- Unicode Escape Codec ----------------------------------------------- */ +/* Helper function for PyUnicode_DecodeUnicodeEscape, determines + if all the escapes in the string make it still a valid ASCII string. + Returns -1 if any escapes were found which cause the string to + pop out of ASCII range. Otherwise returns the length of the + required buffer to hold the string. + */ +Py_ssize_t +length_of_escaped_ascii_string(const char *s, Py_ssize_t size) +{ + const unsigned char *p = (const unsigned char *)s; + const unsigned char *end = p + size; + Py_ssize_t length = 0; + + if (size < 0) + return -1; + + for (; p < end; ++p) { + if (*p > 127) { + /* Non-ASCII */ + return -1; + } + else if (*p != '\\') { + /* Normal character */ + ++length; + } + else { + /* Backslash-escape, check next char */ + ++p; + /* Escape sequence reaches till end of string or + non-ASCII follow-up. */ + if (p >= end || *p > 127) + return -1; + switch (*p) { + case '\n': + /* backslash + \n result in zero characters */ + break; + case '\\': case '\'': case '\"': + case 'b': case 'f': case 't': + case 'n': case 'r': case 'v': case 'a': + ++length; + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case 'x': case 'u': case 'U': case 'N': + /* these do not guarantee ASCII characters */ + return -1; + default: + /* count the backslash + the other character */ + length += 2; + } + } + } + return length; +} + +/* Similar to PyUnicode_WRITE but either write into wstr field + or treat string as ASCII. 
*/ +#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ + do { \ + if ((kind) != 0) \ + ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ + else \ + ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ + } while (0) + +#define WRITE_WSTR(buf, index, value) \ + assert(kind == PyUnicode_WCHAR_KIND), \ + ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) + + static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * @@ -3792,8 +4598,7 @@ const char *starts = s; Py_ssize_t startinpos; Py_ssize_t endinpos; - Py_ssize_t outpos; - int i; + int j; PyUnicodeObject *v; Py_UNICODE *p; const char *end; @@ -3801,19 +4606,42 @@ Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ PyObject *errorHandler = NULL; PyObject *exc = NULL; - - /* Escaped strings will always be longer than the resulting - Unicode string, so we start with size here and then reduce the - length after conversion to the true value. - (but if the error callback returns a long replacement string - we'll have to allocate more space) */ - v = _PyUnicode_New(size); - if (v == NULL) - goto onError; + Py_ssize_t ascii_length; + Py_ssize_t i; + int kind; + void *data; + + ascii_length = length_of_escaped_ascii_string(s, size); + + /* After length_of_escaped_ascii_string() there are two alternatives, + either the string is pure ASCII with named escapes like \n, etc. + and we determined it's exact size (common case) + or it contains \x, \u, ... escape sequences. then we create a + legacy wchar string and resize it at the end of this function. */ + if (ascii_length >= 0) { + v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); + if (!v) + goto onError; + assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); + kind = PyUnicode_1BYTE_KIND; + data = PyUnicode_DATA(v); + } + else { + /* Escaped strings will always be longer than the resulting + Unicode string, so we start with size here and then reduce the + length after conversion to the true value. 
+ (but if the error callback returns a long replacement string + we'll have to allocate more space) */ + v = _PyUnicode_New(size); + if (!v) + goto onError; + kind = PyUnicode_WCHAR_KIND; + data = PyUnicode_AS_UNICODE(v); + } + if (size == 0) return (PyObject *)v; - - p = PyUnicode_AS_UNICODE(v); + i = 0; end = s + size; while (s < end) { @@ -3821,9 +4649,18 @@ Py_UNICODE x; int digits; + if (kind == PyUnicode_WCHAR_KIND) { + assert(i < v->wstr_length); + } + else { + /* The only case in which i == ascii_length is a backslash + followed by a newline. */ + assert(i <= ascii_length); + } + /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { - *p++ = (unsigned char) *s++; + WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); continue; } @@ -3833,20 +4670,33 @@ c = *s++; if (s > end) c = '\0'; /* Invalid after \ */ + + if (kind == PyUnicode_WCHAR_KIND) { + assert(i < v->wstr_length); + } + else { + /* The only case in which i == ascii_length is a backslash + followed by a newline. 
*/ + assert(i < ascii_length || (i == ascii_length && c == '\n')); + } + switch (c) { /* \x escapes */ case '\n': break; - case '\\': *p++ = '\\'; break; - case '\'': *p++ = '\''; break; - case '\"': *p++ = '\"'; break; - case 'b': *p++ = '\b'; break; - case 'f': *p++ = '\014'; break; /* FF */ - case 't': *p++ = '\t'; break; - case 'n': *p++ = '\n'; break; - case 'r': *p++ = '\r'; break; - case 'v': *p++ = '\013'; break; /* VT */ - case 'a': *p++ = '\007'; break; /* BEL, not classic C */ + case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; + case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; + case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; + case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; + /* FF */ + case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; + case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; + case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; + case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; + /* VT */ + case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; + /* BEL, not classic C */ + case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; /* \OOO (octal) escapes */ case '0': case '1': case '2': case '3': @@ -3857,7 +4707,7 @@ if (s < end && '0' <= *s && *s <= '7') x = (x<<3) + *s++ - '0'; } - *p++ = x; + WRITE_WSTR(data, i++, x); break; /* hex escapes */ @@ -3879,27 +4729,30 @@ message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; - outpos = p-PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v) + i; if (s+digits>end) { endinpos = size; if (unicode_decode_call_errorhandler( errors, &errorHandler, "unicodeescape", "end of string in escape sequence", &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &i, &p)) goto onError; + data = PyUnicode_AS_UNICODE(v); goto nextByte; } - for (i = 0; i < digits; ++i) { - c = (unsigned char) s[i]; + for (j = 0; j < digits; ++j) { + c = (unsigned char) s[j]; if 
(!Py_ISXDIGIT(c)) { - endinpos = (s+i+1)-starts; + endinpos = (s+j+1)-starts; + p = PyUnicode_AS_UNICODE(v) + i; if (unicode_decode_call_errorhandler( errors, &errorHandler, "unicodeescape", message, &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &i, &p)) goto onError; + data = PyUnicode_AS_UNICODE(v); goto nextByte; } chr = (chr<<4) & ~0xF; @@ -3910,7 +4763,7 @@ else chr += 10 + c - 'A'; } - s += i; + s += j; if (chr == 0xffffffff && PyErr_Occurred()) /* _decoding_error will have already written into the target buffer. */ @@ -3919,26 +4772,27 @@ /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) /* UCS-2 character */ - *p++ = (Py_UNICODE) chr; + WRITE_WSTR(data, i++, chr); else if (chr <= 0x10ffff) { /* UCS-4 character. Either store directly, or as surrogate pair. */ #ifdef Py_UNICODE_WIDE - *p++ = chr; + WRITE_WSTR(data, i++, chr); #else chr -= 0x10000L; - *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); - *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); + WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); + WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); #endif } else { endinpos = s-starts; - outpos = p-PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v) + i; if (unicode_decode_call_errorhandler( errors, &errorHandler, "unicodeescape", "illegal Unicode character", &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &i, &p)) goto onError; + data = PyUnicode_AS_UNICODE(v); } break; @@ -3947,7 +4801,8 @@ message = "malformed \\N character escape"; if (ucnhash_CAPI == NULL) { /* load the unicode data module */ - ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); if (ucnhash_CAPI == NULL) goto ucnhashError; } @@ -3960,43 +4815,51 @@ /* found a name. 
look it up in the unicode database */ message = "unknown Unicode character name"; s++; - if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) + if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), + &chr)) goto store; } } endinpos = s-starts; - outpos = p-PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v) + i; if (unicode_decode_call_errorhandler( errors, &errorHandler, "unicodeescape", message, &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &i, &p)) goto onError; + data = PyUnicode_AS_UNICODE(v); break; default: if (s > end) { + assert(kind == PyUnicode_WCHAR_KIND); message = "\\ at end of string"; s--; endinpos = s-starts; - outpos = p-PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v) + i; if (unicode_decode_call_errorhandler( errors, &errorHandler, "unicodeescape", message, &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &i, &p)) goto onError; + data = PyUnicode_AS_UNICODE(v); } else { - *p++ = '\\'; - *p++ = (unsigned char)s[-1]; + WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); + WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); } break; } nextByte: ; } - if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) + /* Ensure the length prediction worked in case of ASCII strings */ + assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); + + if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 || + PyUnicode_Ready(v) == -1)) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -4019,6 +4882,9 @@ return NULL; } +#undef WRITE_ASCII_OR_WSTR +#undef WRITE_WSTR + /* Return a Unicode-Escape string version of the Unicode object. If quotes is true, the string is enclosed in u"" or u'' quotes as @@ -4447,7 +5313,9 @@ v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); if (v == NULL) goto onError; - if (PyUnicode_GetSize((PyObject *)v) == 0) + /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH + as string was created with the old API. 
*/ + if (PyUnicode_GET_SIZE(v) == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); end = s + size; @@ -6323,6 +7191,60 @@ return NULL; } +static Py_UCS4 +fix_decimal_and_space_to_ascii(PyUnicodeObject *self) +{ + /* No need to call PyUnicode_Ready(self) because this function is only + called as a callback from fixup() which does it already. */ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + Py_UCS4 maxchar = 0, ch, fixed; + Py_ssize_t i; + + for (i = 0; i < len; ++i) { + ch = PyUnicode_READ(kind, data, i); + fixed = 0; + if (ch > 127) { + if (Py_UNICODE_ISSPACE(ch)) + fixed = ' '; + else { + const int decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal >= 0) + fixed = '0' + decimal; + } + if (fixed != 0) { + if (fixed > maxchar) + maxchar = fixed; + PyUnicode_WRITE(kind, data, i, fixed); + } + else if (ch > maxchar) + maxchar = ch; + } + else if (ch > maxchar) + maxchar = ch; + } + + return maxchar; +} + +PyObject * +PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadInternalCall(); + return NULL; + } + if (PyUnicode_FAST_READY(unicode) == -1) + return NULL; + if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { + /* If the string is already ASCII, just return the same string */ + Py_INCREF(unicode); + return unicode; + } + return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); +} + PyObject * PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t length) @@ -6532,10 +7454,17 @@ Py_DECREF(str_obj); return -1; } - - ADJUST_INDICES(start, end, str_obj->length); + + /* Creates the wstr representation: */ + if (PyUnicode_AsUnicode((PyObject *)str_obj) == NULL) { + Py_DECREF(sub_obj); + Py_DECREF(str_obj); + return -1; + } + + ADJUST_INDICES(start, end, str_obj->wstr_length); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length, + str_obj->wstr + start, end - start, 
sub_obj->wstr, sub_obj->wstr_length, PY_SSIZE_T_MAX ); @@ -6589,20 +7518,62 @@ Py_ssize_t end, int direction) { - if (substring->length == 0) + int kind_self; + int kind_sub; + void *data_self; + void *data_sub; + Py_ssize_t offset; + Py_ssize_t i; + Py_ssize_t end_sub; + + if (PyUnicode_FAST_READY(self) == -1 || + PyUnicode_FAST_READY(substring) == -1) + return 0; + + if (PyUnicode_GET_LENGTH(substring) == 0) return 1; - ADJUST_INDICES(start, end, self->length); - end -= substring->length; + ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); + end -= PyUnicode_GET_LENGTH(substring); if (end < start) return 0; - if (direction > 0) { - if (Py_UNICODE_MATCH(self, end, substring)) + kind_self = PyUnicode_KIND(self); + data_self = PyUnicode_DATA(self); + kind_sub = PyUnicode_KIND(substring); + data_sub = PyUnicode_DATA(substring); + end_sub = PyUnicode_GET_LENGTH(substring) - 1; + + if (direction > 0) + offset = end; + else + offset = start; + + if (PyUnicode_READ(kind_self, data_self, offset) == + PyUnicode_READ(kind_sub, data_sub, 0) && + PyUnicode_READ(kind_self, data_self, offset + end_sub) == + PyUnicode_READ(kind_sub, data_sub, end_sub)) { + /* If both are of the same kind, memcmp is sufficient */ + if (kind_self == kind_sub) { + return ! memcmp((char *)data_self + + (offset * PyUnicode_CHARACTER_SIZE(substring)), + data_sub, + PyUnicode_GET_LENGTH(substring) * + PyUnicode_CHARACTER_SIZE(substring)); + } + /* otherwise we have to compare each character by first accesing it */ + else { + /* We do not need to compare 0 and len(substring)-1 because + the if statement above ensured already that they are equal + when we end up here. 
*/ + // TODO: honor direction and do a forward or backwards search + for (i = 1; i < end_sub; ++i) { + if (PyUnicode_READ(kind_self, data_self, offset + i) != + PyUnicode_READ(kind_sub, data_sub, i)) + return 0; + } return 1; - } else { - if (Py_UNICODE_MATCH(self, start, substring)) - return 1; + } } return 0; @@ -6639,18 +7610,38 @@ static PyObject * fixup(PyUnicodeObject *self, - int (*fixfct)(PyUnicodeObject *s)) -{ - + Py_UCS4 (*fixfct)(PyUnicodeObject *s)) +{ PyUnicodeObject *u; - - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + Py_UCS4 maxchar_old, maxchar_new = 0; + + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); + u = (PyUnicodeObject*) PyUnicode_New(PyUnicode_GET_LENGTH(self), + maxchar_old); if (u == NULL) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); - - if (!fixfct(u) && PyUnicode_CheckExact(self)) { + Py_MEMCPY(u->str, self->str, u->length * PyUnicode_CHARACTER_SIZE(u)); + + /* fix functions return the new maximum character in a string, + if the kind of the resulting unicode object does not change, + everything is fine. Otherwise we need to change the string kind + and re-run the fix function. */ + maxchar_new = fixfct(u); + if (maxchar_new == 0) + /* do nothing, keep maxchar_new at 0 which means no changes. */; + else if (maxchar_new <= 127) + maxchar_new = 127; + else if (maxchar_new <= 255) + maxchar_new = 255; + else if (maxchar_new <= 65535) + maxchar_new = 65535; + else + maxchar_new = 1114111; /* 0x10ffff */ + + if (!maxchar_new && PyUnicode_CheckExact(self)) { /* fixfct should return TRUE if it modified the buffer. 
If FALSE, return a reference to the original buffer instead (to save space, not time) */ @@ -6658,123 +7649,207 @@ Py_DECREF(u); return (PyObject*) self; } - return (PyObject*) u; -} - -static int + else if (maxchar_new == maxchar_old) { + return (PyObject*) u; + } + else { + /* In case the maximum character changed, we need to + convert the string to the new category. */ + PyUnicodeObject *v = (PyUnicodeObject *)PyUnicode_New( + PyUnicode_GET_LENGTH(self), maxchar_new); + if (v == NULL) { + Py_DECREF(u); + return NULL; + } + if (maxchar_new > maxchar_old) { + /* If the maxchar increased so that the kind changed, not all + characters are representable anymore and we need to fix the + string again. This only happens in very few cases. */ + PyUnicode_CopyCharacters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); + maxchar_old = fixfct(v); + assert(maxchar_old > 0 && maxchar_old <= maxchar_new); + } + else + PyUnicode_CopyCharacters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); + + Py_DECREF(u); + return (PyObject*) v; + } +} + +static Py_UCS4 fixupper(PyUnicodeObject *self) { - Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; - int status = 0; - - while (len-- > 0) { - register Py_UNICODE ch; - - ch = Py_UNICODE_TOUPPER(*s); - if (ch != *s) { - status = 1; - *s = ch; - } - s++; - } - - return status; -} - -static int + /* No need to call PyUnicode_Ready(self) because this function is only + called as a callback from fixup() which does it already. 
*/ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + int touched = 0; + Py_UCS4 maxchar = 0; + Py_ssize_t i; + + for (i = 0; i < len; ++i) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, i); + const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); + if (up != ch) { + if (up > maxchar) + maxchar = up; + PyUnicode_WRITE(kind, data, i, up); + touched = 1; + } + else if (ch > maxchar) + maxchar = ch; + } + + if (touched) + return maxchar; + else + return 0; +} + +static Py_UCS4 fixlower(PyUnicodeObject *self) { - Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; - int status = 0; - - while (len-- > 0) { - register Py_UNICODE ch; - - ch = Py_UNICODE_TOLOWER(*s); - if (ch != *s) { - status = 1; - *s = ch; - } - s++; - } - - return status; -} - -static int + /* No need to call PyUnicode_Ready(self) because fixup() which does it. */ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + int touched = 0; + Py_UCS4 maxchar = 0; + Py_ssize_t i; + + for(i = 0; i < len; ++i) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, i); + const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); + if (lo != ch) { + if (lo > maxchar) + maxchar = lo; + PyUnicode_WRITE(kind, data, i, lo); + touched = 1; + } + else if (ch > maxchar) + maxchar = ch; + } + + if (touched) + return maxchar; + else + return 0; +} + +static Py_UCS4 fixswapcase(PyUnicodeObject *self) { - Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; - int status = 0; - - while (len-- > 0) { - if (Py_UNICODE_ISUPPER(*s)) { - *s = Py_UNICODE_TOLOWER(*s); - status = 1; - } else if (Py_UNICODE_ISLOWER(*s)) { - *s = Py_UNICODE_TOUPPER(*s); - status = 1; - } - s++; - } - - return status; -} - -static int + /* No need to call PyUnicode_Ready(self) because fixup() which does it. 
*/ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + int touched = 0; + Py_UCS4 maxchar = 0; + Py_ssize_t i; + + for(i = 0; i < len; ++i) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, i); + Py_UCS4 nu = 0; + + if (Py_UNICODE_ISUPPER(ch)) + nu = Py_UNICODE_TOLOWER(ch); + else if (Py_UNICODE_ISLOWER(ch)) + nu = Py_UNICODE_TOUPPER(ch); + + if (nu != 0) { + if (nu > maxchar) + maxchar = nu; + PyUnicode_WRITE(kind, data, i, nu); + touched = 1; + } + else if (ch > maxchar) + maxchar = ch; + } + + if (touched) + return maxchar; + else + return 0; +} + +static Py_UCS4 fixcapitalize(PyUnicodeObject *self) { - Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; - int status = 0; + /* No need to call PyUnicode_Ready(self) because fixup() which does it. */ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + int touched = 0; + Py_UCS4 maxchar = 0; + Py_ssize_t i = 0; + Py_UCS4 ch; if (len == 0) return 0; - if (!Py_UNICODE_ISUPPER(*s)) { - *s = Py_UNICODE_TOUPPER(*s); - status = 1; - } - s++; - while (--len > 0) { - if (!Py_UNICODE_ISLOWER(*s)) { - *s = Py_UNICODE_TOLOWER(*s); - status = 1; - } - s++; - } - return status; -} - -static int + + ch = PyUnicode_READ(kind, data, i); + if (!Py_UNICODE_ISUPPER(ch)) { + maxchar = Py_UNICODE_TOUPPER(ch); + PyUnicode_WRITE(kind, data, i, maxchar); + touched = 1; + } + ++i; + for(; i < len; ++i) { + ch = PyUnicode_READ(kind, data, i); + if (!Py_UNICODE_ISLOWER(ch)) { + const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); + if (lo > maxchar) + maxchar = lo; + PyUnicode_WRITE(kind, data, i, lo); + touched = 1; + } + else if (ch > maxchar) + maxchar = ch; + } + + if (touched) + return maxchar; + else + return 0; +} + +static Py_UCS4 fixtitle(PyUnicodeObject *self) { - register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); - register Py_UNICODE *e; + /* No need to call 
PyUnicode_Ready(self) because fixup() which does it. */ + const Py_ssize_t len = PyUnicode_GET_LENGTH(self); + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + Py_UCS4 maxchar = 0; + Py_ssize_t i = 0; int previous_is_cased; /* Shortcut for single character strings */ - if (PyUnicode_GET_SIZE(self) == 1) { - Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); - if (*p != ch) { - *p = ch; - return 1; + if (len == 1) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, i); + const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); + if (ti != ch) { + PyUnicode_WRITE(kind, data, i, ti); + return ti; } else return 0; } - - e = p + PyUnicode_GET_SIZE(self); previous_is_cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + for(; i < len; ++i) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, i); + Py_UCS4 nu; if (previous_is_cased) - *p = Py_UNICODE_TOLOWER(ch); + nu = Py_UNICODE_TOLOWER(ch); else - *p = Py_UNICODE_TOTITLE(ch); + nu = Py_UNICODE_TOTITLE(ch); + + if (nu > maxchar) + maxchar = nu; + PyUnicode_WRITE(kind, data, i, nu); if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) || @@ -6783,22 +7858,22 @@ else previous_is_cased = 0; } - return 1; + return maxchar; } PyObject * PyUnicode_Join(PyObject *separator, PyObject *seq) { - const Py_UNICODE blank = ' '; - const Py_UNICODE *sep = ␣ + PyObject *sep = NULL; Py_ssize_t seplen = 1; PyUnicodeObject *res = NULL; /* the result */ - Py_UNICODE *res_p; /* pointer to free byte in res's string area */ PyObject *fseq; /* PySequence_Fast(seq) */ Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ PyObject **items; PyObject *item; - Py_ssize_t sz, i; + Py_ssize_t sz, i, res_offset; + Py_UCS4 maxchar = 0; + Py_UCS4 item_maxchar; fseq = PySequence_Fast(seq, ""); if (fseq == NULL) { @@ -6812,7 +7887,7 @@ seqlen = PySequence_Fast_GET_SIZE(fseq); /* If empty sequence, return u"". 
*/ if (seqlen == 0) { - res = _PyUnicode_New(0); /* empty sequence; return u"" */ + res = (PyUnicodeObject *)PyUnicode_New(0, 0); goto Done; } items = PySequence_Fast_ITEMS(fseq); @@ -6828,8 +7903,10 @@ else { /* Set up sep and seplen */ if (separator == NULL) { - sep = ␣ - seplen = 1; + /* fall back to a blank space separator */ + sep = PyUnicode_FromStringAndSize(" ", 1); + if (!sep || PyUnicode_FAST_READY(sep) == -1) + goto onError; } else { if (!PyUnicode_Check(separator)) { @@ -6839,8 +7916,14 @@ Py_TYPE(separator)->tp_name); goto onError; } - sep = PyUnicode_AS_UNICODE(separator); - seplen = PyUnicode_GET_SIZE(separator); + if (PyUnicode_FAST_READY(separator) == -1) + goto onError; + sep = separator; + seplen = PyUnicode_GET_LENGTH(separator); + maxchar = PyUnicode_MAX_CHAR_VALUE(separator); + /* inc refcount to keep this code path symetric with the + above case of a blank separator */ + Py_INCREF(sep); } } @@ -6860,7 +7943,12 @@ i, Py_TYPE(item)->tp_name); goto onError; } - sz += PyUnicode_GET_SIZE(item); + if (PyUnicode_FAST_READY(item) == -1) + goto onError; + sz += PyUnicode_GET_LENGTH(item); + item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); + if (item_maxchar > maxchar) + maxchar = item_maxchar; if (i != 0) sz += seplen; if (sz < old_sz || sz > PY_SSIZE_T_MAX) { @@ -6870,42 +7958,70 @@ } } - res = _PyUnicode_New(sz); + res = (PyUnicodeObject *)PyUnicode_New(sz, maxchar); if (res == NULL) goto onError; /* Catenate everything. */ - res_p = PyUnicode_AS_UNICODE(res); - for (i = 0; i < seqlen; ++i) { + for (i = 0, res_offset = 0; i < seqlen; ++i) { Py_ssize_t itemlen; item = items[i]; - itemlen = PyUnicode_GET_SIZE(item); + itemlen = PyUnicode_GET_LENGTH(item); /* Copy item, and maybe the separator. 
*/ if (i) { - Py_UNICODE_COPY(res_p, sep, seplen); - res_p += seplen; - } - Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); - res_p += itemlen; - } + PyUnicode_CopyCharacters(res, res_offset, + (PyUnicodeObject *)sep, 0, seplen); + res_offset += seplen; + } + PyUnicode_CopyCharacters(res, res_offset, + (PyUnicodeObject *)item, 0, itemlen); + res_offset += itemlen; + } + assert(res_offset == PyUnicode_GET_LENGTH(res)); Done: Py_DECREF(fseq); + Py_XDECREF(sep); return (PyObject *)res; onError: Py_DECREF(fseq); + Py_XDECREF(sep); Py_XDECREF(res); return NULL; } +#define FILL(kind, data, value, start, length) \ + do { \ + Py_ssize_t i_ = 0; \ + assert(kind != 0); \ + switch ((kind)) { \ + case PyUnicode_1BYTE_KIND: { \ + unsigned char * to_ = (unsigned char *)((data)) + (start); \ + for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ + break; \ + } \ + case PyUnicode_2BYTE_KIND: { \ + Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ + for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ + break; \ + } \ + default: { \ + Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ + for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ + break; \ + } \ + } \ + } while (0) + static PyUnicodeObject * pad(PyUnicodeObject *self, Py_ssize_t left, Py_ssize_t right, - Py_UNICODE fill) + Py_UCS4 fill) { PyUnicodeObject *u; + Py_UCS4 maxchar; if (left < 0) left = 0; @@ -6922,17 +8038,23 @@ PyErr_SetString(PyExc_OverflowError, "padded string is too long"); return NULL; } - u = _PyUnicode_New(left + self->length + right); + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + if (fill > maxchar) + maxchar = fill; + u = (PyUnicodeObject *)PyUnicode_New(left + self->length + right, maxchar); if (u) { + const int kind = PyUnicode_KIND(u); + void *data = PyUnicode_DATA(u); if (left) - Py_UNICODE_FILL(u->str, fill, left); - Py_UNICODE_COPY(u->str + left, self->str, self->length); + FILL(kind, data, fill, 0, left); if (right) - Py_UNICODE_FILL(u->str + left + self->length, fill, right); + 
FILL(kind, data, fill, left + self->length, right); + PyUnicode_CopyCharacters(u, left, self, 0, self->length); } return u; } +#undef FILL PyObject * PyUnicode_Splitlines(PyObject *string, int keepends) @@ -6959,14 +8081,20 @@ if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + if (PyUnicode_AsUnicode((PyObject *)self) == NULL) + return NULL; + if (substring == NULL) return stringlib_split_whitespace( - (PyObject*) self, self->str, self->length, maxcount + (PyObject*) self, self->wstr, self->wstr_length, maxcount ); + if (PyUnicode_AsUnicode((PyObject *)substring) == NULL) + return NULL; + return stringlib_split( - (PyObject*) self, self->str, self->length, - substring->str, substring->length, + (PyObject*) self, self->wstr, self->wstr_length, + substring->wstr, substring->wstr_length, maxcount ); } @@ -6979,14 +8107,20 @@ if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + if (PyUnicode_AsUnicode((PyObject *)self) == NULL) + return NULL; + if (substring == NULL) return stringlib_rsplit_whitespace( - (PyObject*) self, self->str, self->length, maxcount + (PyObject*) self, self->wstr, self->wstr_length, maxcount ); + if (PyUnicode_AsUnicode((PyObject *)substring) == NULL) + return NULL; + return stringlib_rsplit( - (PyObject*) self, self->str, self->length, - substring->str, substring->length, + (PyObject*) self, self->wstr, self->wstr_length, + substring->wstr, substring->wstr_length, maxcount ); } @@ -7001,54 +8135,74 @@ if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - else if (maxcount == 0 || self->length == 0) + else if (maxcount == 0 || PyUnicode_GET_SIZE(self) == 0) goto nothing; - if (str1->length == str2->length) { + if (PyUnicode_GET_SIZE(str1) == PyUnicode_GET_SIZE(str2)) { Py_ssize_t i; /* same length */ - if (str1->length == 0) + if (PyUnicode_GET_SIZE(str1) == 0) goto nothing; - if (str1->length == 1) { + if (PyUnicode_GET_SIZE(str1) == 1) { /* replace characters */ Py_UNICODE u1, u2; - if (!findchar(self->str, self->length, str1->str[0])) + if 
(!findchar(PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(str1)[0])) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode( + NULL, + PyUnicode_GET_SIZE(self) + ); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); - u1 = str1->str[0]; - u2 = str2->str[0]; - for (i = 0; i < u->length; i++) - if (u->str[i] == u1) { + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(u), + PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self) + ); + u1 = PyUnicode_AS_UNICODE(str1)[0]; + u2 = PyUnicode_AS_UNICODE(str2)[0]; + for (i = 0; i < PyUnicode_GET_SIZE(u); i++) + if (PyUnicode_AS_UNICODE(u)[i] == u1) { if (--maxcount < 0) break; - u->str[i] = u2; + PyUnicode_AS_UNICODE(u)[i] = u2; } } else { i = stringlib_find( - self->str, self->length, str1->str, str1->length, 0 + PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(str1), PyUnicode_GET_SIZE(str1), 0 ); if (i < 0) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode( + NULL, + PyUnicode_GET_SIZE(self) + ); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); - + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(u), + PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self) + ); /* change everything in-place, starting with this one */ - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(u)+i, + PyUnicode_AS_UNICODE(str2), + PyUnicode_GET_SIZE(str2) + ); + i += PyUnicode_GET_SIZE(str1); while ( --maxcount > 0) { - i = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, + i = stringlib_find(PyUnicode_AS_UNICODE(self)+i, + PyUnicode_GET_SIZE(self)-i, + PyUnicode_AS_UNICODE(str1), + PyUnicode_GET_SIZE(str1), i); if (i == -1) break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + 
Py_UNICODE_COPY(PyUnicode_AS_UNICODE(u)+i, + PyUnicode_AS_UNICODE(str2), + PyUnicode_GET_SIZE(str2)); + i += PyUnicode_GET_SIZE(str1); } } } else { @@ -7058,22 +8212,27 @@ Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length, + n = stringlib_count(PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(str1), + PyUnicode_GET_SIZE(str1), maxcount); if (n == 0) goto nothing; - /* new_size = self->length + n * (str2->length - str1->length)); */ - delta = (str2->length - str1->length); + /* new_size = PyUnicode_GET_SIZE(self) + n * (PyUnicode_GET_SIZE(str2) - + PyUnicode_GET_SIZE(str1))); */ + delta = (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1)); if (delta == 0) { - new_size = self->length; + new_size = PyUnicode_GET_SIZE(self); } else { - product = n * (str2->length - str1->length); - if ((product / (str2->length - str1->length)) != n) { + product = n * (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1)); + if ((product / (PyUnicode_GET_SIZE(str2) - + PyUnicode_GET_SIZE(str1))) != n) { PyErr_SetString(PyExc_OverflowError, "replace string is too long"); return NULL; } - new_size = self->length + product; + new_size = PyUnicode_GET_SIZE(self) + product; if (new_size < 0) { PyErr_SetString(PyExc_OverflowError, "replace string is too long"); @@ -7084,40 +8243,46 @@ if (!u) return NULL; i = 0; - p = u->str; - if (str1->length > 0) { + p = PyUnicode_AS_UNICODE(u); + if (PyUnicode_GET_SIZE(str1) > 0) { while (n-- > 0) { /* look for next match */ - j = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, + j = stringlib_find(PyUnicode_AS_UNICODE(self)+i, + PyUnicode_GET_SIZE(self)-i, + PyUnicode_AS_UNICODE(str1), + PyUnicode_GET_SIZE(str1), i); if (j == -1) break; else if (j > i) { /* copy unchanged part [i:j] */ - Py_UNICODE_COPY(p, self->str+i, j-i); + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(self)+i, j-i); p += j - i; } /* copy substitution string */ - if (str2->length 
> 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; + if (PyUnicode_GET_SIZE(str2) > 0) { + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(str2), + PyUnicode_GET_SIZE(str2)); + p += PyUnicode_GET_SIZE(str2); } - i = j + str1->length; - } - if (i < self->length) + i = j + PyUnicode_GET_SIZE(str1); + } + if (i < PyUnicode_GET_SIZE(self)) /* copy tail [i:] */ - Py_UNICODE_COPY(p, self->str+i, self->length-i); + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(self)+i, + PyUnicode_GET_SIZE(self)-i); } else { /* interleave */ while (n > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(str2), + PyUnicode_GET_SIZE(str2)); + p += PyUnicode_GET_SIZE(str2); if (--n <= 0) break; - *p++ = self->str[i++]; - } - Py_UNICODE_COPY(p, self->str+i, self->length-i); + *p++ = PyUnicode_AS_UNICODE(self)[i++]; + } + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(self)+i, + PyUnicode_GET_SIZE(self)-i); } } return (PyObject *) u; @@ -7128,7 +8293,8 @@ Py_INCREF(self); return (PyObject *) self; } - return PyUnicode_FromUnicode(self->str, self->length); + return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self)); } /* --- Unicode Object Methods --------------------------------------------- */ @@ -7235,6 +8401,9 @@ Py_ssize_t width; Py_UNICODE fillchar = ' '; + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) return NULL; @@ -7301,27 +8470,29 @@ #else +/* This function assumes that str1 and str2 are readied by the caller. 
*/ + static int unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) { - register Py_ssize_t len1, len2; - - Py_UNICODE *s1 = str1->str; - Py_UNICODE *s2 = str2->str; - - len1 = str1->length; - len2 = str2->length; - - while (len1 > 0 && len2 > 0) { - Py_UNICODE c1, c2; - - c1 = *s1++; - c2 = *s2++; + int kind1, kind2; + void *data1, *data2; + Py_ssize_t len1, len2, i; + + kind1 = PyUnicode_KIND(str1); + kind2 = PyUnicode_KIND(str2); + data1 = PyUnicode_DATA(str1); + data2 = PyUnicode_DATA(str2); + len1 = PyUnicode_GET_LENGTH(str1); + len2 = PyUnicode_GET_LENGTH(str2); + + for (i = 0; i < len1 && i < len2; ++i) { + Py_UCS4 c1, c2; + c1 = PyUnicode_READ(kind1, data1, i); + c2 = PyUnicode_READ(kind2, data2, i); if (c1 != c2) return (c1 < c2) ? -1 : 1; - - len1--; len2--; } return (len1 < len2) ? -1 : (len1 != len2); @@ -7332,9 +8503,13 @@ int PyUnicode_Compare(PyObject *left, PyObject *right) { - if (PyUnicode_Check(left) && PyUnicode_Check(right)) + if (PyUnicode_Check(left) && PyUnicode_Check(right)) { + if (PyUnicode_FAST_READY(left) == -1 || + PyUnicode_FAST_READY(right) == -1) + return -1; return unicode_compare((PyUnicodeObject *)left, (PyUnicodeObject *)right); + } PyErr_Format(PyExc_TypeError, "Can't compare %.100s and %.100s", left->ob_type->tp_name, @@ -7345,17 +8520,23 @@ int PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) { - int i; - Py_UNICODE *id; + Py_ssize_t i; + int kind; + void *data; + Py_UCS4 chr; + assert(PyUnicode_Check(uni)); - id = PyUnicode_AS_UNICODE(uni); + if (PyUnicode_FAST_READY(uni) == -1) + return -1; + kind = PyUnicode_KIND(uni); + data = PyUnicode_DATA(uni); /* Compare Unicode string and source character set string */ - for (i = 0; id[i] && str[i]; i++) - if (id[i] != str[i]) - return ((int)id[i] < (int)str[i]) ? -1 : 1; + for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) + if (chr != str[i]) + return (chr < (unsigned char)(str[i])) ? 
-1 : 1; /* This check keeps Python strings that end in '\0' from comparing equal to C strings identical up to that point. */ - if (PyUnicode_GET_SIZE(uni) != i || id[i]) + if (PyUnicode_GET_LENGTH(uni) != i || chr) return 1; /* uni is longer */ if (str[i]) return -1; /* str is longer */ @@ -7373,7 +8554,11 @@ if (PyUnicode_Check(left) && PyUnicode_Check(right)) { PyObject *v; - if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) { + if (PyUnicode_FAST_READY(left) == -1 || + PyUnicode_FAST_READY(right) == -1) + return NULL; + if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || + PyUnicode_KIND(left) != PyUnicode_KIND(right)) { if (op == Py_EQ) { Py_INCREF(Py_False); return Py_False; @@ -7455,6 +8640,7 @@ PyUnicode_Concat(PyObject *left, PyObject *right) { PyUnicodeObject *u = NULL, *v = NULL, *w; + Py_UCS4 maxchar; /* Coerce the two arguments */ u = (PyUnicodeObject *)PyUnicode_FromObject(left); @@ -7474,13 +8660,22 @@ return (PyObject *)v; } + if (PyUnicode_FAST_READY(u) == -1 || PyUnicode_FAST_READY(v) == -1) + goto onError; + + maxchar = PyUnicode_MAX_CHAR_VALUE(u); + if (PyUnicode_MAX_CHAR_VALUE(v) > maxchar) + maxchar = PyUnicode_MAX_CHAR_VALUE(v); + /* Concat the two Unicode strings */ - w = _PyUnicode_New(u->length + v->length); + w = (PyUnicodeObject *) PyUnicode_New( + PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), + maxchar); if (w == NULL) goto onError; - Py_UNICODE_COPY(w->str, u->str, u->length); - Py_UNICODE_COPY(w->str + u->length, v->str, v->length); - + PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); + PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), v, 0, + PyUnicode_GET_LENGTH(v)); Py_DECREF(u); Py_DECREF(v); return (PyObject *)w; @@ -7533,10 +8728,16 @@ &start, &end)) return NULL; - ADJUST_INDICES(start, end, self->length); + if (PyUnicode_AsUnicode((PyObject *)self) == NULL || + PyUnicode_AsUnicode((PyObject *)substring) == NULL) { + Py_DECREF(substring); + return NULL; + } + + ADJUST_INDICES(start, 
end, self->wstr_length); result = PyLong_FromSsize_t( - stringlib_count(self->str + start, end - start, - substring->str, substring->length, + stringlib_count(self->wstr + start, end - start, + substring->wstr, substring->wstr_length, PY_SSIZE_T_MAX) ); @@ -7588,11 +8789,14 @@ if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) return NULL; + if (PyUnicode_AsUnicode((PyObject *)self) == NULL) + return NULL; + /* First pass: determine size of output string */ i = 0; /* chars up to and including most recent \n or \r */ j = 0; /* chars since most recent \n or \r (use in tab calculations) */ - e = self->str + self->length; /* end of input */ - for (p = self->str; p < e; p++) + e = self->wstr + self->wstr_length; /* end of input */ + for (p = self->wstr; p < e; p++) if (*p == '\t') { if (tabsize > 0) { incr = tabsize - (j % tabsize); /* cannot overflow */ @@ -7622,10 +8826,10 @@ return NULL; j = 0; /* same as in first pass */ - q = u->str; /* next output char */ - qe = u->str + u->length; /* end of output */ - - for (p = self->str; p < e; p++) + q = u->wstr; /* next output char */ + qe = u->wstr + u->wstr_length; /* end of output */ + + for (p = self->wstr; p < e; p++) if (*p == '\t') { if (tabsize > 0) { i = tabsize - (j % tabsize); @@ -7646,6 +8850,10 @@ j = 0; } + if (PyUnicode_Ready(u) == -1) { + Py_DECREF(u); + return NULL; + } return (PyObject*) u; overflow2: @@ -7690,12 +8898,30 @@ static PyObject * unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) { + PyUnicodeObject *u = NULL; + Py_UCS4 ch; + + if (PyUnicode_FAST_READY(self) == -1) + return NULL; if (index < 0 || index >= self->length) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; } - return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); + ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index); + /* Call PyUnicode_FromUnicode to re-use shared latin-1 optimization. 
*/ + if (ch < 256) { + const Py_UNICODE str[2] = {ch, 0}; + u = (PyUnicodeObject *) PyUnicode_FromUnicode(str, 1); + } + else { + u = (PyUnicodeObject *) PyUnicode_New(1, ch); + if (u == NULL) + return NULL; + PyUnicode_WRITE(PyUnicode_KIND(u), PyUnicode_DATA(u), 0, ch); + } + + return (PyObject *) u; } /* Believe it or not, this produces the same value for ASCII strings @@ -7704,22 +8930,48 @@ unicode_hash(PyUnicodeObject *self) { Py_ssize_t len; - Py_UNICODE *p; Py_hash_t x; if (self->hash != -1) return self->hash; - len = Py_SIZE(self); - p = self->str; - x = *p << 7; - while (--len >= 0) - x = (1000003*x) ^ *p++; - x ^= Py_SIZE(self); + if (PyUnicode_FAST_READY(self) == -1) + return -1; + len = PyUnicode_GET_LENGTH(self); + + /* The hash function as a macro, gets expanded three times below. */ +#define HASH(P) \ + x = *P << 7; \ + while (--len >= 0) \ + x = (1000003*x) ^ *P++; + + switch (PyUnicode_KIND(self)) { + case PyUnicode_1BYTE_KIND: { + const unsigned char *c = PyUnicode_1BYTE_DATA(self); + HASH(c); + break; + } + case PyUnicode_2BYTE_KIND: { + const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); + HASH(s); + break; + } + default: { + assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && + "Impossible switch case in unicode_hash"); + + const Py_UCS4 *l = PyUnicode_4BYTE_DATA(self); + HASH(l); + break; + } + } + x ^= PyUnicode_GET_LENGTH(self); + if (x == -1) x = -2; self->hash = x; return x; } +#undef HASH PyDoc_STRVAR(index__doc__, "S.index(sub[, start[, end]]) -> int\n\ @@ -7937,21 +9189,28 @@ static PyObject* unicode_isalnum(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); - register const Py_UNICODE *e; + int kind; + void *data; + Py_ssize_t len, i; + + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + len = PyUnicode_GET_LENGTH(self); /* Shortcut for single character strings */ - if (PyUnicode_GET_SIZE(self) == 1 && - Py_UNICODE_ISALNUM(*p)) + if 
(len == 1 && + Py_UNICODE_ISALNUM(PyUnicode_READ(kind, data, 0))) return PyBool_FromLong(1); /* Special case for empty strings */ - if (PyUnicode_GET_SIZE(self) == 0) + if (len == 0) return PyBool_FromLong(0); - e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALNUM(*p)) + for (i = 0; i < len; i++) { + if (!Py_UNICODE_ISALNUM(PyUnicode_READ(kind, data, i))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -8141,7 +9400,9 @@ static Py_ssize_t unicode_length(PyUnicodeObject *self) { - return self->length; + if (PyUnicode_FAST_READY(self) == -1) + return -1; + return PyUnicode_GET_LENGTH(self); } PyDoc_STRVAR(ljust__doc__, @@ -8156,6 +9417,9 @@ Py_ssize_t width; Py_UNICODE fillchar = ' '; + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) return NULL; @@ -8222,16 +9486,58 @@ return PyUnicode_FromUnicode(s+i, j-i); } +/* Assumes an already ready self string. */ + +static PyObject * +substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len) +{ + const int kind = PyUnicode_KIND(self); + void *data = PyUnicode_DATA(self); + Py_UCS4 maxchar = 0; + Py_ssize_t i; + PyObject *unicode; + + if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) { + PyErr_BadInternalCall(); + return NULL; + } + + if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) { + Py_INCREF(self); + return (PyObject*)self; + } + + for (i = 0; i < len; ++i) { + const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i); + if (ch > maxchar) + maxchar = ch; + } + + unicode = PyUnicode_New(len, maxchar); + if (unicode == NULL) + return NULL; + PyUnicode_CopyCharacters((PyUnicodeObject *)unicode, 0, + self, start, len); + return unicode; +} static PyObject * do_strip(PyUnicodeObject *self, int striptype) { - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); - Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; + int kind; + void *data; + Py_ssize_t len, i, j; + + if 
(PyUnicode_FAST_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + len = PyUnicode_GET_LENGTH(self); i = 0; if (striptype != RIGHTSTRIP) { - while (i < len && Py_UNICODE_ISSPACE(s[i])) { + while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { i++; } } @@ -8240,7 +9546,7 @@ if (striptype != LEFTSTRIP) { do { j--; - } while (j >= i && Py_UNICODE_ISSPACE(s[j])); + } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); j++; } @@ -8249,7 +9555,7 @@ return (PyObject*)self; } else - return PyUnicode_FromUnicode(s+i, j-i); + return substring(self, i, j-i); } @@ -8329,9 +9635,8 @@ unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) { PyUnicodeObject *u; - Py_UNICODE *p; - Py_ssize_t nchars; - size_t nbytes; + Py_ssize_t nchars, n; + size_t nbytes, char_size; if (len < 1) { Py_INCREF(unicode_empty); @@ -8344,35 +9649,46 @@ return (PyObject*) str; } + if (PyUnicode_FAST_READY(str) == -1) + return NULL; + /* ensure # of chars needed doesn't overflow int and # of bytes * needed doesn't overflow size_t */ - nchars = len * str->length; - if (nchars / len != str->length) { + nchars = len * PyUnicode_GET_LENGTH(str); + if (nchars / len != PyUnicode_GET_LENGTH(str)) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); return NULL; } - nbytes = (nchars + 1) * sizeof(Py_UNICODE); - if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { + char_size = PyUnicode_CHARACTER_SIZE(str); + nbytes = (nchars + 1) * char_size; + if (nbytes / char_size != (size_t)(nchars + 1)) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); return NULL; } - u = _PyUnicode_New(nchars); + u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); if (!u) return NULL; - p = u->str; - - if (str->length == 1) { - Py_UNICODE_FILL(p, str->str[0], len); - } else { - Py_ssize_t done = str->length; /* number of characters copied this far */ - Py_UNICODE_COPY(p, str->str, 
str->length); + if (PyUnicode_GET_LENGTH(str) == 1) { + const int kind = PyUnicode_KIND(str); + const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); + void *to = PyUnicode_DATA(u); + for (n = 0; n < len; ++n) + PyUnicode_WRITE(kind, to, n, fill_char); + } + else { + /* number of characters copied this far */ + Py_ssize_t done = PyUnicode_GET_LENGTH(str); + const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); + char *to = (char *) PyUnicode_DATA(u); + Py_MEMCPY(to, PyUnicode_DATA(str), + PyUnicode_GET_LENGTH(str) * char_size); while (done < nchars) { - Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; - Py_UNICODE_COPY(p+done, p, n); + n = (done <= nchars-done) ? done : nchars-done; + Py_MEMCPY(to + (done * char_size), to, n * char_size); done += n; } } @@ -8673,6 +9989,9 @@ Py_ssize_t width; Py_UNICODE fillchar = ' '; + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) return NULL; @@ -9020,7 +10339,14 @@ static PyObject* unicode_translate(PyUnicodeObject *self, PyObject *table) { - return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); + Py_UNICODE *wstr; + Py_ssize_t len; + + wstr = PyUnicode_AsUnicodeAndSize((PyObject *)self, &len); + if (!wstr) + return NULL; + + return PyUnicode_TranslateCharmap(wstr, len, table, "ignore"); } PyDoc_STRVAR(upper__doc__, @@ -9045,12 +10371,18 @@ { Py_ssize_t fill; PyUnicodeObject *u; - Py_ssize_t width; + int kind; + void *data; + Py_UCS4 chr; + + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + if (!PyArg_ParseTuple(args, "n:zfill", &width)) return NULL; - if (self->length >= width) { + if (PyUnicode_GET_LENGTH(self) >= width) { if (PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject*) self; @@ -9069,27 +10401,24 @@ if (u == NULL) return NULL; - if (u->str[fill] == '+' || u->str[fill] == '-') { + kind = PyUnicode_KIND(u); + data = PyUnicode_DATA(u); + chr = 
PyUnicode_READ(kind, data, fill); + + if (chr == '+' || chr == '-') { /* move sign to beginning of string */ - u->str[0] = u->str[fill]; - u->str[fill] = '0'; + PyUnicode_WRITE(kind, data, 0, chr); + PyUnicode_WRITE(kind, data, fill, '0'); } return (PyObject*) u; } #if 0 -static PyObject* -unicode_freelistsize(PyUnicodeObject *self) -{ - return PyLong_FromLong(numfree); -} - static PyObject * unicode__decimal2ascii(PyObject *self) { - return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), - PyUnicode_GET_SIZE(self)); + return PyUnicode_TransformDecimalAndSpaceToASCII(self); } #endif @@ -9224,8 +10553,23 @@ static PyObject * unicode__sizeof__(PyUnicodeObject *v) { - return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + - sizeof(Py_UNICODE) * (v->length + 1)); + Py_ssize_t data_size = 0; + + if (PyUnicode_FAST_READY(v) == -1) + return NULL; + + if (v->str) + data_size += (v->length + 1) * PyUnicode_CHARACTER_SIZE(v); +/* This is currently disabled as returning the actual memory usage produces + unpredictable results in unit tests. */ +#if 0 + if (v->wstr && v->wstr != v->str) + data_size += (v->wstr_length + 1) * sizeof(wchar_t); + if (v->utf8 && v->utf8 != v->str) + data_size += (v->utf8_length + 1); +#endif + + return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + data_size); } PyDoc_STRVAR(sizeof__doc__, @@ -9234,7 +10578,12 @@ static PyObject * unicode_getnewargs(PyUnicodeObject *v) { - return Py_BuildValue("(u#)", v->str, v->length); + Py_ssize_t size; + Py_UNICODE *wstr = PyUnicode_AsUnicodeAndSize((PyObject *)v, &size); + + if (!wstr) + return NULL; + return Py_BuildValue("(u#)", wstr, size); } static PyMethodDef unicode_methods[] = { @@ -9294,7 +10643,6 @@ #if 0 /* These methods are just used for debugging the implementation. 
*/ - {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, #endif @@ -9331,12 +10679,15 @@ static PyObject* unicode_subscript(PyUnicodeObject* self, PyObject* item) { + if (PyUnicode_FAST_READY(self) == -1) + return NULL; + if (PyIndex_Check(item)) { Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); if (i == -1 && PyErr_Occurred()) return NULL; if (i < 0) - i += PyUnicode_GET_SIZE(self); + i += PyUnicode_GET_LENGTH(self); return unicode_getitem(self, i); } else if (PySlice_Check(item)) { Py_ssize_t start, stop, step, slicelength, cur, i; @@ -9344,19 +10695,20 @@ Py_UNICODE* result_buf; PyObject* result; - if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self), + if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), &start, &stop, &step, &slicelength) < 0) { return NULL; } if (slicelength <= 0) { - return PyUnicode_FromUnicode(NULL, 0); - } else if (start == 0 && step == 1 && slicelength == self->length && + return PyUnicode_New(0, 0); + } else if (start == 0 && step == 1 && + slicelength == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject *)self; } else if (step == 1) { - return PyUnicode_FromUnicode(self->str + start, slicelength); + return substring(self, start, slicelength); } else { source_buf = PyUnicode_AS_UNICODE((PyObject*)self); result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* @@ -9977,29 +11329,50 @@ { PyUnicodeObject *tmp, *pnew; Py_ssize_t n; + PyObject *err = NULL; assert(PyType_IsSubtype(type, &PyUnicode_Type)); tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); if (tmp == NULL) return NULL; assert(PyUnicode_Check(tmp)); - pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); + // TODO: Verify the PyUnicode_GET_SIZE does the right thing. + // it seems kind of strange that tp_alloc gets passed the size + // of the unicode string because there will follow another + // malloc. 
+ pnew = (PyUnicodeObject *) type->tp_alloc(type, + n = PyUnicode_GET_SIZE(tmp)); if (pnew == NULL) { Py_DECREF(tmp); return NULL; } - pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); - if (pnew->str == NULL) { - _Py_ForgetReference((PyObject *)pnew); - PyObject_Del(pnew); - Py_DECREF(tmp); - return PyErr_NoMemory(); - } - Py_UNICODE_COPY(pnew->str, tmp->str, n+1); - pnew->length = n; + pnew->wstr = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); + if (pnew->wstr == NULL) { + err = PyErr_NoMemory(); + goto onError; + } + Py_UNICODE_COPY(pnew->wstr, PyUnicode_AS_UNICODE(tmp), n+1); + pnew->wstr_length = n; pnew->hash = tmp->hash; + pnew->state = 0; + pnew->str = NULL; + pnew->length = 0; + pnew->utf8 = NULL; + pnew->utf8_length = 0; + + if (PyUnicode_Ready(pnew) == -1) { + PyObject_FREE(pnew->wstr); + goto onError; + } + Py_DECREF(tmp); return (PyObject *)pnew; + + onError: + _Py_ForgetReference((PyObject *)pnew); + PyObject_Del(pnew); + Py_DECREF(tmp); + return err; } PyDoc_STRVAR(unicode_doc, @@ -10074,11 +11447,9 @@ }; /* Init the implementation */ - free_list = NULL; - numfree = 0; - unicode_empty = _PyUnicode_New(0); + unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0); if (!unicode_empty) - return; + Py_FatalError("Can't create empty string"); for (i = 0; i < 256; i++) unicode_latin1[i] = NULL; @@ -10098,21 +11469,7 @@ int PyUnicode_ClearFreeList(void) { - int freelist_size = numfree; - PyUnicodeObject *u; - - for (u = free_list; u != NULL;) { - PyUnicodeObject *v = u; - u = *(PyUnicodeObject **)u; - if (v->str) - PyObject_DEL(v->str); - Py_XDECREF(v->defenc); - PyObject_Del(v); - numfree--; - } - free_list = NULL; - assert(numfree == 0); - return freelist_size; + return 0; } void @@ -10146,6 +11503,10 @@ return; if (PyUnicode_CHECK_INTERNED(s)) return; + if (PyUnicode_FAST_READY(s) == -1) { + assert(0 && "ready fail in intern..."); + return; + } if (interned == NULL) { interned = PyDict_New(); if (interned == NULL) { @@ 
-10177,15 +11538,18 @@ /* The two references in interned are not counted by refcnt. The deallocator will take care of this */ Py_REFCNT(s) -= 2; - PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; + s->state = (s->state & ~SSTATE_INTERN_MASK) | SSTATE_INTERNED_MORTAL; } void PyUnicode_InternImmortal(PyObject **p) { + PyUnicodeObject *u = (PyUnicodeObject *)*p; + PyUnicode_InternInPlace(p); if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { - PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; + u->state = (u->state & ~SSTATE_INTERN_MASK) | + SSTATE_INTERNED_IMMORTAL; Py_INCREF(*p); } } @@ -10226,22 +11590,24 @@ n); for (i = 0; i < n; i++) { s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); - switch (s->state) { + if (PyUnicode_FAST_READY(s) == -1) + fprintf(stderr, "could not ready string\n"); + switch (PyUnicode_CHECK_INTERNED(s)) { case SSTATE_NOT_INTERNED: /* XXX Shouldn't happen */ break; case SSTATE_INTERNED_IMMORTAL: Py_REFCNT(s) += 1; - immortal_size += s->length; + immortal_size += PyUnicode_GET_LENGTH(s); break; case SSTATE_INTERNED_MORTAL: Py_REFCNT(s) += 2; - mortal_size += s->length; + mortal_size += PyUnicode_GET_LENGTH(s); break; default: Py_FatalError("Inconsistent interned string state."); } - s->state = SSTATE_NOT_INTERNED; + s->state = (s->state & ~SSTATE_INTERN_MASK) | SSTATE_NOT_INTERNED; } fprintf(stderr, "total size of all interned strings: " "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " @@ -10288,9 +11654,11 @@ return NULL; assert(PyUnicode_Check(seq)); - if (it->it_index < PyUnicode_GET_SIZE(seq)) { - item = PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(seq)+it->it_index, 1); + if (it->it_index < PyUnicode_GET_LENGTH(seq)) { + int kind = PyUnicode_KIND(seq); + void *data = PyUnicode_DATA(seq); + Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); + item = PyUnicode_FromOrdinal(chr); if (item != NULL) ++it->it_index; return item; @@ -10360,6 +11728,8 @@ PyErr_BadInternalCall(); return NULL; } + if 
(PyUnicode_FAST_READY(seq) == -1) + return NULL; it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); if (it == NULL) return NULL; @@ -10467,6 +11837,10 @@ Py_UNICODE *copy; Py_ssize_t size; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } /* Ensure we won't overflow the size. */ if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { PyErr_NoMemory(); diff --git a/Python/ast.c b/Python/ast.c --- a/Python/ast.c +++ b/Python/ast.c @@ -528,26 +528,21 @@ new_identifier(const char* n, PyArena *arena) { PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL); - Py_UNICODE *u; - if (!id) + if (!id || PyUnicode_FAST_READY(id) == -1) return NULL; - u = PyUnicode_AS_UNICODE(id); /* Check whether there are non-ASCII characters in the identifier; if so, normalize to NFKC. */ - for (; *u; u++) { - if (*u >= 128) { - PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); - PyObject *id2; - if (!m) - return NULL; - id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id); - Py_DECREF(m); - if (!id2) - return NULL; - Py_DECREF(id); - id = id2; - break; - } + if (PyUnicode_MAX_CHAR_VALUE((PyUnicodeObject *)id) >= 128) { + PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); + PyObject *id2; + if (!m) + return NULL; + id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id); + Py_DECREF(m); + if (!id2) + return NULL; + Py_DECREF(id); + id = id2; } PyUnicode_InternInPlace(&id); PyArena_AddPyObject(arena, id); @@ -3660,20 +3655,14 @@ } static PyObject * -decode_utf8(struct compiling *c, const char **sPtr, const char *end, char* encoding) +decode_utf8(struct compiling *c, const char **sPtr, const char *end) { - PyObject *u, *v; char *s, *t; t = s = (char *)*sPtr; /* while (s < end && *s != '\\') s++; */ /* inefficient for u".." 
*/ while (s < end && (*s & 0x80)) s++; *sPtr = s; - u = PyUnicode_DecodeUTF8(t, s - t, NULL); - if (u == NULL) - return NULL; - v = PyUnicode_AsEncodedString(u, encoding, NULL); - Py_DECREF(u); - return v; + return PyUnicode_DecodeUTF8(t, s - t, NULL); } static PyObject * @@ -3707,22 +3696,20 @@ } if (*s & 0x80) { /* XXX inefficient */ PyObject *w; - char *r; - Py_ssize_t rn, i; - w = decode_utf8(c, &s, end, "utf-32-be"); + int kind; + void *data; + Py_ssize_t len, i; + w = decode_utf8(c, &s, end); if (w == NULL) { Py_DECREF(u); return NULL; } - r = PyBytes_AS_STRING(w); - rn = Py_SIZE(w); - assert(rn % 4 == 0); - for (i = 0; i < rn; i += 4) { - sprintf(p, "\\U%02x%02x%02x%02x", - r[i + 0] & 0xFF, - r[i + 1] & 0xFF, - r[i + 2] & 0xFF, - r[i + 3] & 0xFF); + kind = PyUnicode_KIND(w); + data = PyUnicode_DATA(w); + len = PyUnicode_GET_LENGTH(w); + for (i = 0; i < len; i++) { + Py_UCS4 chr = PyUnicode_READ(kind, data, i); + sprintf(p, "\\U%08x", chr); p += 10; } /* Should be impossible to overflow */ diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -508,8 +508,8 @@ if (PyUnicode_Check(cmd)) { cf->cf_flags |= PyCF_IGNORE_COOKIE; - cmd = _PyUnicode_AsDefaultEncodedString(cmd); - if (cmd == NULL) + str = PyUnicode_AsUTF8AndSize(cmd, &size); + if (str == NULL) return NULL; } else if (!PyObject_CheckReadBuffer(cmd)) { @@ -518,9 +518,10 @@ funcname, what); return NULL; } - if (PyObject_AsReadBuffer(cmd, (const void **)&str, &size) < 0) { + else if (PyObject_AsReadBuffer(cmd, (const void **)&str, &size) < 0) { return NULL; } + if (strlen(str) != size) { PyErr_SetString(PyExc_TypeError, "source code string cannot contain null bytes"); @@ -1395,24 +1396,13 @@ } } else if (PyUnicode_Check(obj)) { - size = PyUnicode_GET_SIZE(obj); + if (PyUnicode_FAST_READY(obj) == -1) + return NULL; + size = PyUnicode_GET_LENGTH(obj); if (size == 1) { - ord = (long)*PyUnicode_AS_UNICODE(obj); + ord = (long)PyUnicode_READ_CHAR(obj, 
0); return PyLong_FromLong(ord); } -#ifndef Py_UNICODE_WIDE - if (size == 2) { - /* Decode a valid surrogate pair */ - int c0 = PyUnicode_AS_UNICODE(obj)[0]; - int c1 = PyUnicode_AS_UNICODE(obj)[1]; - if (0xD800 <= c0 && c0 <= 0xDBFF && - 0xDC00 <= c1 && c1 <= 0xDFFF) { - ord = ((((c0 & 0x03FF) << 10) | (c1 & 0x03FF)) + - 0x00010000); - return PyLong_FromLong(ord); - } - } -#endif } else if (PyByteArray_Check(obj)) { /* XXX Hopefully this is temporary */ diff --git a/Python/ceval.c b/Python/ceval.c --- a/Python/ceval.c +++ b/Python/ceval.c @@ -4456,7 +4456,8 @@ } if (skip_leading_underscores && PyUnicode_Check(name) && - PyUnicode_AS_UNICODE(name)[0] == '_') + PyUnicode_FAST_READY(name) != -1 && + PyUnicode_READ_CHAR(name, 0) == '_') { Py_DECREF(name); continue; @@ -4520,6 +4521,14 @@ { /* This function implements 'variable += expr' when both arguments are (Unicode) strings. */ + + w = PyUnicode_Concat(v, w); + Py_DECREF(v); + return w; + + /* XXX: This optimization is currently disabled as unicode objects in the + new flexible representation are not in-place resizable anymore. */ +#if 0 Py_ssize_t v_len = PyUnicode_GET_SIZE(v); Py_ssize_t w_len = PyUnicode_GET_SIZE(w); Py_ssize_t new_len = v_len + w_len; @@ -4570,7 +4579,8 @@ } } - if (Py_REFCNT(v) == 1 && !PyUnicode_CHECK_INTERNED(v)) { + if (Py_REFCNT(v) == 1 && !PyUnicode_CHECK_INTERNED(v) && + !PyUnicode_IS_COMPACT((PyUnicodeObject *)v)) { /* Now we own the last reference to 'v', so we can resize it * in-place. 
*/ @@ -4594,6 +4604,7 @@ Py_DECREF(v); return w; } +#endif } #ifdef DYNAMIC_EXECUTION_PROFILE diff --git a/Python/compile.c b/Python/compile.c --- a/Python/compile.c +++ b/Python/compile.c @@ -3045,8 +3045,7 @@ return PyObject_IsTrue(e->v.Str.s); case Name_kind: /* optimize away names that can't be reassigned */ - id = PyBytes_AS_STRING( - _PyUnicode_AsDefaultEncodedString(e->v.Name.id)); + id = PyUnicode_AsUTF8(e->v.Name.id); if (strcmp(id, "True") == 0) return 1; if (strcmp(id, "False") == 0) return 0; if (strcmp(id, "None") == 0) return 0; diff --git a/Python/getargs.c b/Python/getargs.c --- a/Python/getargs.c +++ b/Python/getargs.c @@ -546,9 +546,6 @@ -#define UNICODE_DEFAULT_ENCODING(arg) \ - _PyUnicode_AsDefaultEncodedString(arg) - /* Format an error message generated by convertsimple(). */ static char * @@ -611,7 +608,7 @@ const char *format = *p_format; char c = *format++; - PyObject *uarg; + char *sarg; switch (c) { @@ -889,13 +886,12 @@ if (c == 'z' && arg == Py_None) PyBuffer_FillInfo(p, NULL, NULL, 0, 1, 0); else if (PyUnicode_Check(arg)) { - uarg = UNICODE_DEFAULT_ENCODING(arg); - if (uarg == NULL) + Py_ssize_t len; + sarg = PyUnicode_AsUTF8AndSize(arg, &len); + if (sarg == NULL) return converterr(CONV_UNICODE, arg, msgbuf, bufsize); - PyBuffer_FillInfo(p, arg, - PyBytes_AS_STRING(uarg), PyBytes_GET_SIZE(uarg), - 1, 0); + PyBuffer_FillInfo(p, arg, sarg, len, 1, 0); } else { /* any buffer-like object */ char *buf; @@ -918,12 +914,13 @@ STORE_SIZE(0); } else if (PyUnicode_Check(arg)) { - uarg = UNICODE_DEFAULT_ENCODING(arg); - if (uarg == NULL) + Py_ssize_t len; + sarg = PyUnicode_AsUTF8AndSize(arg, &len); + if (sarg == NULL) return converterr(CONV_UNICODE, arg, msgbuf, bufsize); - *p = PyBytes_AS_STRING(uarg); - STORE_SIZE(PyBytes_GET_SIZE(uarg)); + *p = sarg; + STORE_SIZE(len); } else { /* any buffer-like object */ /* XXX Really? 
*/ @@ -937,22 +934,22 @@ } else { /* "s" or "z" */ char **p = va_arg(*p_va, char **); - uarg = NULL; + Py_ssize_t len; + sarg = NULL; if (c == 'z' && arg == Py_None) *p = NULL; else if (PyUnicode_Check(arg)) { - uarg = UNICODE_DEFAULT_ENCODING(arg); - if (uarg == NULL) + sarg = PyUnicode_AsUTF8AndSize(arg, &len); + if (sarg == NULL) return converterr(CONV_UNICODE, arg, msgbuf, bufsize); - *p = PyBytes_AS_STRING(uarg); + *p = sarg; } else return converterr(c == 'z' ? "str or None" : "str", arg, msgbuf, bufsize); - if (*p != NULL && uarg != NULL && - (Py_ssize_t) strlen(*p) != PyBytes_GET_SIZE(uarg)) + if (*p != NULL && sarg != NULL && (Py_ssize_t) strlen(*p) != len) return converterr( c == 'z' ? "str without null bytes or None" : "str without null bytes", diff --git a/Python/marshal.c b/Python/marshal.c --- a/Python/marshal.c +++ b/Python/marshal.c @@ -311,9 +311,7 @@ } else if (PyUnicode_CheckExact(v)) { PyObject *utf8; - utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v), - PyUnicode_GET_SIZE(v), - "surrogatepass"); + utf8 = PyUnicode_AsEncodedString(v, "utf8", "surrogatepass"); if (utf8 == NULL) { p->depth--; p->error = WFERR_UNMARSHALLABLE; diff --git a/configure b/configure --- a/configure +++ b/configure @@ -767,7 +767,6 @@ with_libm with_libc enable_big_digits -with_wide_unicode with_computed_gotos ' ac_precious_vars='build_alias @@ -778,7 +777,8 @@ LDFLAGS LIBS CPPFLAGS -CPP' +CPP +CPPFLAGS' # Initialize some variables set by options. @@ -1438,7 +1438,6 @@ --with-fpectl enable SIGFPE catching --with-libm=STRING math library --with-libc=STRING C library - --with-wide-unicode Use 4-byte Unicode characters (default is 2 bytes) --with(out)-computed-gotos Use computed gotos in evaluation loop (enabled by default on supported compilers) @@ -12319,65 +12318,19 @@ $as_echo "$ac_cv_wchar_t_signed" >&6; } fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking what type to use for str" >&5 -$as_echo_n "checking what type to use for str... 
" >&6; } - -# Check whether --with-wide-unicode was given. -if test "${with_wide_unicode+set}" = set; then : - withval=$with_wide_unicode; -if test "$withval" != no -then unicode_size="4" -else unicode_size="2" -fi - -else - -case "$have_ucs4_tcl" in - yes) unicode_size="4";; - *) unicode_size="2" ;; -esac - -fi - - - -case "$unicode_size" in - 4) - $as_echo "#define Py_UNICODE_SIZE 4" >>confdefs.h - - ABIFLAGS="${ABIFLAGS}u" - ;; - *) $as_echo "#define Py_UNICODE_SIZE 2" >>confdefs.h - ;; -esac - - - # wchar_t is only usable if it maps to an unsigned type -if test "$unicode_size" = "$ac_cv_sizeof_wchar_t" \ +if test "$ac_cv_sizeof_wchar_t" -ge 2 \ -a "$ac_cv_wchar_t_signed" = "no" then - PY_UNICODE_TYPE="wchar_t" + HAVE_USABLE_WCHAR_T="yes" $as_echo "#define HAVE_USABLE_WCHAR_T 1" >>confdefs.h - $as_echo "#define PY_UNICODE_TYPE wchar_t" >>confdefs.h - -elif test "$ac_cv_sizeof_short" = "$unicode_size" -then - PY_UNICODE_TYPE="unsigned short" - $as_echo "#define PY_UNICODE_TYPE unsigned short" >>confdefs.h - -elif test "$ac_cv_sizeof_long" = "$unicode_size" -then - PY_UNICODE_TYPE="unsigned long" - $as_echo "#define PY_UNICODE_TYPE unsigned long" >>confdefs.h - -else - PY_UNICODE_TYPE="no type found" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $PY_UNICODE_TYPE" >&5 -$as_echo "$PY_UNICODE_TYPE" >&6; } +else + HAVE_USABLE_WCHAR_T="no usable wchar_t found" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $HAVE_USABLE_WCHAR_T" >&5 +$as_echo "$HAVE_USABLE_WCHAR_T" >&6; } # check for endianness { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5 diff --git a/configure.in b/configure.in --- a/configure.in +++ b/configure.in @@ -3585,57 +3585,19 @@ AC_MSG_RESULT($ac_cv_wchar_t_signed) fi -AC_MSG_CHECKING(what type to use for str) -AC_ARG_WITH(wide-unicode, - AS_HELP_STRING([--with-wide-unicode], [Use 4-byte Unicode characters (default is 2 bytes)]), -[ -if test "$withval" != no -then unicode_size="4" -else 
unicode_size="2" -fi -], -[ -case "$have_ucs4_tcl" in - yes) unicode_size="4";; - *) unicode_size="2" ;; -esac -]) - -AH_TEMPLATE(Py_UNICODE_SIZE, - [Define as the size of the unicode type.]) -case "$unicode_size" in - 4) - AC_DEFINE(Py_UNICODE_SIZE, 4) - ABIFLAGS="${ABIFLAGS}u" - ;; - *) AC_DEFINE(Py_UNICODE_SIZE, 2) ;; -esac - -AH_TEMPLATE(PY_UNICODE_TYPE, - [Define as the integral type used for Unicode representation.]) - # wchar_t is only usable if it maps to an unsigned type -if test "$unicode_size" = "$ac_cv_sizeof_wchar_t" \ +if test "$ac_cv_sizeof_wchar_t" -ge 2 \ -a "$ac_cv_wchar_t_signed" = "no" then - PY_UNICODE_TYPE="wchar_t" + HAVE_USABLE_WCHAR_T="yes" AC_DEFINE(HAVE_USABLE_WCHAR_T, 1, [Define if you have a useable wchar_t type defined in wchar.h; useable means wchar_t must be an unsigned type with at least 16 bits. (see Include/unicodeobject.h).]) - AC_DEFINE(PY_UNICODE_TYPE,wchar_t) -elif test "$ac_cv_sizeof_short" = "$unicode_size" -then - PY_UNICODE_TYPE="unsigned short" - AC_DEFINE(PY_UNICODE_TYPE,unsigned short) -elif test "$ac_cv_sizeof_long" = "$unicode_size" -then - PY_UNICODE_TYPE="unsigned long" - AC_DEFINE(PY_UNICODE_TYPE,unsigned long) else - PY_UNICODE_TYPE="no type found" + HAVE_USABLE_WCHAR_T="no usable wchar_t found" fi -AC_MSG_RESULT($PY_UNICODE_TYPE) +AC_MSG_RESULT($HAVE_USABLE_WCHAR_T) # check for endianness AC_C_BIGENDIAN diff --git a/pyconfig.h.in b/pyconfig.h.in --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1105,18 +1105,12 @@ /* Define to printf format modifier for Py_ssize_t */ #undef PY_FORMAT_SIZE_T -/* Define as the integral type used for Unicode representation. */ -#undef PY_UNICODE_TYPE - /* Define if you want to build an interpreter with many run-time checks. */ #undef Py_DEBUG /* Defined if Python is built as a shared library. */ #undef Py_ENABLE_SHARED -/* Define as the size of the unicode type. */ -#undef Py_UNICODE_SIZE - /* assume C89 semantics that RETSIGTYPE is always void */ #undef RETSIGTYPE