diff -r d3afe5d8a4da Doc/library/array.rst --- a/Doc/library/array.rst Wed Aug 01 11:09:55 2012 +0200 +++ b/Doc/library/array.rst Wed Aug 01 14:43:35 2012 +0200 @@ -21,7 +21,7 @@ defined: +-----------+--------------------+-------------------+-----------------------+-------+ | ``'B'`` | unsigned char | int | 1 | | +-----------+--------------------+-------------------+-----------------------+-------+ -| ``'u'`` | Py_UCS4 | Unicode character | 4 | | +| ``'u'`` | Py_UNICODE | Unicode character | 2 | \(1) | +-----------+--------------------+-------------------+-----------------------+-------+ | ``'h'`` | signed short | int | 2 | | +-----------+--------------------+-------------------+-----------------------+-------+ @@ -35,9 +35,9 @@ defined: +-----------+--------------------+-------------------+-----------------------+-------+ | ``'L'`` | unsigned long | int | 4 | | +-----------+--------------------+-------------------+-----------------------+-------+ -| ``'q'`` | signed long long | int | 8 | \(1) | +| ``'q'`` | signed long long | int | 8 | \(2) | +-----------+--------------------+-------------------+-----------------------+-------+ -| ``'Q'`` | unsigned long long | int | 8 | \(1) | +| ``'Q'`` | unsigned long long | int | 8 | \(2) | +-----------+--------------------+-------------------+-----------------------+-------+ | ``'f'`` | float | float | 4 | | +-----------+--------------------+-------------------+-----------------------+-------+ @@ -47,6 +47,11 @@ defined: Notes: (1) + The ``'u'`` type code corresponds to Python's Unicode character + (:c:type:`Py_UNICODE`, which is :c:type:`wchar_t`). Depending on the + platform, it can be 16 bits or 32 bits. + +(2) The ``'q'`` and ``'Q'`` type codes are available only if the platform C compiler used to build Python supports C :c:type:`long long`, or, on Windows, :c:type:`__int64`. 
diff -r d3afe5d8a4da Modules/arraymodule.c --- a/Modules/arraymodule.c Wed Aug 01 11:09:55 2012 +0200 +++ b/Modules/arraymodule.c Wed Aug 01 14:43:35 2012 +0200 @@ -174,25 +174,24 @@ BB_setitem(arrayobject *ap, Py_ssize_t i static PyObject * u_getitem(arrayobject *ap, Py_ssize_t i) { - return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]); + return PyUnicode_FromUnicode(&((Py_UNICODE *) ap->ob_item)[i], 1); } static int u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v) { - PyObject *p; + Py_UNICODE *p; + Py_ssize_t len; - if (!PyArg_Parse(v, "U;array item must be unicode character", &p)) + if (!PyArg_Parse(v, "u#;array item must be unicode character", &p, &len)) return -1; - if (PyUnicode_READY(p)) - return -1; - if (PyUnicode_GET_LENGTH(p) != 1) { + if (len != 1) { PyErr_SetString(PyExc_TypeError, "array item must be unicode character"); return -1; } if (i >= 0) - ((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_READ_CHAR(p, 0); + ((Py_UNICODE *)ap->ob_item)[i] = p[0]; return 0; } @@ -444,13 +443,6 @@ d_setitem(arrayobject *ap, Py_ssize_t i, return 0; } -#if SIZEOF_INT == 4 -# define STRUCT_LONG_FORMAT "I" -#elif SIZEOF_LONG == 4 -# define STRUCT_LONG_FORMAT "L" -#else -# error "Unable to get struct format for Py_UCS4" -#endif /* Description of types. * @@ -460,7 +452,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, static struct arraydescr descriptors[] = { {'b', 1, b_getitem, b_setitem, "b", 1, 1}, {'B', 1, BB_getitem, BB_setitem, "B", 1, 0}, - {'u', sizeof(Py_UCS4), u_getitem, u_setitem, STRUCT_LONG_FORMAT, 0, 0}, + {'u', sizeof(Py_UNICODE), u_getitem, u_setitem, "u", 0, 0}, {'h', sizeof(short), h_getitem, h_setitem, "h", 1, 1}, {'H', sizeof(short), HH_getitem, HH_setitem, "H", 1, 0}, {'i', sizeof(int), i_getitem, i_setitem, "i", 1, 1}, @@ -1519,26 +1511,25 @@ This method is deprecated. 
Use tobytes i static PyObject * array_fromunicode(arrayobject *self, PyObject *args) { - PyObject *ustr; + Py_UNICODE *ustr; Py_ssize_t n; + char typecode; - if (!PyArg_ParseTuple(args, "U:fromunicode", &ustr)) + if (!PyArg_ParseTuple(args, "u#:fromunicode", &ustr, &n)) return NULL; - if (self->ob_descr->typecode != 'u') { + typecode = self->ob_descr->typecode; + if ((typecode != 'u')) { PyErr_SetString(PyExc_ValueError, "fromunicode() may only be called on " "unicode type arrays"); return NULL; } - if (PyUnicode_READY(ustr)) - return NULL; - n = PyUnicode_GET_LENGTH(ustr); if (n > 0) { Py_ssize_t old_size = Py_SIZE(self); if (array_resize(self, old_size + n) == -1) return NULL; - if (!PyUnicode_AsUCS4(ustr, (Py_UCS4 *)self->ob_item + old_size, n, 0)) - return NULL; + memcpy(self->ob_item + old_size * sizeof(Py_UNICODE), + ustr, n * sizeof(Py_UNICODE)); } Py_INCREF(Py_None); @@ -1557,14 +1548,14 @@ append Unicode data to an array of some static PyObject * array_tounicode(arrayobject *self, PyObject *unused) { - if (self->ob_descr->typecode != 'u') { + char typecode; + typecode = self->ob_descr->typecode; + if ((typecode != 'u')) { PyErr_SetString(PyExc_ValueError, "tounicode() may only be called on unicode type arrays"); return NULL; } - return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, - (Py_UCS4 *) self->ob_item, - Py_SIZE(self)); + return PyUnicode_FromUnicode((Py_UNICODE *) self->ob_item, Py_SIZE(self)); } PyDoc_STRVAR(tounicode_doc, @@ -1671,7 +1662,13 @@ typecode_to_mformat_code(char typecode) return UNSIGNED_INT8; case 'u': - return UTF32_LE + is_big_endian; + if (sizeof(Py_UNICODE) == 2) { + return UTF16_LE + is_big_endian; + } + if (sizeof(Py_UNICODE) == 4) { + return UTF32_LE + is_big_endian; + } + return UNKNOWN_FORMAT; case 'f': if (sizeof(float) == 4) { @@ -2419,8 +2416,14 @@ array_buffer_getbuf(arrayobject *self, P view->strides = &(view->itemsize); view->format = NULL; view->internal = NULL; - if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) + if 
((flags & PyBUF_FORMAT) == PyBUF_FORMAT) { view->format = self->ob_descr->formats; +#ifdef Py_UNICODE_WIDE + if (self->ob_descr->typecode == 'u') { + view->format = "w"; + } +#endif + } finish: self->ob_exports++; @@ -2534,25 +2537,19 @@ array_new(PyTypeObject *type, PyObject * Py_DECREF(v); } else if (initial != NULL && PyUnicode_Check(initial)) { - Py_ssize_t n; - if (PyUnicode_READY(initial)) { - Py_DECREF(a); - return NULL; - } - n = PyUnicode_GET_LENGTH(initial); + Py_ssize_t n = PyUnicode_GET_DATA_SIZE(initial); if (n > 0) { arrayobject *self = (arrayobject *)a; - Py_UCS4 *item = (Py_UCS4 *)self->ob_item; - item = (Py_UCS4 *)PyMem_Realloc(item, n * sizeof(Py_UCS4)); + char *item = self->ob_item; + item = (char *)PyMem_Realloc(item, n); if (item == NULL) { PyErr_NoMemory(); Py_DECREF(a); return NULL; } - self->ob_item = (char*)item; - Py_SIZE(self) = n; - if (!PyUnicode_AsUCS4(initial, item, n, 0)) - return NULL; + self->ob_item = item; + Py_SIZE(self) = n / sizeof(Py_UNICODE); + memcpy(item, PyUnicode_AS_DATA(initial), n); self->allocated = Py_SIZE(self); } } @@ -2593,7 +2590,7 @@ is a single character. The following ty Type code C Type Minimum size in bytes \n\ 'b' signed integer 1 \n\ 'B' unsigned integer 1 \n\ - 'u' Unicode character 4 \n\ + 'u' Unicode character 2 (see note) \n\ 'h' signed integer 2 \n\ 'H' unsigned integer 2 \n\ 'i' signed integer 2 \n\ @@ -2605,6 +2602,9 @@ is a single character. The following ty 'f' floating point 4 \n\ 'd' floating point 8 \n\ \n\ +NOTE: The 'u' typecode corresponds to Python's unicode character. On \n\ +narrow builds this is 2-bytes on wide builds this is 4-bytes.\n\ +\n\ NOTE: The 'q' and 'Q' type codes are only available if the platform \n\ C compiler used to build Python supports 'long long', or, on Windows, \n\ '__int64'.\n\