Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h (revision 87006)
+++ Include/unicodeobject.h (working copy)
@@ -1225,6 +1225,17 @@
     );
 #endif
 
+/* Transforms code points that have decimal digit property to the
+   corresponding ASCII digit code points.
+
+   Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
+    Py_UNICODE *s,              /* Unicode buffer */
+    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
+    );
+
 /* --- File system encoding ---------------------------------------------- */
 
 /* ParseTuple converter: encode str objects to bytes using
Index: Objects/complexobject.c
===================================================================
--- Objects/complexobject.c (revision 87006)
+++ Objects/complexobject.c (working copy)
@@ -766,20 +766,26 @@
     char *end;
     double x=0.0, y=0.0, z;
     int got_bracket=0;
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
     Py_ssize_t len;
 
     if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
         if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL)
             goto error;
-        s = s_buffer;
-        len = strlen(s);
     }
     else if (PyObject_AsCharBuffer(v, &s, &len)) {
         PyErr_SetString(PyExc_TypeError,
@@ -894,16 +900,14 @@
     if (s-start != len)
         goto parse_error;
 
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return complex_subtype_from_doubles(type, x, y);
 
   parse_error:
     PyErr_SetString(PyExc_ValueError,
                     "complex() arg is a malformed string");
   error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return NULL;
 }
 
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 87006)
+++ Objects/unicodeobject.c (working copy)
@@ -6206,6 +6206,30 @@
     return NULL;
 }
 
+PyObject *
+PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
+                                  Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    Py_ssize_t i;
+    /* Allocate first; the buffer may only be touched once that succeeded */
+    result = (PyObject *)_PyUnicode_New(length);
+    if (result == NULL)
+        return NULL;
+    p = PyUnicode_AS_UNICODE(result);
+    Py_UNICODE_COPY(p, s, length);
+    /* Overwrite non-ASCII decimal digits with their ASCII equivalents */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = s[i];
+        if (ch > 127) {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
 /* --- Decimal Encoder ---------------------------------------------------- */
 
 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@@ -8969,6 +8993,13 @@
 }
 #endif
 
+static PyObject *
+unicode__decimal2ascii(PyObject *self)
+{
+    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
+                                             PyUnicode_GET_SIZE(self));
+}
+
 PyDoc_STRVAR(startswith__doc__,
              "S.startswith(prefix[, start[, end]]) -> bool\n\
 \n\
@@ -9108,7 +9139,6 @@
     return Py_BuildValue("(u#)", v->str, v->length);
 }
 
-
 static PyMethodDef unicode_methods[] = {
 
     /* Order is according to common usage: often used methods should
@@ -9170,9 +9200,10 @@
 #endif
 
 #if 0
-    /* This one is just used for debugging the implementation. */
+    /* These methods are just used for debugging the implementation. */
     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
 #endif
+    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
 
     {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
     {NULL, NULL}
Index: Objects/longobject.c
===================================================================
--- Objects/longobject.c (revision 87006)
+++ Objects/longobject.c (working copy)
@@ -2133,17 +2133,34 @@
 PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
 {
     PyObject *result;
-    char *buffer = (char *)PyMem_MALLOC(length+1);
+    PyObject *asciidig;
+    char *buffer, *end;
+    Py_ssize_t i, buflen;
+    Py_UNICODE *ptr;
 
-    if (buffer == NULL)
+    asciidig = PyUnicode_TransformDecimalToASCII(u, length);
+    if (asciidig == NULL)
         return NULL;
-
-    if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) {
-        PyMem_FREE(buffer);
+    /* Replace non-ASCII whitespace with ' ' */
+    ptr = PyUnicode_AS_UNICODE(asciidig);
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = ptr[i];
+        if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+            ptr[i] = ' ';
+    }
+    buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
+    if (buffer == NULL) {
+        Py_DECREF(asciidig);
         return NULL;
     }
-    result = PyLong_FromString(buffer, NULL, base);
-    PyMem_FREE(buffer);
+    result = PyLong_FromString(buffer, &end, base);
+    if (result != NULL && end != buffer + buflen) {
+        PyErr_SetString(PyExc_ValueError,
+                        "null byte in argument for int()");
+        Py_DECREF(result);
+        result = NULL;
+    }
+    Py_DECREF(asciidig);
     return result;
 }
 
Index: Objects/floatobject.c
===================================================================
--- Objects/floatobject.c (revision 87006)
+++ Objects/floatobject.c (working copy)
@@ -174,22 +174,30 @@
 {
     const char *s, *last, *end;
     double x;
-    char buffer[256]; /* for errors */
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
     Py_ssize_t len;
     PyObject *result = NULL;
 
     if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
         if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
-            goto error;
-        s = s_buffer;
-        len = strlen(s);
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL) {
+            Py_DECREF(s_buffer);
+            return NULL;
+        }
+        last = s + len;
     }
     else if (PyObject_AsCharBuffer(v, &s, &len)) {
         PyErr_SetString(PyExc_TypeError,
@@ -197,29 +205,27 @@
         return NULL;
     }
     last = s + len;
-
-    while (Py_ISSPACE(*s))
+    /* strip space */
+    while (s < last && Py_ISSPACE(*s))
         s++;
+    while (s < last - 1 && Py_ISSPACE(last[-1]))
+        last--;
     /* We don't care about overflow or underflow. If the platform
      * supports them, infinities and signed zeroes (on underflow) are
      * fine. */
     x = PyOS_string_to_double(s, (char **)&end, NULL);
-    if (x == -1.0 && PyErr_Occurred())
-        goto error;
-    while (Py_ISSPACE(*end))
-        end++;
-    if (end == last)
-        result = PyFloat_FromDouble(x);
-    else {
-        PyOS_snprintf(buffer, sizeof(buffer),
-                      "invalid literal for float(): %.200s", s);
-        PyErr_SetString(PyExc_ValueError, buffer);
+    if (end != last) {
+        PyErr_Format(PyExc_ValueError,
+                     "could not convert string to float: "
+                     "%R", v);
         result = NULL;
     }
+    else if (x == -1.0 && PyErr_Occurred())
+        result = NULL;
+    else
+        result = PyFloat_FromDouble(x);
 
-  error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
     return result;
 }
 
Index: Misc/NEWS
===================================================================
--- Misc/NEWS (revision 87006)
+++ Misc/NEWS (working copy)
@@ -222,6 +222,10 @@
 C-API
 -----
 
+- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
+  which transforms non-ASCII decimal digits in a Unicode string to their
+  ASCII equivalents.
+
 - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
   zero-initialize all fields, fixing compiler warnings seen when building
   extension modules with gcc with "-Wmissing-field-initializers" (implied by
Index: Doc/c-api/unicode.rst
===================================================================
--- Doc/c-api/unicode.rst (revision 87006)
+++ Doc/c-api/unicode.rst (working copy)
@@ -328,7 +328,14 @@
    Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
    arguments.
 
+.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t length)
 
+   Create a Unicode object by replacing all decimal digits in
+   :c:type:`Py_UNICODE` buffer of the given length by ASCII digits 0--9
+   according to their decimal value. Return *NULL* if an exception
+   occurs.
+
+
 .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)
 
    Return a read-only pointer to the Unicode object's internal :c:type:`Py_UNICODE`
Index: Lib/test/test_complex.py
===================================================================
--- Lib/test/test_complex.py (revision 87006)
+++ Lib/test/test_complex.py (working copy)
@@ -220,6 +220,7 @@
         self.assertEqual(complex(NS(1+10j)), 1+10j)
         self.assertRaises(TypeError, complex, OS(None))
         self.assertRaises(TypeError, complex, NS(None))
+        self.assertRaises(TypeError, complex, {})
 
         self.assertAlmostEqual(complex("1+10j"), 1+10j)
         self.assertAlmostEqual(complex(10), 10+0j)
Index: Lib/test/test_unicode.py
===================================================================
--- Lib/test/test_unicode.py (revision 87006)
+++ Lib/test/test_unicode.py (working copy)
@@ -1168,8 +1168,13 @@
         # Error handling (wrong arguments)
         self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
 
-        # Error handling (PyUnicode_EncodeDecimal())
-        self.assertRaises(UnicodeError, int, "\u0200")
+        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
+        self.assertRaises(UnicodeError, int, "\ud800")
+        self.assertRaises(UnicodeError, int, "\udf00")
+        self.assertRaises(UnicodeError, float, "\ud800")
+        self.assertRaises(UnicodeError, float, "\udf00")
+        self.assertRaises(UnicodeError, complex, "\ud800")
+        self.assertRaises(UnicodeError, complex, "\udf00")
 
     def test_codecs(self):
         # Encoding
Index: Lib/test/test_int.py
===================================================================
--- Lib/test/test_int.py (revision 87006)
+++ Lib/test/test_int.py (working copy)
@@ -20,7 +20,8 @@
         (' 1\02 ', ValueError),
         ('', ValueError),
         (' ', ValueError),
-        (' \t\t ', ValueError)
+        (' \t\t ', ValueError),
+        ("\u0200", ValueError)
 ]
 
 class IntTestCases(unittest.TestCase):
@@ -35,6 +36,8 @@
         self.assertEqual(int(3.5), 3)
         self.assertEqual(int(-3.5), -3)
         self.assertEqual(int("-3"), -3)
+        self.assertEqual(int(" -3 "), -3)
+        self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
         # Different base:
         self.assertEqual(int("10",16), 16)
         # Test conversion from strings and various anomalies
@@ -302,6 +305,16 @@
                         self.fail("Failed to raise TypeError with %s" %
                                   ((base, trunc_result_base),))
 
+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', ' 123 456 ')
+        for s in testlist:
+            try:
+                int(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])
+            else:
+                self.fail("Expected int(%r) to raise a ValueError" % s)
+
 def test_main():
     run_unittest(IntTestCases)
 
Index: Lib/test/test_float.py
===================================================================
--- Lib/test/test_float.py (revision 87006)
+++ Lib/test/test_float.py (working copy)
@@ -43,14 +43,30 @@
         self.assertRaises(ValueError, float, "+.inf")
         self.assertRaises(ValueError, float, ".")
         self.assertRaises(ValueError, float, "-.")
+        self.assertRaises(ValueError, float, b"-")
+        self.assertRaises(TypeError, float, {})
+        # Lone surrogate
+        self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
         # check that we don't accept alternate exponent markers
         self.assertRaises(ValueError, float, "-1.7d29")
         self.assertRaises(ValueError, float, "3D-14")
-        self.assertEqual(float(b" \u0663.\u0661\u0664 ".decode('raw-unicode-escape')), 3.14)
+        self.assertEqual(float(" \u0663.\u0661\u0664 "), 3.14)
+        self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
         # extra long strings should not be a problem
         float(b'.' + b'1'*1000)
         float('.' + '1'*1000)
 
+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', ' 123 456 ')
+        for s in testlist:
+            try:
+                float(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])
+            else:
+                self.fail("Expected float(%r) to raise a ValueError" % s)
+
+
     @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
     def test_float_with_comma(self):
         # set locale to something that doesn't use '.' for the decimal point