Index: Objects/stringobject.c
===================================================================
--- Objects/stringobject.c	(revision 58601)
+++ Objects/stringobject.c	(working copy)
@@ -3020,16 +3020,171 @@
 static PyObject *
 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-	PyObject *x = NULL;
-	static char *kwlist[] = {"object", 0};
+	/* Constructor for str8([object[, encoding[, errors]]]).
+	 *
+	 * Subtypes are delegated to str_subtype_new().  Otherwise:
+	 *  - no object: return the empty string;
+	 *  - unicode object: encode it through the codec registry (an
+	 *    explicit encoding is required);
+	 *  - object supporting the buffer interface: copy its contents;
+	 *  - any other iterable: every item must be an integer in
+	 *    range(0, 256) and becomes one byte of the result.
+	 *
+	 * Returns a new reference, or NULL with an exception set.
+	 */
+	PyObject *x = NULL, *it;
+	PyObject *(*iternext)(PyObject *);
+	const char *encoding = NULL;
+	const char *errors = NULL;
+	PyObject *new = NULL;
+	Py_ssize_t i, size;
+	static char *kwlist[] = {"object", "encoding", "errors", 0};
 
 	if (type != &PyString_Type)
 		return str_subtype_new(type, args, kwds);
-	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str8", kwlist, &x))
+	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str8", kwlist, &x,
+					 &encoding, &errors))
 		return NULL;
-	if (x == NULL)
+	if (x == NULL) {
+		if (encoding != NULL || errors != NULL) {
+			PyErr_SetString(PyExc_TypeError,
+					"encoding or errors without sequence "
+					"argument");
+			return NULL;
+		}
 		return PyString_FromString("");
-	return PyObject_Str(x);
+	}
+
+	if (PyUnicode_Check(x)) {
+		/* Encode via the codec registry */
+		if (encoding == NULL) {
+			PyErr_SetString(PyExc_TypeError,
+					"string argument without an encoding");
+			return NULL;
+		}
+		new = PyCodec_Encode(x, encoding, errors);
+		if (new == NULL)
+			return NULL;
+		/* XXX(gb): must accept bytes here since codecs output bytes
+		   at the moment */
+		if (PyBytes_Check(new)) {
+			PyObject *str;
+			/* Copy by explicit length: the encoded data may contain
+			   NUL bytes (e.g. UTF-16), which PyString_FromString()
+			   would treat as the end of the string. */
+			str = PyString_FromStringAndSize(PyBytes_AsString(new),
+							 PyBytes_Size(new));
+			Py_DECREF(new);
+			if (!str)
+				return NULL;
+			return str;
+		}
+		if (!PyString_Check(new)) {
+			PyErr_Format(PyExc_TypeError,
+				     "encoder did not return a str8 "
+				     "object (type=%.400s)",
+				     Py_Type(new)->tp_name);
+			Py_DECREF(new);
+			return NULL;
+		}
+		return new;
+	}
+
+	/* If it's not unicode, there can't be encoding or errors */
+	if (encoding != NULL || errors != NULL) {
+		PyErr_SetString(PyExc_TypeError,
+				"encoding or errors without a string argument");
+		return NULL;
+	}
+
+	/* Use the modern buffer interface */
+	if (PyObject_CheckBuffer(x)) {
+		Py_buffer view;
+		if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
+			return NULL;
+		new = PyString_FromStringAndSize(NULL, view.len);
+		if (!new)
+			goto fail;
+		/* XXX(brett.cannon): Better way to get to internal buffer? */
+		if (PyBuffer_ToContiguous(((PyStringObject *)new)->ob_sval,
+					  &view, view.len, 'C') < 0)
+			goto fail;
+		PyObject_ReleaseBuffer(x, &view);
+		return new;
+	  fail:
+		Py_XDECREF(new);
+		PyObject_ReleaseBuffer(x, &view);
+		return NULL;
+	}
+
+	/* For the iterator version, create a string object and resize as needed. */
+	/* XXX(gb): is 64 a good value? also, optimize this if length is known */
+	size = 64;
+	new = PyString_FromStringAndSize(NULL, size);
+	if (new == NULL)
+		return NULL;
+
+	/* XXX Optimize this if the argument is a list or tuple */
+
+	/* Get the iterator */
+	it = PyObject_GetIter(x);
+	if (it == NULL)
+		goto error;
+	/* XXX(brett.cannon): No API for this? */
+	iternext = *Py_Type(it)->tp_iternext;
+
+	/* Run the iterator to exhaustion */
+	for (i = 0; ; i++) {
+		PyObject *item;
+		Py_ssize_t value;
+
+		/* Get the next item */
+		item = iternext(it);
+		if (item == NULL) {
+			if (PyErr_Occurred()) {
+				if (!PyErr_ExceptionMatches(PyExc_StopIteration))
+					goto error;
+				PyErr_Clear();
+			}
+			break;
+		}
+
+		/* Interpret it as an int (__index__) */
+		value = PyNumber_AsSsize_t(item, PyExc_ValueError);
+		Py_DECREF(item);
+		if (value == -1 && PyErr_Occurred())
+			goto error;
+
+		/* Range check */
+		if (value < 0 || value >= 256) {
+			PyErr_SetString(PyExc_ValueError,
+					"bytes must be in range(0, 256)");
+			goto error;
+		}
+
+		/* Append the byte */
+		if (i >= size) {
+			size *= 2;
+			if (_PyString_Resize(&new, size) < 0)
+				goto error;
+		}
+		((PyStringObject *)new)->ob_sval[i] = value;
+	}
+	/* Trim the over-allocated buffer to the number of bytes stored */
+	if (_PyString_Resize(&new, i) < 0)
+		goto error;
+
+	/* Clean up and return success */
+	Py_DECREF(it);
+	return new;
+
+  error:
+	/* 'it' is NULL when PyObject_GetIter() failed, and 'new' is NULL
+	   when _PyString_Resize() failed (it clears the pointer on error),
+	   so both must be released with Py_XDECREF. */
+	Py_XDECREF(it);
+	Py_XDECREF(new);
+	return NULL;
 }
 
 static PyObject *
Index: Lib/modulefinder.py
===================================================================
--- Lib/modulefinder.py	(revision 58601)
+++ Lib/modulefinder.py	(working copy)
@@ -17,12 +17,12 @@
     READ_MODE = "r"
 
 # XXX Clean up once str8's cstor matches bytes.
-LOAD_CONST = str8(chr(dis.opname.index('LOAD_CONST')))
-IMPORT_NAME = str8(chr(dis.opname.index('IMPORT_NAME')))
-STORE_NAME = str8(chr(dis.opname.index('STORE_NAME')))
-STORE_GLOBAL = str8(chr(dis.opname.index('STORE_GLOBAL')))
+LOAD_CONST = str8([dis.opname.index('LOAD_CONST')])
+IMPORT_NAME = str8([dis.opname.index('IMPORT_NAME')])
+STORE_NAME = str8([dis.opname.index('STORE_NAME')])
+STORE_GLOBAL = str8([dis.opname.index('STORE_GLOBAL')])
 STORE_OPS = [STORE_NAME, STORE_GLOBAL]
-HAVE_ARGUMENT = str8(chr(dis.HAVE_ARGUMENT))
+HAVE_ARGUMENT = str8([dis.HAVE_ARGUMENT])
 
 # Modulefinder does a good job at simulating Python's, but it can not
 # handle __path__ modifications packages make at runtime.
Therefore there @@ -368,7 +368,7 @@ consts = co.co_consts LOAD_LOAD_AND_IMPORT = LOAD_CONST + LOAD_CONST + IMPORT_NAME while code: - c = str8(chr(code[0])) + c = str8([code[0]]) if c in STORE_OPS: oparg, = unpack('>> import pickle ->>> x = [1, 2, (3, 4), {str8('abc'): "def"}] +>>> x = [1, 2, (3, 4), {str8(b'abc'): "def"}] >>> pkl = pickle.dumps(x, 0) >>> dis(pkl) 0: ( MARK Index: Lib/test/test_io.py =================================================================== --- Lib/test/test_io.py (revision 58601) +++ Lib/test/test_io.py (working copy) @@ -88,7 +88,7 @@ self.assertEqual(f.tell(), 6) self.assertEqual(f.seek(-1, 1), 5) self.assertEqual(f.tell(), 5) - self.assertEqual(f.write(str8(" world\n\n\n")), 9) + self.assertEqual(f.write(str8(b" world\n\n\n")), 9) self.assertEqual(f.seek(0), 0) self.assertEqual(f.write(b"h"), 1) self.assertEqual(f.seek(-1, 2), 13) Index: Lib/test/test_unicode.py =================================================================== --- Lib/test/test_unicode.py (revision 58601) +++ Lib/test/test_unicode.py (working copy) @@ -201,8 +201,8 @@ self.assertRaises(TypeError, 'replace'.replace, "r", 42) def test_str8_comparison(self): - self.assertEqual('abc' == str8('abc'), False) - self.assertEqual('abc' != str8('abc'), True) + self.assertEqual('abc' == str8(b'abc'), False) + self.assertEqual('abc' != str8(b'abc'), True) def test_comparison(self): # Comparisons: Index: Lib/test/test_compile.py =================================================================== --- Lib/test/test_compile.py (revision 58601) +++ Lib/test/test_compile.py (working copy) @@ -157,7 +157,7 @@ s256 = "".join(["\n"] * 256 + ["spam"]) co = compile(s256, 'fn', 'exec') self.assertEqual(co.co_firstlineno, 257) - self.assertEqual(co.co_lnotab, str8('')) + self.assertEqual(co.co_lnotab, str8()) def test_literals_with_leading_zeroes(self): for arg in ["077787", "0xj", "0x.", "0e", "090000000000000", Index: Lib/test/test_codeccallbacks.py 
=================================================================== --- Lib/test/test_codeccallbacks.py (revision 58601) +++ Lib/test/test_codeccallbacks.py (working copy) @@ -181,7 +181,7 @@ # mapped through the encoding again. This means, that # to be able to use e.g. the "replace" handler, the # charmap has to have a mapping for "?". - charmap = dict((ord(c), str8(2*c.upper())) for c in "abcdefgh") + charmap = dict((ord(c), str8(2*c.upper(), 'ascii')) for c in "abcdefgh") sin = "abc" sout = b"AABBCC" self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout) @@ -189,7 +189,7 @@ sin = "abcA" self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) - charmap[ord("?")] = str8("XYZ") + charmap[ord("?")] = str8(b"XYZ") sin = "abcDEF" sout = b"AABBCCXYZXYZXYZ" self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout) @@ -309,7 +309,7 @@ # check with one argument too much self.assertRaises(TypeError, exctype, *(args + ["too much"])) # check with one argument of the wrong type - wrongargs = [ "spam", str8("eggs"), b"spam", 42, 1.0, None ] + wrongargs = [ "spam", str8(b"eggs"), b"spam", 42, 1.0, None ] for i in range(len(args)): for wrongarg in wrongargs: if type(wrongarg) is type(args[i]): Index: Lib/test/test_locale.py =================================================================== --- Lib/test/test_locale.py (revision 58601) +++ Lib/test/test_locale.py (working copy) @@ -82,7 +82,7 @@ # Test BSD Rune locale's bug for isctype functions. def teststrop(s, method, output): - s = str8(s) + s = str8(s, 'latin1') # XXX if verbose: print("%s.%s() =? %s ..." 
% (repr(s), method, repr(output)), end=' ') result = getattr(s, method)() Index: Lib/test/test_unicodedata.py =================================================================== --- Lib/test/test_unicodedata.py (revision 58601) +++ Lib/test/test_unicodedata.py (working copy) @@ -176,7 +176,7 @@ def test_east_asian_width(self): eaw = self.db.east_asian_width - self.assertRaises(TypeError, eaw, str8('a')) + self.assertRaises(TypeError, eaw, str8(b'a')) self.assertRaises(TypeError, eaw, '') self.assertRaises(TypeError, eaw, 'ra') self.assertEqual(eaw('\x1e'), 'N') Index: Lib/test/testcodec.py =================================================================== --- Lib/test/testcodec.py (revision 58601) +++ Lib/test/testcodec.py (working copy) @@ -36,7 +36,7 @@ decoding_map = codecs.make_identity_dict(range(256)) decoding_map.update({ 0x78: "abc", # 1-n decoding mapping - str8("abc"): 0x0078,# 1-n encoding mapping + str8(b"abc"): 0x0078,# 1-n encoding mapping 0x01: None, # decoding mapping to <undefined> 0x79: "", # decoding mapping to <remove character> }) Index: Lib/test/test_struct.py =================================================================== --- Lib/test/test_struct.py (revision 58601) +++ Lib/test/test_struct.py (working copy) @@ -101,7 +101,7 @@ simple_err(struct.unpack, 'iii', s) simple_err(struct.unpack, 'i', s) -c = str8('a') +c = str8(b'a') b = 1 h = 255 i = 65535 @@ -186,7 +186,7 @@ if isinstance(arg, str): # Strings are returned as str8 since you can't know the encoding of # the string when packed. 
-            arg = str8(arg)
+            arg = str8(arg, 'latin1')
         if rev != arg and not asy:
             raise TestFailed("unpack(%r, %r) -> (%r,) # expected (%r,)" % (
                 fmt, res, rev, arg))
@@ -428,14 +428,14 @@
 
 def test_p_code():
     for code, input, expected, expectedback in [
-            ('p','abc', '\x00', str8('')),
-            ('1p', 'abc', '\x00', str8('')),
-            ('2p', 'abc', '\x01a', str8('a')),
-            ('3p', 'abc', '\x02ab', str8('ab')),
-            ('4p', 'abc', '\x03abc', str8('abc')),
-            ('5p', 'abc', '\x03abc\x00', str8('abc')),
-            ('6p', 'abc', '\x03abc\x00\x00', str8('abc')),
-            ('1000p', 'x'*1000, '\xff' + 'x'*999, str8('x'*255))]:
+            ('p','abc', '\x00', str8()),
+            ('1p', 'abc', '\x00', str8()),
+            ('2p', 'abc', '\x01a', str8(b'a')),
+            ('3p', 'abc', '\x02ab', str8(b'ab')),
+            ('4p', 'abc', '\x03abc', str8(b'abc')),
+            ('5p', 'abc', '\x03abc\x00', str8(b'abc')),
+            ('6p', 'abc', '\x03abc\x00\x00', str8(b'abc')),
+            ('1000p', 'x'*1000, '\xff' + 'x'*999, str8(b'x'*255))]:
         expected = bytes(expected, "latin-1")
         got = struct.pack(code, input)
         if got != expected:
@@ -564,20 +564,30 @@
         if verbose:
             print("test_unpack_from using", cls.__name__)
         data = cls(test_string)
-        vereq(s.unpack_from(data), (str8('abcd'),))
-        vereq(s.unpack_from(data, 2), (str8('cd01'),))
-        vereq(s.unpack_from(data, 4), (str8('0123'),))
+        if not isinstance(data, (str8, bytes)):
+            bytes_data = str8(data, 'latin1')
+        else:
+            bytes_data = data
+        vereq(s.unpack_from(data), (str8(b'abcd'),))
+        vereq(s.unpack_from(data, 2), (str8(b'cd01'),))
+        vereq(s.unpack_from(data, 4), (str8(b'0123'),))
         for i in range(6):
-            vereq(s.unpack_from(data, i), (str8(data[i:i+4]),))
+            vereq(s.unpack_from(data, i), (bytes_data[i:i+4],))
         for i in range(6, len(test_string) + 1):
             simple_err(s.unpack_from, data, i)
     for cls in (str, str8, bytes): # XXX + memoryview
         data = cls(test_string)
-        vereq(struct.unpack_from(fmt, data), (str8('abcd'),))
-        vereq(struct.unpack_from(fmt, data, 2), (str8('cd01'),))
-        vereq(struct.unpack_from(fmt, data, 4), (str8('0123'),))
+        # Derive the expected bytes from this iteration's data instead of
+        # relying on the value left over from the loop above.
+        if not isinstance(data, (str8, bytes)):
+            bytes_data = str8(data, 'latin1')
+        else:
+            bytes_data = data
+        vereq(struct.unpack_from(fmt, data), (str8(b'abcd'),))
+        vereq(struct.unpack_from(fmt, data, 2), (str8(b'cd01'),))
+        vereq(struct.unpack_from(fmt, data, 4), (str8(b'0123'),))
         for i in range(6):
-            vereq(struct.unpack_from(fmt, data, i), (str8(data[i:i+4]),))
+            vereq(struct.unpack_from(fmt, data, i), (bytes_data[i:i+4],))
         for i in range(6, len(test_string) + 1):
             simple_err(struct.unpack_from, fmt, data, i)
 
Index: Lib/test/test_bytes.py
===================================================================
--- Lib/test/test_bytes.py	(revision 58601)
+++ Lib/test/test_bytes.py	(working copy)
@@ -103,33 +103,33 @@
         self.failIf(b3 <= b2)
 
     def test_compare_to_str(self):
-        self.assertEqual(b"abc" == str8("abc"), True)
-        self.assertEqual(b"ab" != str8("abc"), True)
-        self.assertEqual(b"ab" <= str8("abc"), True)
-        self.assertEqual(b"ab" < str8("abc"), True)
-        self.assertEqual(b"abc" >= str8("ab"), True)
-        self.assertEqual(b"abc" > str8("ab"), True)
+        self.assertEqual(b"abc" == str8(b"abc"), True)
+        self.assertEqual(b"ab" != str8(b"abc"), True)
+        self.assertEqual(b"ab" <= str8(b"abc"), True)
+        self.assertEqual(b"ab" < str8(b"abc"), True)
+        self.assertEqual(b"abc" >= str8(b"ab"), True)
+        self.assertEqual(b"abc" > str8(b"ab"), True)
 
-        self.assertEqual(b"abc" != str8("abc"), False)
-        self.assertEqual(b"ab" == str8("abc"), False)
-        self.assertEqual(b"ab" > str8("abc"), False)
-        self.assertEqual(b"ab" >= str8("abc"), False)
-        self.assertEqual(b"abc" < str8("ab"), False)
-        self.assertEqual(b"abc" <= str8("ab"), False)
+        self.assertEqual(b"abc" != str8(b"abc"), False)
+        self.assertEqual(b"ab" == str8(b"abc"), False)
+        self.assertEqual(b"ab" > str8(b"abc"), False)
+        self.assertEqual(b"ab" >= str8(b"abc"), False)
+        self.assertEqual(b"abc" < str8(b"ab"), False)
+        self.assertEqual(b"abc" <= str8(b"ab"), False)
 
-        self.assertEqual(str8("abc") == b"abc", True)
-        self.assertEqual(str8("ab") != b"abc", True)
-        self.assertEqual(str8("ab") <= b"abc", True)
-        self.assertEqual(str8("ab") < b"abc", True)
-        self.assertEqual(str8("abc") >= 
b"ab", True) - self.assertEqual(str8("abc") > b"ab", True) + self.assertEqual(str8(b"abc") == b"abc", True) + self.assertEqual(str8(b"ab") != b"abc", True) + self.assertEqual(str8(b"ab") <= b"abc", True) + self.assertEqual(str8(b"ab") < b"abc", True) + self.assertEqual(str8(b"abc") >= b"ab", True) + self.assertEqual(str8(b"abc") > b"ab", True) - self.assertEqual(str8("abc") != b"abc", False) - self.assertEqual(str8("ab") == b"abc", False) - self.assertEqual(str8("ab") > b"abc", False) - self.assertEqual(str8("ab") >= b"abc", False) - self.assertEqual(str8("abc") < b"ab", False) - self.assertEqual(str8("abc") <= b"ab", False) + self.assertEqual(str8(b"abc") != b"abc", False) + self.assertEqual(str8(b"ab") == b"abc", False) + self.assertEqual(str8(b"ab") > b"abc", False) + self.assertEqual(str8(b"ab") >= b"abc", False) + self.assertEqual(str8(b"abc") < b"ab", False) + self.assertEqual(str8(b"abc") <= b"ab", False) # Byte comparisons with unicode should always fail! # Test this for all expected byte orders and Unicode character sizes @@ -345,7 +345,7 @@ self.assertEqual(b.decode("utf8", "ignore"), "Hello world\n") def test_from_buffer(self): - sample = str8("Hello world\n\x80\x81\xfe\xff") + sample = str8(b"Hello world\n\x80\x81\xfe\xff") buf = memoryview(sample) b = bytes(buf) self.assertEqual(b, bytes(sample)) @@ -367,8 +367,8 @@ b1 = b"abc" b2 = b"def" self.assertEqual(b1 + b2, b"abcdef") - self.assertEqual(b1 + str8("def"), b"abcdef") - self.assertEqual(str8("def") + b1, b"defabc") + self.assertEqual(b1 + str8(b"def"), b"abcdef") + self.assertEqual(str8(b"def") + b1, b"defabc") self.assertRaises(TypeError, lambda: b1 + "def") self.assertRaises(TypeError, lambda: "abc" + b2) @@ -391,7 +391,7 @@ self.assertEqual(b, b"abcdef") self.assertEqual(b, b1) self.failUnless(b is b1) - b += str8("xyz") + b += str8(b"xyz") self.assertEqual(b, b"abcdefxyz") try: b += "" Index: Lib/test/test_builtin.py =================================================================== --- 
Lib/test/test_builtin.py (revision 58601) +++ Lib/test/test_builtin.py (working copy) @@ -580,7 +580,8 @@ self.assertEqual(hash(1), hash(1)) self.assertEqual(hash(1), hash(1.0)) hash('spam') - self.assertEqual(hash('spam'), hash(str8('spam'))) + self.assertEqual(hash('spam'), hash(str8(b'spam'))) # remove str8() + # when b"" is immutable hash((0,1,2,3)) def f(): pass self.assertRaises(TypeError, hash, []) Index: Lib/test/test_sys.py =================================================================== --- Lib/test/test_sys.py (revision 58601) +++ Lib/test/test_sys.py (working copy) @@ -300,7 +300,7 @@ def test_intern(self): self.assertRaises(TypeError, sys.intern) - s = str8("never interned before") + s = str8(b"never interned before") self.assert_(sys.intern(s) is s) s2 = s.swapcase().swapcase() self.assert_(sys.intern(s2) is s) @@ -314,7 +314,7 @@ def __hash__(self): return 123 - self.assertRaises(TypeError, sys.intern, S("abc")) + self.assertRaises(TypeError, sys.intern, S(b"abc")) s = "never interned as unicode before" self.assert_(sys.intern(s) is s)