diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -318,11 +318,13 @@ # XXX: could expand category return charset # cannot compress except IndexError: - # non-BMP characters + # non-BMP characters; XXX now they should work return charset if negate: if sys.maxunicode != 65535: # XXX: negation does not work with big charsets + # XXX2: now they should work, but removing this will make the + # charmap 17 times bigger return charset for i in range(65536): charmap[i] = not charmap[i] diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -19,6 +19,8 @@ # kind of outer loop. protocols = range(pickle.HIGHEST_PROTOCOL + 1) +# XXX: with PEP393 the size is variable, but character_size is always set to 4. +# This might or might not be correct (the tests seem to pass though) character_size = 4 if sys.maxunicode > 0xFFFF else 2 diff --git a/Lib/test/test_bigaddrspace.py b/Lib/test/test_bigaddrspace.py --- a/Lib/test/test_bigaddrspace.py +++ b/Lib/test/test_bigaddrspace.py @@ -54,7 +54,8 @@ class StrTest(unittest.TestCase): - + # XXX: with PEP393 the size is variable, but unicodesize is always set to 4. + # This might or might not be correct (the tests seem to pass though) unicodesize = 2 if sys.maxunicode < 65536 else 4 @bigaddrspacetest diff --git a/Lib/test/test_bigmem.py b/Lib/test/test_bigmem.py --- a/Lib/test/test_bigmem.py +++ b/Lib/test/test_bigmem.py @@ -62,6 +62,8 @@ # fail as well. I do not know whether it is due to memory fragmentation # issues, or other specifics of the platform malloc() routine. +# XXX: with PEP393 the size is variable, but character_size is always set to 4. +# This might or might not be correct (the tests seem to pass though) character_size = 4 if sys.maxunicode > 0xFFFF else 2 diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -249,8 +249,7 @@ self.assertEqual(chr(0xff), '\xff') self.assertRaises(ValueError, chr, 1<<24) self.assertEqual(chr(sys.maxunicode), - str(('\\U%08x' % (sys.maxunicode)).encode("ascii"), - 'unicode-escape')) + str('\\U0010ffff'.encode("ascii"), 'unicode-escape')) self.assertRaises(TypeError, chr) self.assertEqual(chr(0x0000FFFF), "\U0000FFFF") self.assertEqual(chr(0x00010000), "\U00010000") diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -136,22 +136,14 @@ def test_backslashescape(self): # Does the same as the "unicode-escape" encoding, but with different # base encodings. - sin = "a\xac\u1234\u20ac\u8000" - if sys.maxunicode > 0xffff: - sin += chr(sys.maxunicode) - sout = b"a\\xac\\u1234\\u20ac\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sin = "a\xac\u1234\u20ac\u8000\U0010ffff" + sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) - sout = b"a\xac\\u1234\\u20ac\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) - sout = b"a\xac\\u1234\xa4\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) def test_decoding_callbacks(self): @@ -206,28 +198,27 @@ b"\x00\x00\x00\x00\x00".decode, "unicode-internal", ) - if sys.maxunicode > 0xffff: - def handler_unicodeinternal(exc): - if not isinstance(exc, UnicodeDecodeError): - raise TypeError("don't know how to handle %r" % exc) - return ("\x01", 1) + def handler_unicodeinternal(exc): + if not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + return ("\x01", 1) - self.assertEqual( - b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), - "\u0000" - ) + self.assertEqual( + b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), + "\u0000" + ) - self.assertEqual( - b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), - "\u0000\ufffd" - ) + self.assertEqual( + b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), + "\u0000\ufffd" + ) - codecs.register_error("test.hui", handler_unicodeinternal) + codecs.register_error("test.hui", handler_unicodeinternal) - self.assertEqual( - b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), - "\u0000\u0001\u0000" - ) + self.assertEqual( + b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), + "\u0000\u0001\u0000" + ) def test_callbacks(self): def handler1(exc): @@ -356,12 +347,11 @@ ["ascii", "\uffffx", 0, 1, "ouch"], "'ascii' codec can't encode character '\\uffff' in position 0: ouch" ) - if sys.maxunicode > 0xffff: - self.check_exceptionobjectargs( - UnicodeEncodeError, - ["ascii", "\U00010000x", 0, 1, "ouch"], - "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" - ) + self.check_exceptionobjectargs( + UnicodeEncodeError, + ["ascii", "\U00010000x", 0, 1, "ouch"], + "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" + ) def test_unicodedecodeerror(self): self.check_exceptionobjectargs( @@ -391,12 +381,11 @@ ["g\uffffrk", 1, 2, "ouch"], "can't translate character '\\uffff' in position 1: ouch" ) - if sys.maxunicode > 0xffff: - self.check_exceptionobjectargs( - UnicodeTranslateError, - ["g\U00010000rk", 1, 2, "ouch"], - "can't translate character '\\U00010000' in position 1: ouch" - ) + self.check_exceptionobjectargs( + UnicodeTranslateError, + ["g\U00010000rk", 1, 2, "ouch"], + "can't translate character '\\U00010000' in position 1: ouch" + ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\xfcrk", 1, 3, "ouch"], @@ -681,9 +670,8 @@ # enhance coverage of: # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() # and inline implementations - v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000) - if sys.maxunicode>=100000: - v += (100000, 500000, 1000000) + v = (1, 5, 10, 50, 100, 500, 1000, 5000, + 10000, 50000, 100000, 500000, 1000000) s = "".join([chr(x) for x in v]) codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) for enc in ("ascii", "iso-8859-15"): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -891,50 +891,47 @@ def test_bug1251300(self): # Decoding with unicode_internal used to not correctly handle "code # points" above 0x10ffff on UCS-4 builds. - if sys.maxunicode > 0xffff: - ok = [ - (b"\x00\x10\xff\xff", "\U0010ffff"), - (b"\x00\x00\x01\x01", "\U00000101"), - (b"", ""), - ] - not_ok = [ - b"\x7f\xff\xff\xff", - b"\x80\x00\x00\x00", - b"\x81\x00\x00\x00", - b"\x00", - b"\x00\x00\x00\x00\x00", - ] - for internal, uni in ok: - if sys.byteorder == "little": - internal = bytes(reversed(internal)) - self.assertEqual(uni, internal.decode("unicode_internal")) - for internal in not_ok: - if sys.byteorder == "little": - internal = bytes(reversed(internal)) - self.assertRaises(UnicodeDecodeError, internal.decode, - "unicode_internal") + ok = [ + (b"\x00\x10\xff\xff", "\U0010ffff"), + (b"\x00\x00\x01\x01", "\U00000101"), + (b"", ""), + ] + not_ok = [ + b"\x7f\xff\xff\xff", + b"\x80\x00\x00\x00", + b"\x81\x00\x00\x00", + b"\x00", + b"\x00\x00\x00\x00\x00", + ] + for internal, uni in ok: + if sys.byteorder == "little": + internal = bytes(reversed(internal)) + self.assertEqual(uni, internal.decode("unicode_internal")) + for internal in not_ok: + if sys.byteorder == "little": + internal = bytes(reversed(internal)) + self.assertRaises(UnicodeDecodeError, internal.decode, + "unicode_internal") def test_decode_error_attributes(self): - if sys.maxunicode > 0xffff: - try: - b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") - except UnicodeDecodeError as ex: - self.assertEqual("unicode_internal", ex.encoding) - self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) - self.assertEqual(4, ex.start) - self.assertEqual(8, ex.end) - else: - self.fail() + try: + b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") + except UnicodeDecodeError as ex: + self.assertEqual("unicode_internal", ex.encoding) + self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) + self.assertEqual(4, ex.start) + self.assertEqual(8, ex.end) + else: + self.fail() def test_decode_callback(self): - if sys.maxunicode > 0xffff: - codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) - decoder = codecs.getdecoder("unicode_internal") - ab = "ab".encode("unicode_internal").decode() - ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), - "ascii"), - "UnicodeInternalTest") - self.assertEqual(("ab", 12), ignored) + codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) + decoder = codecs.getdecoder("unicode_internal") + ab = "ab".encode("unicode_internal").decode() + ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), + "ascii"), + "UnicodeInternalTest") + self.assertEqual(("ab", 12), ignored) def test_encode_length(self): # Issue 3739 diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -247,14 +247,9 @@ self.assertFalse(any(x > 0x80 for x in e)) def test_bug1572832(self): - if sys.maxunicode >= 0x10000: - myunichr = chr - else: - myunichr = lambda x: chr(0xD7C0+(x>>10)) + chr(0xDC00+(x&0x3FF)) - for x in range(0x10000, 0x110000): # Any ISO 2022 codec will cause the segfault - myunichr(x).encode('iso_2022_jp', 'ignore') + chr(x).encode('iso_2022_jp', 'ignore') class TestStateful(unittest.TestCase): text = '\u4E16\u4E16' diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -13,10 +13,6 @@ from test import support, string_tests import _string -# decorator to skip tests on narrow builds -requires_wide_build = unittest.skipIf(sys.maxunicode == 65535, - 'requires wide build') - # Error handling (bad decoder return) def search_function(encoding): def decode1(input, errors="strict"): @@ -519,7 +515,6 @@ self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) - @requires_wide_build def test_lower(self): string_tests.CommonTest.test_lower(self) self.assertEqual('\U00010427'.lower(), '\U0001044F') @@ -530,7 +525,6 @@ self.assertEqual('X\U00010427x\U0001044F'.lower(), 'x\U0001044Fx\U0001044F') - @requires_wide_build def test_upper(self): string_tests.CommonTest.test_upper(self) self.assertEqual('\U0001044F'.upper(), '\U00010427') @@ -541,7 +535,6 @@ self.assertEqual('X\U00010427x\U0001044F'.upper(), 'X\U00010427X\U00010427') - @requires_wide_build def test_capitalize(self): string_tests.CommonTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') @@ -554,7 +547,6 @@ self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 'X\U0001044Fx\U0001044F') - @requires_wide_build def test_title(self): string_tests.MixinStrUnicodeUserStringTest.test_title(self) self.assertEqual('\U0001044F'.title(), '\U00010427') @@ -569,7 +561,6 @@ self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') - @requires_wide_build def test_swapcase(self): string_tests.CommonTest.test_swapcase(self) self.assertEqual('\U0001044F'.swapcase(), '\U00010427') @@ -1111,15 +1102,12 @@ def test_codecs_utf8(self): self.assertEqual(''.encode('utf-8'), b'') self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') - if sys.maxunicode == 65535: - self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82') - self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96') + self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') + self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') - if sys.maxunicode == 65535: - self.assertEqual( - ('\ud800\udc02'*1000).encode('utf-8'), - b'\xf0\x90\x80\x82'*1000) + self.assertEqual(('\U00010002'*10).encode('utf-8'), + b'\xf0\x90\x80\x82'*10) self.assertEqual( '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' diff --git a/Tools/pybench/pybench.py b/Tools/pybench/pybench.py --- a/Tools/pybench/pybench.py +++ b/Tools/pybench/pybench.py @@ -107,6 +107,7 @@ print('Getting machine details...') buildno, builddate = platform.python_build() python = platform.python_version() + # XXX this is now always UCS4, maybe replace it with 'PEP393' in 3.3+? if sys.maxunicode == 65535: # UCS2 build (standard) unitype = 'UCS2' diff --git a/Tools/unicode/comparecodecs.py b/Tools/unicode/comparecodecs.py --- a/Tools/unicode/comparecodecs.py +++ b/Tools/unicode/comparecodecs.py @@ -14,7 +14,7 @@ print('Comparing encoding/decoding of %r and %r' % (encoding1, encoding2)) mismatch = 0 # Check encoding - for i in range(sys.maxunicode): + for i in range(sys.maxunicode+1): u = chr(i) try: c1 = u.encode(encoding1)