diff -r cfc042ca551a Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Fri Oct 21 15:52:10 2011 +0200 +++ b/Lib/test/test_unicode.py Tue Oct 25 14:22:27 2011 -0700 @@ -1792,6 +1792,66 @@ s += "4" self.assertEqual(s, "3") + def test_comparisons(self) : + def check_answer(): + answer = [\ + [True, True, True, False, False, False], + [True, True, True, False, False, False], + [True, True, True, False, False, False], + [False, False, True, False, True, True], + [False, False, True, False, True, True], + [False, False, True, False, True, True], + [False, True, False, True, False, True], + [False, True, False, True, False, True], + [False, True, False, True, False, True], + [True, True, True, False, False, False], + [True, True, True, False, False, False], + [True, True, True, False, False, False], + [False, True, False, True, False, True], + [False, True, False, True, False, True], + [False, True, False, True, False, True], + [False, False, True, False, True, True], + [False, False, True, False, True, True], + [False, False, True, False, True, True], + [True, True, True, False, False, False], + [False, False, True, False, True, True], + [False, True, False, True, False, True], + [True, True, True, False, False, False]] + for x in answer : + yield x + + strs1 = [ "abc", "1234", "\x3a\x26" ] + strs2 = [ "abc", "1234" ] + encodings = [ "utf-8", "utf-16", "utf-32" ] + checker = check_answer() + + # Test encoding in and out of a few encodings + for x in strs1 : + for y in strs2 : + for en in encodings : + xen = x.encode(en) + xout = xen.decode(en) + yen = y.encode(en) + yout = yen.decode(en) + fro = [xout==yout, xout<=yout, xout>=yout, + xout<yout, xout>yout, xout!=yout] + to = next(checker) + self.assertEqual(to, fro) + + # Make sure we mix it up so that we get some UCS2 and UCS1 + # encodings here as well as some UCS2/UCS2 so we can test the + # optimize path + st1 = [ b"\xff\xfe&::&", b'\xff\xfea\x00b\x00c\x00' ] + st2 = [ b"\xff\xfe&::&", b'\xff\xfea\x00b\x00c\x00' ] + for x 
in st1 : + for y in st2 : + xout = x.decode("utf-16") + yout = y.decode("utf-16") + fro = [xout==yout, xout<=yout, xout>=yout, + xout<yout, xout>yout, xout!=yout] + to = next(checker) + self.assertEqual(to, fro) + class StringModuleTest(unittest.TestCase): def test_formatter_parser(self): diff -r cfc042ca551a Objects/unicodeobject.c --- a/Objects/unicodeobject.c Fri Oct 21 15:52:10 2011 +0200 +++ b/Objects/unicodeobject.c Tue Oct 25 14:22:27 2011 -0700 @@ -10499,14 +10499,16 @@ return pad(self, left, marg - left, fillchar); } + + /* This function assumes that str1 and str2 are readied by the caller. */ static int -unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) +unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2, int equality_test) { int kind1, kind2; void *data1, *data2; - Py_ssize_t len1, len2, i; + Py_ssize_t min_len, len1, len2, i; kind1 = PyUnicode_KIND(str1); kind2 = PyUnicode_KIND(str2); @@ -10514,19 +10516,42 @@ data2 = PyUnicode_DATA(str2); len1 = PyUnicode_GET_LENGTH(str1); len2 = PyUnicode_GET_LENGTH(str2); - - for (i = 0; i < len1 && i < len2; ++i) { - Py_UCS4 c1, c2; - c1 = PyUnicode_READ(kind1, data1, i); - c2 = PyUnicode_READ(kind2, data2, i); - - if (c1 != c2) - return (c1 < c2) ? -1 : 1; - } - + min_len = (len1 < len2) ? len1 : len2; + + /* If just checking for equality, there is no reason to + ** check the memory if the lengths are NOT the same */ + if (equality_test && len1 != len2) { + return 1; + } + + /* optimized code path */ + if (kind1 == kind2 && + (kind1 == PyUnicode_1BYTE_KIND || equality_test)) { + int result; + Py_ssize_t kind_size = (Py_ssize_t) kind1; + result = memcmp(data1, data2, kind_size*min_len); + if (result != 0) + return (result < 0) ? -1 : 1; + } + + /* Normal code path */ + else { + for (i = 0; i < min_len; ++i) { + Py_UCS4 c1, c2; + c1 = PyUnicode_READ(kind1, data1, i); + c2 = PyUnicode_READ(kind2, data2, i); + + if (c1 != c2) + return (c1 < c2) ? 
-1 : 1; + } + } + + /* Assertion: if we reach here, strings are same up to min_len + ** so the length of the strings becomes the decider */ return (len1 < len2) ? -1 : (len1 != len2); } + int PyUnicode_Compare(PyObject *left, PyObject *right) { @@ -10535,7 +10560,9 @@ PyUnicode_READY(right) == -1) return -1; return unicode_compare((PyUnicodeObject *)left, - (PyUnicodeObject *)right); + (PyUnicodeObject *)right, + 0); /* full ordered compare */ + } PyErr_Format(PyExc_TypeError, "Can't compare %.100s and %.100s", @@ -10599,7 +10626,8 @@ result = 0; else result = unicode_compare((PyUnicodeObject *)left, - (PyUnicodeObject *)right); + (PyUnicodeObject *)right, + op == Py_EQ || op == Py_NE); /* Convert the return value to a Boolean */ switch (op) {