diff -r 694110bc91d8 Doc/library/stdtypes.rst --- a/Doc/library/stdtypes.rst Sat Jan 07 18:34:24 2012 +0100 +++ b/Doc/library/stdtypes.rst Sat Jan 07 22:54:05 2012 -0500 @@ -1355,17 +1355,18 @@ functions based on regular expressions. 'spacious' >>> 'www.example.com'.strip('cmowz.') 'example' .. method:: str.swapcase() Return a copy of the string with uppercase characters converted to lowercase and - vice versa. + vice versa. Note that it is not necessarily true that + ``s.swapcase().swapcase() == s``. .. method:: str.title() Return a titlecased version of the string where words start with an uppercase character and the remaining characters are lowercase. The algorithm uses a simple language-independent definition of a word as diff -r 694110bc91d8 Include/unicodeobject.h --- a/Include/unicodeobject.h Sat Jan 07 18:34:24 2012 +0100 +++ b/Include/unicodeobject.h Sat Jan 07 22:54:05 2012 -0500 @@ -2003,16 +2003,34 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowerca PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(int) _PyUnicode_ToLowerFull( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res + ); + +PyAPI_FUNC(int) _PyUnicode_ToUpperFull( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res + ); + +PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( + const Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsCased( + const Py_UCS4 ch /* Unicode character */ + ); + PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_ToDigit( Py_UCS4 ch /* Unicode character */ ); diff -r 694110bc91d8 Lib/test/string_tests.py --- a/Lib/test/string_tests.py Sat Jan 07 18:34:24 2012 +0100 +++ b/Lib/test/string_tests.py Sat Jan 07 22:54:05 2012 -0500 @@ -664,17 +664,17 @@ class CommonTest(BaseTest): self.checkequal(' hello ', ' hello ', 'capitalize') self.checkequal('Hello ', 'Hello ','capitalize') 
self.checkequal('Hello ', 'hello ','capitalize') self.checkequal('Aaaa', 'aaaa', 'capitalize') self.checkequal('Aaaa', 'AaAa', 'capitalize') # check that titlecased chars are lowered correctly # \u1ffc is the titlecased char - self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3', + self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3', '\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize') # check with cased non-letter chars self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', '\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize') self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', '\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize') self.checkequal('\u2160\u2171\u2172', '\u2160\u2161\u2162', 'capitalize') diff -r 694110bc91d8 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sat Jan 07 18:34:24 2012 +0100 +++ b/Lib/test/test_unicode.py Sat Jan 07 22:54:05 2012 -0500 @@ -543,38 +543,60 @@ class UnicodeTest(string_tests.CommonTes string_tests.CommonTest.test_lower(self) self.assertEqual('\U00010427'.lower(), '\U0001044F') self.assertEqual('\U00010427\U00010427'.lower(), '\U0001044F\U0001044F') self.assertEqual('\U00010427\U0001044F'.lower(), '\U0001044F\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.lower(), 'x\U0001044Fx\U0001044F') + self.assertEqual('\ufb01'.lower(), '\ufb01') + self.assertEqual('\u0130'.lower(), '\u0069\u0307') + # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 + self.assertEqual('\u03a3'.lower(), '\u03c3') + self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3') + self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') + self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a') + self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') + self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345') + self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ') + self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') def test_upper(self): string_tests.CommonTest.test_upper(self) self.assertEqual('\U0001044F'.upper(), '\U00010427') 
self.assertEqual('\U0001044F\U0001044F'.upper(), '\U00010427\U00010427') self.assertEqual('\U00010427\U0001044F'.upper(), '\U00010427\U00010427') self.assertEqual('X\U00010427x\U0001044F'.upper(), 'X\U00010427X\U00010427') + self.assertEqual('\ufb01'.upper(), 'FI') + self.assertEqual('\u0130'.upper(), '\u0130') + self.assertEqual('\u03a3'.upper(), '\u03a3') + self.assertEqual('ß'.upper(), 'SS') + self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300') + self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') def test_capitalize(self): string_tests.CommonTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') self.assertEqual('\U0001044F\U0001044F'.capitalize(), '\U00010427\U0001044F') self.assertEqual('\U00010427\U0001044F'.capitalize(), '\U00010427\U0001044F') self.assertEqual('\U0001044F\U00010427'.capitalize(), '\U00010427\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 'X\U0001044Fx\U0001044F') + self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307') + exp = '\u0399\u0308\u0300\u0069\u0307' + self.assertEqual('\u1fd2\u0130'.capitalize(), exp) + self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish') + self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') def test_title(self): string_tests.MixinStrUnicodeUserStringTest.test_title(self) self.assertEqual('\U0001044F'.title(), '\U00010427') self.assertEqual('\U0001044F\U0001044F'.title(), '\U00010427\U0001044F') self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(), '\U00010427\U0001044F \U00010427\U0001044F') @@ -592,16 +614,29 @@ class UnicodeTest(string_tests.CommonTes self.assertEqual('\U0001044F\U0001044F'.swapcase(), '\U00010427\U00010427') self.assertEqual('\U00010427\U0001044F'.swapcase(), '\U0001044F\U00010427') self.assertEqual('\U0001044F\U00010427'.swapcase(), '\U00010427\U0001044F') self.assertEqual('X\U00010427x\U0001044F'.swapcase(), 'x\U0001044FX\U00010427') + self.assertEqual('\ufb01'.swapcase(), 'FI') + 
self.assertEqual('\u0130'.swapcase(), '\u0069\u0307') + # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 + self.assertEqual('\u03a3'.swapcase(), '\u03c3') + self.assertEqual('\u0345\u03a3'.swapcase(), '\u0345\u03c3') + self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0345\u03c2') + self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0345\u03c3A') + self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0345\u03c2') + self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0345') + self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0345 ') + self.assertEqual('\u03a3'.swapcase(), '\u03c3') + self.assertEqual('ß'.swapcase(), 'SS') + self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') def test_contains(self): # Testing Unicode contains method self.assertIn('a', 'abdb') self.assertIn('a', 'bdab') self.assertIn('a', 'bdaba') self.assertIn('a', 'bdba') self.assertNotIn('a', 'bdb') diff -r 694110bc91d8 Lib/test/test_unicodedata.py --- a/Lib/test/test_unicodedata.py Sat Jan 07 18:34:24 2012 +0100 +++ b/Lib/test/test_unicodedata.py Sat Jan 07 22:54:05 2012 -0500 @@ -16,17 +16,17 @@ encoding = 'utf-8' errors = 'surrogatepass' ### Run tests class UnicodeMethodsTest(unittest.TestCase): # update this, if the database changes - expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0' + expectedchecksum = '33f9b16f5e82c9e46a5d3f2da5d3ea611f5e8d80' def test_method_checksum(self): h = hashlib.sha1() for i in range(0x10000): char = chr(i) data = [ # Predicates (single char) "01"[char.isalnum()], diff -r 694110bc91d8 Objects/unicodectype.c --- a/Objects/unicodectype.c Sat Jan 07 18:34:24 2012 +0100 +++ b/Objects/unicodectype.c Sat Jan 07 22:54:05 2012 -0500 @@ -16,18 +16,21 @@ #define LOWER_MASK 0x08 #define LINEBREAK_MASK 0x10 #define SPACE_MASK 0x20 #define TITLE_MASK 0x40 #define UPPER_MASK 0x80 #define XID_START_MASK 0x100 #define XID_CONTINUE_MASK 0x200 #define PRINTABLE_MASK 0x400 -#define NODELTA_MASK 0x800 -#define NUMERIC_MASK 0x1000 +#define NUMERIC_MASK 0x800 
+#define CASE_IGNORABLE_MASK 0x1000 +#define CASED_MASK 0x2000 +#define EXTRA_UPPER_MASK 0x4000 +#define EXTRA_LOWER_MASK 0x8000 typedef struct { const Py_UCS4 upper; const Py_UCS4 lower; const Py_UCS4 title; const unsigned char decimal; const unsigned char digit; const unsigned short flags; @@ -52,25 +55,17 @@ gettyperecord(Py_UCS4 code) } /* Returns the titlecase Unicode characters corresponding to ch or just ch if no titlecase mapping is known. */ Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->title; - - if (ctype->flags & NODELTA_MASK) - return delta; - - if (delta >= 32768) - delta -= 65536; - - return ch + delta; + return ctype->title ? ctype->title : ch; /* 0 == no mapping stored; fall back to ch as ToUppercase/ToLowercase do */ } /* Returns 1 for Unicode characters having the category 'Lt', 0 otherwise. */ int _PyUnicode_IsTitlecase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -183,36 +178,105 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch) } /* Returns the uppercase Unicode characters corresponding to ch or just ch if no uppercase mapping is known. */ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->upper; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + Py_UCS4 res = ctype->upper; + + if (res) { + if (ctype->flags & (EXTRA_UPPER_MASK | EXTRA_LOWER_MASK)) + res &= 0xFFFF; + } + else { + res = ch; + } + return res; } /* Returns the lowercase Unicode characters corresponding to ch or just ch if no lowercase mapping is known. 
*/ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->lower; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + Py_UCS4 res = ctype->lower; + + if (res) { + if (ctype->flags & (EXTRA_UPPER_MASK | EXTRA_LOWER_MASK)) + res &= 0xFFFF; + } + else { + res = ch; + } + return res; +} + +int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + res[0] = ctype->upper; + if (res[0]) { + if (ctype->flags & (EXTRA_UPPER_MASK | EXTRA_LOWER_MASK)) + res[0] &= 0xFFFF; + if (ctype->flags & EXTRA_UPPER_MASK) { + res[1] = ctype->upper >> 16; + if (ctype->lower & 0xFFFF0000) { + res[2] = ctype->lower >> 16; + return 3; + } + return 2; + } + } + else { + res[0] = ch; + } + return 1; +} + +int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + res[0] = ctype->lower; + if (res[0]) { + if (ctype->flags & (EXTRA_UPPER_MASK | EXTRA_LOWER_MASK)) + res[0] &= 0xFFFF; + if (ctype->flags & EXTRA_LOWER_MASK) { + res[0] &= 0xFFFF; + res[1] = ctype->lower >> 16; + if (ctype->upper & 0xFFFF0000) { + res[2] = ctype->upper >> 16; + return 3; + } + return 2; + } + } + else { + res[0] = ch; + } + return 1; +} + +int _PyUnicode_IsCased(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASED_MASK) != 0; +} + +int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASE_IGNORABLE_MASK) != 0; } /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', 'Lo' or 'Lm', 0 otherwise. 
*/ int _PyUnicode_IsAlpha(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); diff -r 694110bc91d8 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sat Jan 07 18:34:24 2012 +0100 +++ b/Objects/unicodeobject.c Sat Jan 07 22:54:05 2012 -0500 @@ -9423,152 +9423,137 @@ fixup(PyObject *self, else { copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); } Py_DECREF(u); assert(_PyUnicode_CheckConsistency(v, 1)); return v; } +static PyObject * +ascii_upper_or_lower(PyObject *self, int lower) +{ + Py_ssize_t len = PyUnicode_GET_LENGTH(self), i; + char *resdata; + PyObject *res; + char lo, hi, diff; + + if (lower) { + lo = 'A'; + diff = 'A' - 'a'; + } + else { + lo = 'a'; + diff = 'a' - 'A'; + } + hi = lo + 25; + + res = _PyUnicode_Copy(self); + if (res == NULL) + return NULL; + resdata = PyUnicode_DATA(res); + + for (i = 0; i < len; i++) { + char c = resdata[i]; + if (lo <= c && c <= hi) + resdata[i] = c - diff; + } + return res; +} + static Py_UCS4 -fixupper(PyObject *self) -{ - /* No need to call PyUnicode_READY(self) because this function is only - called as a callback from fixup() which does it already. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; - - for (i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); - if (up != ch) { - if (up > maxchar) - maxchar = up; - PyUnicode_WRITE(kind, data, i, up); - touched = 1; - } - else if (ch > maxchar) - maxchar = ch; - } - - if (touched) - return maxchar; - else - return 0; -} - -static Py_UCS4 -fixlower(PyObject *self) -{ - /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; - - for(i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); - if (lo != ch) { - if (lo > maxchar) - maxchar = lo; - PyUnicode_WRITE(kind, data, i, lo); - touched = 1; - } - else if (ch > maxchar) - maxchar = ch; - } - - if (touched) - return maxchar; - else - return 0; -} - -static Py_UCS4 -fixswapcase(PyObject *self) -{ - /* No need to call PyUnicode_READY(self) because fixup() which does it. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; - - for(i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 nu = 0; - - if (Py_UNICODE_ISUPPER(ch)) - nu = Py_UNICODE_TOLOWER(ch); - else if (Py_UNICODE_ISLOWER(ch)) - nu = Py_UNICODE_TOUPPER(ch); - - if (nu != 0) { - if (nu > maxchar) - maxchar = nu; - PyUnicode_WRITE(kind, data, i, nu); - touched = 1; - } - else if (ch > maxchar) - maxchar = ch; - } - - if (touched) - return maxchar; - else - return 0; -} - -static Py_UCS4 -fixcapitalize(PyObject *self) -{ - /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i = 0; - Py_UCS4 ch; - - if (len == 0) - return 0; - - ch = PyUnicode_READ(kind, data, i); - if (!Py_UNICODE_ISUPPER(ch)) { - maxchar = Py_UNICODE_TOUPPER(ch); - PyUnicode_WRITE(kind, data, i, maxchar); - touched = 1; - } - ++i; - for(; i < len; ++i) { - ch = PyUnicode_READ(kind, data, i); - if (!Py_UNICODE_ISLOWER(ch)) { - const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); - if (lo > maxchar) - maxchar = lo; - PyUnicode_WRITE(kind, data, i, lo); - touched = 1; - } - else if (ch > maxchar) - maxchar = ch; - } - - if (touched) - return maxchar; - else - return 0; +handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) +{ + Py_ssize_t j; + int final_sigma; + Py_UCS4 c; + /* U+03A3 is in the Final_Sigma context when, it is found like this: + + \p{cased}\p{case-ignorable}*\u03A3!(\p{case-ignorable}*\p{cased}) + + where ! is a negation and \p{xxx} is a character with property xxx. + */ + for (j = i - 1; j >= 0; j--) { + c = PyUnicode_READ(kind, data, j); + if (!_PyUnicode_IsCaseIgnorable(c)) + break; + } + final_sigma = j >= 0 && _PyUnicode_IsCased(c); + if (final_sigma) { + for (j = i + 1; j < length; j++) { + c = PyUnicode_READ(kind, data, j); + if (!_PyUnicode_IsCaseIgnorable(c)) + break; + } + final_sigma = j == length || !_PyUnicode_IsCased(c); + } + return (final_sigma) ? 0x3C2 : 0x3C3; +} + +static int +lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, + Py_UCS4 c, Py_UCS4 *mapped) +{ + /* Obscure special case. 
*/ + if (c == 0x3A3) { + mapped[0] = handle_capital_sigma(kind, data, length, i); + return 1; + } + return _PyUnicode_ToLowerFull(c, mapped); +} + +static int +upper_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, + Py_UCS4 c, Py_UCS4 *mapped) +{ + return _PyUnicode_ToUpperFull(c, mapped); +} + +static PyObject * +unicode_upper_or_lower(PyObject *self, int lower) +{ + PyObject *res; + Py_ssize_t i, k, length, newlength = 0; + int kind, outkind; + int j, n_res; + void *data, *outdata; + Py_UCS4 mapped[3], c, maxchar = 0; + int (*casemap)(int, void *, Py_ssize_t, Py_ssize_t, Py_UCS4, Py_UCS4 *); + + if (PyUnicode_READY(self) == -1) + return NULL; + + if (PyUnicode_IS_ASCII(self)) + return ascii_upper_or_lower(self, lower); + + casemap = (lower) ? lower_ucs4 : upper_ucs4; + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + length = PyUnicode_GET_LENGTH(self); + for (i = 0; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + n_res = casemap(kind, data, length, i, c, mapped); + for (j = 0; j < n_res; j++) + if (mapped[j] > maxchar) + maxchar = mapped[j]; + newlength += n_res; + } + res = PyUnicode_New(newlength, maxchar); + if (!res) + return NULL; + outkind = PyUnicode_KIND(res); + outdata = PyUnicode_DATA(res); + k = 0; + for (i = 0; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + n_res = casemap(kind, data, length, i, c, mapped); + for (j = 0; j < n_res; j++) + PyUnicode_WRITE(outkind, outdata, k++, mapped[j]); + } + return res; } static Py_UCS4 fixtitle(PyObject *self) { /* No need to call PyUnicode_READY(self) because fixup() which does it. */ const Py_ssize_t len = PyUnicode_GET_LENGTH(self); const int kind = PyUnicode_KIND(self); @@ -10452,17 +10437,58 @@ PyDoc_STRVAR(capitalize__doc__, "S.capitalize() -> str\n\ \n\ Return a capitalized version of S, i.e. 
make the first character\n\ have upper case and the rest lower case."); static PyObject* unicode_capitalize(PyObject *self) { - return fixup(self, fixcapitalize); + Py_ssize_t length, newlength, i, k = 0; + int kind, newkind, n_res, j; + void *data, *newdata; + Py_UCS4 c, maxchar = 0, mapped[3]; + PyObject *res; + + if (PyUnicode_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + length = PyUnicode_GET_LENGTH(self); + if (length == 0) + return unicode_result_unchanged(self); + c = PyUnicode_READ(kind, data, 0); + n_res = newlength = _PyUnicode_ToUpperFull(c, mapped); + for (j = 0; j < n_res; j++) + if (mapped[j] > maxchar) + maxchar = mapped[j]; + for (i = 1; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + n_res = lower_ucs4(kind, data, length, i, c, mapped); + for (j = 0; j < n_res; j++) + if (mapped[j] > maxchar) + maxchar = mapped[j]; + newlength += n_res; + } + res = PyUnicode_New(newlength, maxchar); + if (res == NULL) + return NULL; + newkind = PyUnicode_KIND(res); + newdata = PyUnicode_DATA(res); + n_res = _PyUnicode_ToUpperFull(PyUnicode_READ(kind, data, 0), mapped); + for (j = 0; j < n_res; j++, k++) + PyUnicode_WRITE(newkind, newdata, k, mapped[j]); + for (i = 1; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + n_res = lower_ucs4(kind, data, length, i, c, mapped); + for (j = 0; j < n_res; j++, k++) + PyUnicode_WRITE(newkind, newdata, k, mapped[j]); + } + return res; } #if 0 PyDoc_STRVAR(capwords__doc__, "S.capwords() -> str\n\ \n\ Apply .capitalize() to all words in S and return the result with\n\ normalized whitespace (all whitespace strings are replaced by ' ')."); @@ -11710,17 +11736,17 @@ unicode_ljust(PyObject *self, PyObject * PyDoc_STRVAR(lower__doc__, "S.lower() -> str\n\ \n\ Return a copy of the string S converted to lowercase."); static PyObject* unicode_lower(PyObject *self) { - return fixup(self, fixlower); + return unicode_upper_or_lower(self, 1); } #define LEFTSTRIP 0 
#define RIGHTSTRIP 1 #define BOTHSTRIP 2 /* Arrays indexed by above */ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; @@ -12599,17 +12625,66 @@ PyDoc_STRVAR(swapcase__doc__, "S.swapcase() -> str\n\ \n\ Return a copy of S with uppercase characters converted to lowercase\n\ and vice versa."); static PyObject* unicode_swapcase(PyObject *self) { - return fixup(self, fixswapcase); + Py_ssize_t length, newlength = 0, i, k = 0; + int kind, newkind, n_res, j; + void *data, *newdata; + Py_UCS4 c, maxchar = 0, mapped[3]; + PyObject *res; + + if (PyUnicode_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + length = PyUnicode_GET_LENGTH(self); + for (i = 0; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + if (Py_UNICODE_ISUPPER(c)) { + n_res = lower_ucs4(kind, data, length, i, c, mapped); + } + else if (Py_UNICODE_ISLOWER(c)) { + n_res = _PyUnicode_ToUpperFull(c, mapped); + } + else { + n_res = 1; + mapped[0] = c; + } + newlength += n_res; + for (j = 0; j < n_res; j++) + if (mapped[j] > maxchar) + maxchar = mapped[j]; + } + res = PyUnicode_New(newlength, maxchar); + if (res == NULL) + return NULL; + newkind = PyUnicode_KIND(res); + newdata = PyUnicode_DATA(res); + for (i = 0; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + if (Py_UNICODE_ISUPPER(c)) { + n_res = lower_ucs4(kind, data, length, i, c, mapped); + } + else if (Py_UNICODE_ISLOWER(c)) { + n_res = _PyUnicode_ToUpperFull(c, mapped); + } + else { + n_res = 1; + mapped[0] = c; + } + for (j = 0; j < n_res; j++, k++) + PyUnicode_WRITE(newkind, newdata, k, mapped[j]); + } + return res; } PyDoc_STRVAR(maketrans__doc__, "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ \n\ Return a translation table usable for str.translate().\n\ If there is only one argument, it must be a dictionary mapping Unicode\n\ ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ @@ -12745,17 +12820,17 @@ unicode_translate(PyObject 
*self, PyObje PyDoc_STRVAR(upper__doc__, "S.upper() -> str\n\ \n\ Return a copy of S converted to uppercase."); static PyObject* unicode_upper(PyObject *self) { - return fixup(self, fixupper); + return unicode_upper_or_lower(self, 0); } PyDoc_STRVAR(zfill__doc__, "S.zfill(width) -> str\n\ \n\ Pad a numeric string S with zeros on the left, to fill a field\n\ of the specified width. The string S is never truncated."); diff -r 694110bc91d8 Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py Sat Jan 07 18:34:24 2012 +0100 +++ b/Tools/unicode/makeunicodedata.py Sat Jan 07 22:54:05 2012 -0500 @@ -17,16 +17,17 @@ # 2002-10-18 mvl update to Unicode 3.2 # 2002-10-22 mvl generate NFC tables # 2002-11-24 mvl expand all ranges, sort names version-independently # 2002-11-25 mvl add UNIDATA_VERSION # 2004-05-29 perky add east asian width information # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch # 2011-10-21 ezio add support for name aliases and named sequences +# 2012-01 benjamin add full case mappings # # written by Fredrik Lundh (fredrik@pythonware.com) # import os import sys import zipfile @@ -42,16 +43,17 @@ UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt" UNIHAN = "Unihan%s.zip" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" LINE_BREAK = "LineBreak%s.txt" NAME_ALIASES = "NameAliases%s.txt" NAMED_SEQUENCES = "NamedSequences%s.txt" +SPECIAL_CASING = "SpecialCasing%s.txt" # Private Use Areas -- in planes 1, 15, 16 PUA_1 = range(0xE000, 0xF900) PUA_15 = range(0xF0000, 0xFFFFE) PUA_16 = range(0x100000, 0x10FFFE) # we use this ranges of PUA_15 to store name aliases and named sequences NAME_ALIASES_START = 0xF0000 @@ -79,18 +81,21 @@ DIGIT_MASK = 0x04 LOWER_MASK = 0x08 LINEBREAK_MASK = 0x10 SPACE_MASK = 0x20 TITLE_MASK = 0x40 
UPPER_MASK = 0x80 XID_START_MASK = 0x100 XID_CONTINUE_MASK = 0x200 PRINTABLE_MASK = 0x400 -NODELTA_MASK = 0x800 -NUMERIC_MASK = 0x1000 +NUMERIC_MASK = 0x800 +CASE_IGNORABLE_MASK = 0x1000 +CASED_MASK = 0x2000 +EXTRA_UPPER_MASK = 0x4000 +EXTRA_LOWER_MASK = 0x8000 # these ranges need to match unicodedata.c:is_unified_ideograph cjk_ranges = [ ('3400', '4DB5'), ('4E00', '9FCB'), ('20000', '2A6D6'), ('2A700', '2B734'), ('2B740', '2B81D') @@ -409,45 +414,78 @@ def makeunicodetype(unicode, trace): if category == "Lu": flags |= UPPER_MASK if char == ord(" ") or category[0] not in ("C", "Z"): flags |= PRINTABLE_MASK if "XID_Start" in properties: flags |= XID_START_MASK if "XID_Continue" in properties: flags |= XID_CONTINUE_MASK - # use delta predictor for upper/lower/title if it fits - if record[12]: - upper = int(record[12], 16) + if "Cased" in properties: + flags |= CASED_MASK + if "Case_Ignorable" in properties: + flags |= CASE_IGNORABLE_MASK + sc = unicode.special_casing.get(char) + if sc is None: + if record[12]: + upper = int(record[12], 16) + else: + upper = char + if record[13]: + lower = int(record[13], 16) + else: + lower = char + if record[14]: + title = int(record[14], 16) + else: + title = upper else: - upper = char - if record[13]: - lower = int(record[13], 16) - else: - lower = char - if record[14]: - title = int(record[14], 16) - else: - # UCD.html says that a missing title char means that - # it defaults to the uppercase character, not to the - # character itself. Apparently, in the current UCD (5.x) - # this feature is never used - title = upper - upper_d = upper - char - lower_d = lower - char - title_d = title - char - if -32768 <= upper_d <= 32767 and \ - -32768 <= lower_d <= 32767 and \ - -32768 <= title_d <= 32767: - # use deltas - upper = upper_d & 0xffff - lower = lower_d & 0xffff - title = title_d & 0xffff - else: - flags |= NODELTA_MASK + # This happens when some character maps to more than one + # character in uppercase or lowercase. 
I employ an evil hack to + # store the mappings: a mapping to 2 characters is stored in the + # same Py_UCS4 instance as a one character mapping. A 3 + # character mapping uses that and the Py_UCS4 instance for the + # opposite case mapping of the character. For this hack to work, + # a lot of conditions, have to hold. + lower_len = len(sc[0]) + upper_len = len(sc[1]) + no_hacks = ValueError("upper/lower hacks just failed") + if lower_len > 1 and upper_len > 1: + raise no_hacks + if lower_len > 1: + split = sc[0] + other = sc[1][0] + flags |= EXTRA_LOWER_MASK + else: + assert upper_len > 1 + split = sc[1] + other = sc[0][0] + flags |= EXTRA_UPPER_MASK + if record[14]: + title = int(record[14], 16) + elif record[12]: + title = int(record[12], 16) + else: + title = char + if other > 0xFFFF: + raise no_hacks + for c in split: + if c > 0xFFFF: + raise no_hacks + final = split[0] + if len(split) >= 2: + final |= split[1] << 16 + if len(split) == 3: + other |= split[2] << 16 + if lower_len > 1: + lower = final + upper = other + else: + upper = final + lower = other # decimal digit, integer digit decimal = 0 if record[6]: flags |= DECIMAL_MASK decimal = int(record[6]) digit = 0 if record[7]: flags |= DIGIT_MASK @@ -1065,16 +1103,31 @@ class UnicodeData: if tag not in ('kAccountingNumeric', 'kPrimaryNumeric', 'kOtherNumeric'): continue value = value.strip().replace(',', '') i = int(code[2:], 16) # Patch the numeric field if table[i] is not None: table[i][8] = value + sc = self.special_casing = {} + with open_data(SPECIAL_CASING, version) as file: + for s in file: + s = s[:-1].split('#', 1)[0] + if not s: + continue + data = s.split("; ") + if data[4]: + # We ignore all conditionals (since they depend on + # languages) except for one, which is hardcoded. 
+ continue + c = int(data[0], 16) + lower = [int(char, 16) for char in data[1].split()] + upper = [int(char, 16) for char in data[3].split()] + sc[c] = (lower, upper) def uselatin1(self): # restrict character range to ISO Latin 1 self.chars = list(range(256)) # hash table tools # this is a straight-forward reimplementation of Python's built-in