Make unicode.title() and unicode.istitle() skip "case-ignorable" characters
(combining marks, apostrophes, ...) instead of treating them as word breaks.
The CASE_IGNORABLE_MASK flag is produced by Tools/unicode/makeunicodedata.py;
the regenerated type-record table is not part of this patch, so the script
has to be re-run after applying it.

Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h	(Revision 75130)
+++ Include/unicodeobject.h	(Arbeitskopie)
@@ -390,6 +390,14 @@
      Py_UNICODE_ISDIGIT(ch) || \
      Py_UNICODE_ISNUMERIC(ch))
 
+/* Py_UNICODE_ISCASEIGNORABLE(ch) is nonzero for characters that the
+   title-casing code skips without changing whether the preceding character
+   counts as cased.  The prototype is declared here so that users of the
+   macro never call the function through an implicit declaration. */
+PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(Py_UNICODE ch);
+
+#define Py_UNICODE_ISCASEIGNORABLE(ch) _PyUnicode_IsCaseIgnorable(ch)
+
 #define Py_UNICODE_COPY(target, source, length) \
     Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(Revision 75130)
+++ Objects/unicodeobject.c	(Arbeitskopie)
@@ -5417,8 +5417,11 @@
             Py_UNICODE_ISUPPER(ch) ||
             Py_UNICODE_ISTITLE(ch))
             previous_is_cased = 1;
-        else
-            previous_is_cased = 0;
+        else if (!Py_UNICODE_ISCASEIGNORABLE(ch)) {
+            /* Only a character that is neither cased nor case-ignorable
+               (e.g. a space) ends the current word. */
+            previous_is_cased = 0;
+        }
     }
     return 1;
 }
@@ -6855,8 +6858,11 @@
             previous_is_cased = 1;
             cased = 1;
         }
-        else
-            previous_is_cased = 0;
+        else if (!Py_UNICODE_ISCASEIGNORABLE(ch)) {
+            /* Only a character that is neither cased nor case-ignorable
+               (e.g. a space) ends the current word. */
+            previous_is_cased = 0;
+        }
     }
     return PyBool_FromLong(cased);
 }
Index: Objects/unicodectype.c
===================================================================
--- Objects/unicodectype.c	(Revision 75130)
+++ Objects/unicodectype.c	(Arbeitskopie)
@@ -20,6 +20,7 @@
 #define TITLE_MASK 0x40
 #define UPPER_MASK 0x80
 #define NODELTA_MASK 0x100
+#define CASE_IGNORABLE_MASK 0x200
 
 typedef struct {
     const Py_UNICODE upper;
@@ -652,6 +653,19 @@
     return _PyUnicode_ToNumeric(ch) != -1.0;
 }
 
+/* Returns 1 for case-ignorable Unicode characters, 0 otherwise.
+   A character is case-ignorable if Tools/unicode/makeunicodedata.py set
+   CASE_IGNORABLE_MASK in its type record: general category 'Mn', 'Me',
+   'Cf', 'Lm' or 'Sk', or Word_Break property MidLetter or MidNumLet
+   (for example the apostrophe). */
+
+int _PyUnicode_IsCaseIgnorable(Py_UNICODE ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
+}
+
 #ifndef WANT_WCTYPE_FUNCTIONS
 
 /* Returns 1 for Unicode characters having the bidirectional type
Index: Tools/unicode/makeunicodedata.py
===================================================================
--- Tools/unicode/makeunicodedata.py	(Revision 75130)
+++ Tools/unicode/makeunicodedata.py	(Arbeitskopie)
@@ -58,6 +58,7 @@
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 NODELTA_MASK = 0x100
+CASE_IGNORABLE_MASK = 0x200
 
 def maketables(trace=0):
 
@@ -369,6 +370,29 @@
                     flags |= TITLE_MASK
                 if category == "Lu":
                     flags |= UPPER_MASK
+                # As of Unicode 5.2.0: "C is defined to be case-ignorable if
+                # Word_Break(C) = MidLetter or MidNumLet, or
+                # General_Category(C) = Nonspacing_Mark (Mn),
+                # Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
+                # Modifier_Symbol (Sk)."
+                # The MidLetter and MidNumLet code points are hard-coded
+                # (copied from WordBreakProperty.txt) and compared as
+                # integers: unichr(char) raises ValueError for
+                # char > 0xFFFF on narrow (UCS-2) builds.
+                # TODO: these code points cannot currently be extracted
+                # automatically, as neither DerivedCoreProperties.txt nor
+                # WordBreakProperty.txt is provided here.  In the future
+                # CASE_IGNORABLE_MASK should be set from the property
+                # "Case_Ignorable (CI)" in
+                # http://www.unicode.org/Public/5.2.0/ucd/DerivedCoreProperties.txt
+                if (category in ("Mn", "Me", "Cf", "Lm", "Sk")
+                    # Word_Break(C) = MidLetter
+                    or char in (0x003A, 0x00B7, 0x0387, 0x05F4,
+                                0x2027, 0xFE13, 0xFE55, 0xFF1A)
+                    # Word_Break(C) = MidNumLet
+                    or char in (0x0027, 0x002E, 0x2018, 0x2019,
+                                0x2024, 0xFE52, 0xFF07, 0xFF0E)):
+                    flags |= CASE_IGNORABLE_MASK
                 # use delta predictor for upper/lower/title if it fits
                 if record[12]:
                     upper = int(record[12], 16)
Index: Lib/test/test_unicode.py
===================================================================
--- Lib/test/test_unicode.py	(Revision 75130)
+++ Lib/test/test_unicode.py	(Arbeitskopie)
@@ -273,6 +273,11 @@
         string_tests.MixinStrUnicodeUserStringTest.test_title(self)
         self.checkequalnofix(True, u'\u1FFc', 'istitle')
         self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
+        # Combining marks and apostrophes are case-ignorable.
+        self.checkequalnofix(True, u'H\u0301ngh', 'istitle')
+        self.checkequalnofix(False, u"This Isn'T Right", 'istitle')
+        self.checkequalnofix(True, u"I Pity The 'Foo'.", 'istitle')
+        self.checkequalnofix(u"They're Bill's", u"they're bill's", 'title')
 
     def test_isspace(self):
         string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)