Index: Include/unicodeobject.h =================================================================== --- Include/unicodeobject.h (Revision 74793) +++ Include/unicodeobject.h (Arbeitskopie) @@ -390,6 +390,8 @@ Py_UNICODE_ISDIGIT(ch) || \ Py_UNICODE_ISNUMERIC(ch)) +#define Py_UNICODE_ISCASEIGNORABLE(ch) _PyUnicode_IsCaseIgnorable(ch) + #define Py_UNICODE_COPY(target, source, length) \ Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) Index: Objects/unicodeobject.c =================================================================== --- Objects/unicodeobject.c (Revision 74793) +++ Objects/unicodeobject.c (Arbeitskopie) @@ -5417,8 +5417,9 @@ Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) previous_is_cased = 1; - else - previous_is_cased = 0; + else if (!Py_UNICODE_ISCASEIGNORABLE(ch)) { + previous_is_cased = 0; + } } return 1; } @@ -6855,8 +6856,9 @@ previous_is_cased = 1; cased = 1; } - else - previous_is_cased = 0; + else if (!Py_UNICODE_ISCASEIGNORABLE(ch)) { + previous_is_cased = 0; + } } return PyBool_FromLong(cased); } Index: Objects/unicodectype.c =================================================================== --- Objects/unicodectype.c (Revision 74793) +++ Objects/unicodectype.c (Arbeitskopie) @@ -20,6 +20,7 @@ #define TITLE_MASK 0x40 #define UPPER_MASK 0x80 #define NODELTA_MASK 0x100 +#define CASE_IGNORABLE_MASK 0x200 typedef struct { const Py_UNICODE upper; @@ -652,6 +653,16 @@ return _PyUnicode_ToNumeric(ch) != -1.0; } +/* Returns 1 for case-ignorable characters: category 'Mn', 'Me', 'Cf', 'Lm' or + 'Sk', or Word_Break MidLetter/MidNumLet (CASE_IGNORABLE_MASK); else 0. 
*/ + +int _PyUnicode_IsCaseIgnorable(Py_UNICODE ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASE_IGNORABLE_MASK) != 0; +} + #ifndef WANT_WCTYPE_FUNCTIONS /* Returns 1 for Unicode characters having the bidirectional type Index: Tools/unicode/makeunicodedata.py =================================================================== --- Tools/unicode/makeunicodedata.py (Revision 74793) +++ Tools/unicode/makeunicodedata.py (Arbeitskopie) @@ -58,6 +58,7 @@ TITLE_MASK = 0x40 UPPER_MASK = 0x80 NODELTA_MASK = 0x100 +CASE_IGNORABLE_MASK = 0x200 def maketables(trace=0): @@ -369,6 +370,18 @@ flags |= TITLE_MASK if category == "Lu": flags |= UPPER_MASK + if category in ["Mn", "Me", "Cf", "Lm", "Sk"] \ + or unichr(char) in [u':', u'\xb7', u'\u0387', u'\u05f4', + u'\u2027', u'\ufe13', u'\ufe55', u'\uff1a'] \ + or unichr(char) in [u"'", u'.', u'\u2018', u'\u2019', u'\u2024', + u'\ufe52', u'\uff07', u'\uff0e']: + # C is defined to be case-ignorable if + # Word_Break(C) = MidLetter or MidNumLet, or + # General_Category(C) = Nonspacing_Mark (Mn), + # Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or + # Modifier_Symbol (Sk). + # TODO generate from WordBreakProperty.txt + flags |= CASE_IGNORABLE_MASK # use delta predictor for upper/lower/title if it fits if record[12]: upper = int(record[12], 16)