Index: Lib/test/test_unicodedata.py
===================================================================
--- Lib/test/test_unicodedata.py (revision 79081)
+++ Lib/test/test_unicodedata.py (working copy)
@@ -24,7 +24,7 @@
 
     def test_method_checksum(self):
         h = hashlib.sha1()
-        for i in range(65536):
+        for i in range(0x10000):
             char = unichr(i)
             data = [
                 # Predicates (single char)
@@ -282,6 +282,17 @@
         self.assertEqual(u"\u01c5".title(), u"\u01c5")
         self.assertEqual(u"\u01c6".title(), u"\u01c5")
 
+    def test_linebreak_7643(self):
+        for i in range(0x10000):
+            lines = (unichr(i) + u'A').splitlines()
+            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
+                self.assertEqual(len(lines), 2,
+                                 r"\u%.4x should be a linebreak" % i)
+            else:
+                self.assertEqual(len(lines), 1,
+                                 r"\u%.4x should not be a linebreak" % i)
+
 def test_main():
     test.test_support.run_unittest(
         UnicodeMiscTest,
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 79081)
+++ Objects/unicodeobject.c (working copy)
@@ -115,9 +115,9 @@
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
 /*     case 0x000A: * LINE FEED */
-/*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
 /*     case 0x000C: * FORM FEED */
 /*     case 0x000D: * CARRIAGE RETURN */
     0, 1, 1, 1, 1, 1, 0, 0,
@@ -147,8 +147,10 @@
 static unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
 /*         0x000D, * CARRIAGE RETURN */
-    0, 0, 1, 0, 0, 1, 0, 0,
+    0, 0, 1, 1, 1, 1, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x001C, * FILE SEPARATOR */
 /*         0x001D, * GROUP SEPARATOR */
Index: Objects/unicodetype_db.h
===================================================================
--- Objects/unicodetype_db.h (revision 79081)
+++ Objects/unicodetype_db.h (working copy)
@@ -661,7 +661,7 @@
 };
 
 static unsigned char index2[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14,
     14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
@@ -3313,13 +3313,16 @@
 #endif
 }
 
-/* Returns 1 for Unicode characters having the category 'Zl',
- * 'Zp' or type 'B', 0 otherwise.
+/* Returns 1 for Unicode characters having the line break
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
  */
 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
     switch (ch) {
     case 0x000A:
+    case 0x000B:
+    case 0x000C:
     case 0x000D:
     case 0x001C:
     case 0x001D:
Index: Tools/unicode/makeunicodedata.py
===================================================================
--- Tools/unicode/makeunicodedata.py (revision 79081)
+++ Tools/unicode/makeunicodedata.py (working copy)
@@ -36,6 +36,7 @@
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -50,6 +51,8 @@
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
 
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -71,7 +74,8 @@
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
                           UNIHAN % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+                          DERIVEDNORMALIZATION_PROPS % version,
+                          LINE_BREAK % version)
 
     print len(filter(None, unicode.table)), "characters"
 
@@ -113,7 +117,7 @@
             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
             mirrored = record[9] == "Y"
             eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[16]
+            normalizationquickcheck = record[17]
             item = (
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
@@ -365,13 +369,14 @@
             # extract database properties
             category = record[2]
             bidirectional = record[4]
+            properties = record[16]
             flags = 0
             delta = True
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK
             if category == "Ll":
                 flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                 flags |= LINEBREAK_MASK
                 linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -524,8 +529,9 @@
     print >>fp
 
     # Generate code for _PyUnicode_IsLinebreak()
-    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
-    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, "/* Returns 1 for Unicode characters having the line break"
+    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
+    print >>fp, " * type 'B', 0 otherwise."
     print >>fp, " */"
     print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
     print >>fp, '{'
@@ -787,6 +793,9 @@
                     elif k == 14:
                         # change to simple titlecase mapping; ignore
                         pass
+                    elif k == 16:
+                        # change to properties; not yet
+                        pass
                     else:
                         class Difference(Exception):pass
                         raise Difference, (hex(i), k, old.table[i], new.table[i])
@@ -803,9 +812,15 @@
 # load a unicode-data file from disk
 
 class UnicodeData:
+    # Record structure:
+    # [ID, name, category, combining, bidi, decomp, (6)
+    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+    #  properties] (17)
 
     def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivednormalizationprops=None, expand=1):
+                 derivednormalizationprops=None, linebreakprops=None,
+                 expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -868,6 +883,30 @@
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(widths[i])
+
+        # give every record a (possibly empty) set of extra properties
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(set())
+        if linebreakprops:
+            # tag code points whose line break class is a mandatory break
+            lb_file = open(linebreakprops)
+            try:
+                for s in lb_file:
+                    s = s.partition('#')[0]
+                    s = [i.strip() for i in s.split(';')]
+                    if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                        continue
+                    if '..' not in s[0]:
+                        first = last = int(s[0], 16)
+                    else:
+                        first, last = [int(c, 16) for c in s[0].split('..')]
+                    for char in range(first, last+1):
+                        table[char][-1].add('Line_Break')
+            finally:
+                # close the handle even if a line fails to parse
+                lb_file.close()
+
         if derivednormalizationprops:
             quickchecks = [0] * 0x110000 # default is Yes
             qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()