diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 5a133e32fc..f7245e4bcb 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1916,38 +1916,13 @@ expression support in the :mod:`re` module). breaks are not included in the resulting list unless *keepends* is given and true. - This method splits on the following line boundaries. In particular, the - boundaries are a superset of :term:`universal newlines`. - - +-----------------------+-----------------------------+ - | Representation | Description | - +=======================+=============================+ - | ``\n`` | Line Feed | - +-----------------------+-----------------------------+ - | ``\r`` | Carriage Return | - +-----------------------+-----------------------------+ - | ``\r\n`` | Carriage Return + Line Feed | - +-----------------------+-----------------------------+ - | ``\v`` or ``\x0b`` | Line Tabulation | - +-----------------------+-----------------------------+ - | ``\f`` or ``\x0c`` | Form Feed | - +-----------------------+-----------------------------+ - | ``\x1c`` | File Separator | - +-----------------------+-----------------------------+ - | ``\x1d`` | Group Separator | - +-----------------------+-----------------------------+ - | ``\x1e`` | Record Separator | - +-----------------------+-----------------------------+ - | ``\x85`` | Next Line (C1 Control Code) | - +-----------------------+-----------------------------+ - | ``\u2028`` | Line Separator | - +-----------------------+-----------------------------+ - | ``\u2029`` | Paragraph Separator | - +-----------------------+-----------------------------+ - - .. versionchanged:: 3.2 - - ``\v`` and ``\f`` added to list of line boundaries. + This method uses the :term:`universal newlines` approach + to splitting lines. + + .. versionchanged:: 3.8 + + Treat only ``\r`` and ``\n`` as line boundaries, matching the + behavior of :meth:`bytes.splitlines`. 
For example:: diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 00b5d317c4..b5f43fe173 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -143,14 +143,14 @@ class ReadTest(MixInCheckStateHandling): return "|".join(lines) s = "foo\nbar\r\nbaz\rspam\u2028eggs" - sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs" - sexpectednoends = "foo|bar|baz|spam|eggs" + sexpected = "foo\n|bar\r\n|baz\r|spam\u2028eggs" + sexpectednoends = "foo|bar|baz|spam\u2028eggs" self.assertEqual(readalllines(s, True), sexpected) self.assertEqual(readalllines(s, False), sexpectednoends) self.assertEqual(readalllines(s, True, 10), sexpected) self.assertEqual(readalllines(s, False, 10), sexpectednoends) - lineends = ("\n", "\r\n", "\r", "\u2028") + lineends = ("\n", "\r\n", "\r") # Test long lines (multiple calls to read() in readline()) vw = [] vwo = [] diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 170778fa97..26564c2c1f 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -317,8 +317,7 @@ class UnicodeMiscTest(UnicodeDatabaseTest): def test_linebreak_7643(self): for i in range(0x10000): lines = (chr(i) + 'A').splitlines() - if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85, - 0x1c, 0x1d, 0x1e, 0x2028, 0x2029): + if i in {0x0a, 0x0d}: self.assertEqual(len(lines), 2, r"\u%.4x should be a linebreak" % i) else: diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a797f838eb..11633ecc5a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -286,34 +286,6 @@ raise_encode_exception(PyObject **exceptionObject, Py_ssize_t startpos, Py_ssize_t endpos, const char *reason); -/* Same for linebreaks */ -static const unsigned char ascii_linebreak[] = { - 0, 0, 0, 0, 0, 0, 0, 0, -/* 0x000A, * LINE FEED */ -/* 0x000B, * LINE TABULATION */ -/* 0x000C, * FORM FEED */ -/* 0x000D, * CARRIAGE RETURN */ - 0, 0, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* 0x001C, * FILE SEPARATOR */ -/* 0x001D, * 
GROUP SEPARATOR */ -/* 0x001E, * RECORD SEPARATOR */ - 0, 0, 0, 0, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - static int convert_uc(PyObject *obj, void *addr); #include "clinic/unicodeobject.c.h" @@ -721,9 +693,7 @@ static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) -#define BLOOM_LINEBREAK(ch) \ - ((ch) < 128U ? ascii_linebreak[(ch)] : \ - (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) +#define BLOOM_LINEBREAK(ch) ((ch) == '\n' || (ch) == '\r') static inline BLOOM_MASK make_bloom_mask(int kind, void* ptr, Py_ssize_t len)