diff -ur Python-3.0a4-orig/Objects/unicodeobject.c Python-3.0a4/Objects/unicodeobject.c --- Python-3.0a4-orig/Objects/unicodeobject.c 2008-04-14 15:14:55.671875000 +0900 +++ Python-3.0a4/Objects/unicodeobject.c 2008-04-15 20:26:39.468750000 +0900 @@ -7460,6 +7460,59 @@ return result; } + +/* XXX: The following code is copied from Modules/unicodedata.c. + Unicodedata should become part of the Python Unicode API. +*/ + +typedef struct { + const unsigned char category; /* index into + _PyUnicode_CategoryNames */ + const unsigned char combining; /* combining class value 0 - 255 */ + const unsigned char bidirectional; /* index into + _PyUnicode_BidirectionalNames */ + const unsigned char mirrored; /* true if mirrored in bidirectional mode */ + const unsigned char east_asian_width; /* index into + _PyUnicode_EastAsianWidth */ +} _PyUnicode_DatabaseRecord; + +typedef struct change_record { + /* the order of the fields should be the same as in merge_old_version */ + const unsigned char bidir_changed; + const unsigned char category_changed; + const unsigned char decimal_changed; + const int numeric_changed; +} change_record; + +/* data file generated by Tools/unicode/makeunicodedata.py */ +#include "../Modules/unicodedata_db.h" + +/* End of the code copied from Modules/unicodedata.c */ + + +/* Returns 1 for Unicode characters having the category 'Z*' or 'C*', + 0 otherwise. 
+*/ +Py_LOCAL_INLINE(int) is_hex_repr(Py_UNICODE code) { + int index; + unsigned char category; + const char *category_name; + + if (code >= 0x110000) + return 0; + + index = index1[(code>>SHIFT)]; + index = index2[(index<= 0x10000) { - *p++ = '\\'; - *p++ = 'U'; - *p++ = hexdigits[(ch >> 28) & 0x0000000F]; - *p++ = hexdigits[(ch >> 24) & 0x0000000F]; - *p++ = hexdigits[(ch >> 20) & 0x0000000F]; - *p++ = hexdigits[(ch >> 16) & 0x0000000F]; - *p++ = hexdigits[(ch >> 12) & 0x0000000F]; - *p++ = hexdigits[(ch >> 8) & 0x0000000F]; - *p++ = hexdigits[(ch >> 4) & 0x0000000F]; - *p++ = hexdigits[ch & 0x0000000F]; - continue; - } -#else - /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ - else if (ch >= 0xD800 && ch < 0xDC00) { - Py_UNICODE ch2; - Py_UCS4 ucs; - - ch2 = *s++; - size--; - if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { - ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; - *p++ = '\\'; - *p++ = 'U'; - *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; - *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; - *p++ = hexdigits[ucs & 0x0000000F]; - continue; - } - /* Fall through: isolated surrogates are copied as-is */ - s--; - size++; - } -#endif - - /* Map 16-bit characters to '\uxxxx' */ - if (ch >= 256) { - *p++ = '\\'; - *p++ = 'u'; - *p++ = hexdigits[(ch >> 12) & 0x000F]; - *p++ = hexdigits[(ch >> 8) & 0x000F]; - *p++ = hexdigits[(ch >> 4) & 0x000F]; - *p++ = hexdigits[ch & 0x000F]; - } - /* Map special whitespace to '\t', \n', '\r' */ - else if (ch == '\t') { + if (ch == '\t') { *p++ = '\\'; *p++ = 't'; } @@ -7578,18 +7578,71 @@ *p++ = '\\'; *p++ = 'r'; } - /* Map non-printable US ASCII to '\xhh' */ - else if (ch < ' ' || ch >= 0x7F) { + else if (ch < ' ' || ch == 0x7F) { *p++ = '\\'; *p++ = 'x'; *p++ = hexdigits[(ch >> 4) & 0x000F]; *p++ = 
hexdigits[ch & 0x000F]; } - - /* Copy everything else as-is */ - else - *p++ = (char) ch; + /* Copy ASCII characters as-is */ + else if (ch < 0x7F) { + *p++ = ch; + } + /* Non-ASCII characters */ + else { + Py_UCS4 ucs = ch; + +#ifndef Py_UNICODE_WIDE + Py_UNICODE ch2; + /* Get code point from surrogate pair */ + if (size > 0) { + ch2 = *s; + if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 + && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + + 0x00010000; + s++; + size--; + } + } +#endif + /* Map Unicode whitespace and control characters + (categories Z* and C*) + */ + if (is_hex_repr(ucs)) { + /* Map 21-bit characters to '\U00xxxxxx' */ + if (ucs >= 0x10000) { + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; + *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; + *p++ = hexdigits[ucs & 0x0000000F]; + } + /* Map non-printable characters to '\uxxxx' */ + else { + *p++ = '\\'; + *p++ = 'u'; + *p++ = hexdigits[(ucs >> 12) & 0x000F]; + *p++ = hexdigits[(ucs >> 8) & 0x000F]; + *p++ = hexdigits[(ucs >> 4) & 0x000F]; + *p++ = hexdigits[ucs & 0x000F]; + } + } + /* Copy characters as-is */ + else { + *p++ = ch; +#ifndef Py_UNICODE_WIDE + if (ucs >= 0x10000) + *p++ = ch2; +#endif + } + } } /* Add quote */ *p++ = PyUnicode_AS_UNICODE(repr)[0]; diff -ur Python-3.0a4-orig/Python/pythonrun.c Python-3.0a4/Python/pythonrun.c --- Python-3.0a4-orig/Python/pythonrun.c 2008-03-27 08:25:24.000000000 +0900 +++ Python-3.0a4/Python/pythonrun.c 2008-04-15 20:24:58.203125000 +0900 @@ -819,7 +819,7 @@ #endif } else { - if (!(std = PyFile_FromFd(fd, "", "w", -1, NULL, NULL, + if (!(std = PyFile_FromFd(fd, "", "w", -1, NULL, "backslashreplace", "\n", 0))) { goto error; }