diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1131,6 +1131,8 @@ particular, the following variants typic +-----------------+--------------------------------+--------------------------------+ | utf_8 | U8, UTF, utf8 | all languages | +-----------------+--------------------------------+--------------------------------+ +| utf_8_java | | all languages | ++-----------------+--------------------------------+--------------------------------+ | utf_8_sig | | all languages | +-----------------+--------------------------------+--------------------------------+ diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -181,6 +181,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful +# define PyUnicode_DecodeUTF8JavaStateful PyUnicodeUCS2_DecodeUTF8JavaStateful # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape # define PyUnicode_Encode PyUnicodeUCS2_Encode # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII @@ -191,6 +192,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 +# define PyUnicode_EncodeUTF8Java PyUnicodeUCS2_EncodeUTF8Java # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape # define PyUnicode_Find PyUnicodeUCS2_Find # define PyUnicode_Format PyUnicodeUCS2_Format @@ -265,6 +267,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful +# define 
PyUnicode_DecodeUTF8JavaStateful PyUnicodeUCS4_DecodeUTF8JavaStateful # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape # define PyUnicode_Encode PyUnicodeUCS4_Encode # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII @@ -275,6 +278,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 +# define PyUnicode_EncodeUTF8Java PyUnicodeUCS4_EncodeUTF8Java # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape # define PyUnicode_Find PyUnicodeUCS4_Find # define PyUnicode_Format PyUnicodeUCS4_Format @@ -828,6 +832,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUT Py_ssize_t *consumed /* bytes consumed */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8JavaStateful( + const char *string, /* utf-8-java encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( PyObject *unicode /* Unicode object */ ); @@ -838,6 +849,11 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUT Py_ssize_t length, /* number of Py_UNICODE chars to encode */ const char *errors /* error handling */ ); +PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8Java( + const Py_UNICODE *data, /* Unicode char buffer */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ + const char *errors /* error handling */ + ); #endif /* --- UTF-32 Codecs ------------------------------------------------------ */ diff --git a/Lib/encodings/utf_8_java.py b/Lib/encodings/utf_8_java.py new file mode 100644 --- /dev/null +++ b/Lib/encodings/utf_8_java.py @@ -0,0 +1,36 @@ +""" Python 'utf-8-java' Codec +""" +import codecs + +### Codec APIs + +encode = codecs.utf_8_java_encode + +def decode(input, errors='strict'): + return codecs.utf_8_java_decode(input, errors, True) + +class 
IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.utf_8_java_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = codecs.utf_8_java_decode + +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_8_java_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.utf_8_java_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-8-java', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -578,7 +578,45 @@ class UTF16BETest(ReadTest): self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), "\U00010203") -class UTF8Test(ReadTest): + +class BaseUTF8Test(ReadTest): + def test_decoder_state(self): + u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" + self.check_state_handling_decode(self.encoding, + u, u.encode(self.encoding)) + + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) + self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, self.encoding) + self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), + b'[\\udc80]') + self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), + b'[&#56448;]') + self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), + b'[\x80]') + self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), + b'[]') + self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), + b'[?]') + + def test_surrogatepass_handler(self): + self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), + b"abc\xed\xa0\x80def") + self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), + "abc\ud800def") +
self.assertTrue(codecs.lookup_error("surrogatepass")) + + def test_invalid(self): + for invalid in ( + b'\xC0\x81', + b'\xC0\xFF', + b'\xC1\x10', + b'\xC1\x80', + ): + with self.assertRaises(UnicodeDecodeError): + invalid.decode(self.encoding) + +class UTF8Test(BaseUTF8Test): encoding = "utf-8" def test_partial(self): @@ -599,31 +637,35 @@ class UTF8Test(ReadTest): ] ) - def test_decoder_state(self): - u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" - self.check_state_handling_decode(self.encoding, - u, u.encode(self.encoding)) + def test_null_byte(self): + self.assertEqual('a\x00b'.encode(self.encoding), b'a\x00b') + self.assertEqual(b'a\x00b'.decode(self.encoding), 'a\x00b') - def test_lone_surrogates(self): - self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") - self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") - self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), - b'[&#56448;]') - self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), - b'[\x80]') - self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), - b'[]') - self.assertEqual("[\uDC80]".encode("utf-8", "replace"), - b'[?]') +class UTF8JavaTest(BaseUTF8Test): + encoding = "utf-8-java" - def test_surrogatepass_handler(self): - self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), - b"abc\xed\xa0\x80def") - self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), - "abc\ud800def") - self.assertTrue(codecs.lookup_error("surrogatepass")) + def test_partial(self): + self.check_partial( + "\x00\xff\u07ff\u0800\uffff", + [ + "", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff\u07ff", + "\x00\xff\u07ff", + "\x00\xff\u07ff", + "\x00\xff\u07ff\u0800", + "\x00\xff\u07ff\u0800", + "\x00\xff\u07ff\u0800", + "\x00\xff\u07ff\u0800\uffff", + ] + ) + + def test_null_byte(self): + self.assertEqual('a\x00b'.encode(self.encoding), b'a\xc0\x80b') +
self.assertEqual(b'a\xc0\x80b'.decode(self.encoding), 'a\x00b') class UTF7Test(ReadTest): encoding = "utf-7" @@ -1728,6 +1770,7 @@ def test_main(): UTF16LETest, UTF16BETest, UTF8Test, + UTF8JavaTest, UTF8SigTest, UTF7Test, UTF16ExTest, diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -295,6 +295,29 @@ utf_8_decode(PyObject *self, } static PyObject * +utf_8_java_decode(PyObject *self, + PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "y*|zi:utf_8_java_decode", + &pbuf, &errors, &final)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeUTF8JavaStateful(pbuf.buf, pbuf.len, errors, + final ? NULL : &consumed); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +static PyObject * utf_16_decode(PyObject *self, PyObject *args) { @@ -710,6 +733,28 @@ utf_8_encode(PyObject *self, return v; } +static PyObject * +utf_8_java_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_8_java_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF8Java(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + /* This version provides access to the byteorder parameter of the builtin UTF-16 codecs as optional third argument.
It defaults to 0 which means: use the native byte order and prepend the data with a @@ -1071,6 +1116,8 @@ static PyMethodDef _codecs_functions[] = {"escape_decode", escape_decode, METH_VARARGS}, {"utf_8_encode", utf_8_encode, METH_VARARGS}, {"utf_8_decode", utf_8_decode, METH_VARARGS}, + {"utf_8_java_encode", utf_8_java_encode, METH_VARARGS}, + {"utf_8_java_decode", utf_8_java_decode, METH_VARARGS}, {"utf_7_encode", utf_7_encode, METH_VARARGS}, {"utf_7_decode", utf_7_decode, METH_VARARGS}, {"utf_16_encode", utf_16_encode, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2567,6 +2567,27 @@ char utf8_code_length[256] = { 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ }; +static char utf8java_code_length[256] = { + /* similar to utf8_code_length except that utf8java_code_length[0xC0] is 2 + instead of 0 to decode {0xC0, 0x80} as U+0000 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ + 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ +}; + PyObject * PyUnicode_DecodeUTF8(const char *s, Py_ssize_t size, @@ -2588,11 +2609,12 @@ PyUnicode_DecodeUTF8(const char *s, #
error C 'long' size should be either 4 or 8! #endif -PyObject * -PyUnicode_DecodeUTF8Stateful(const char *s, - Py_ssize_t size, - const char *errors, - Py_ssize_t *consumed) +static PyObject * +decode_utf8_stateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed, + int java) { const char *starts = s; int n; @@ -2606,6 +2628,7 @@ PyUnicode_DecodeUTF8Stateful(const char const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; + char *code_length; /* Note: size will always be longer than the resulting Unicode character count */ @@ -2623,6 +2646,11 @@ PyUnicode_DecodeUTF8Stateful(const char e = s + size; aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); + if (java) + code_length = utf8java_code_length; + else + code_length = utf8_code_length; + while (s < e) { Py_UCS4 ch = (unsigned char)*s; @@ -2672,7 +2700,7 @@ PyUnicode_DecodeUTF8Stateful(const char continue; } - n = utf8_code_length[ch]; + n = code_length[ch]; if (s + n > e) { if (consumed) @@ -2702,14 +2730,13 @@ PyUnicode_DecodeUTF8Stateful(const char goto utf8Error; case 2: - if ((s[1] & 0xc0) != 0x80) { + ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); /* trail byte and range are validated just below */ + if ((s[1] & 0xc0) != 0x80 || (ch <= 0x007F && !(java && ch == 0x0000))) { errmsg = "invalid continuation byte"; startinpos = s-starts; endinpos = startinpos + 1; goto utf8Error; } - ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); - assert ((ch > 0x007F) && (ch <= 0x07FF)); *p++ = (Py_UNICODE)ch; break; @@ -2787,7 +2814,8 @@ PyUnicode_DecodeUTF8Stateful(const char outpos = p-PyUnicode_AS_UNICODE(unicode); if (unicode_decode_call_errorhandler( errors, &errorHandler, - "utf8", errmsg, + java ?
"utf-8-java" : "utf8", + errmsg, &starts, &e, &startinpos, &endinpos, &exc, &s, &unicode, &outpos, &p)) goto onError; @@ -2811,6 +2839,25 @@ PyUnicode_DecodeUTF8Stateful(const char return NULL; } +PyObject * +PyUnicode_DecodeUTF8Stateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_utf8_stateful(s, size, errors, consumed, 0); +} + +PyObject * +PyUnicode_DecodeUTF8JavaStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_utf8_stateful(s, size, errors, consumed, 1); +} + + #undef ASCII_CHAR_MASK #ifdef __APPLE__ @@ -2933,10 +2980,11 @@ _Py_DecodeUTF8_surrogateescape(const cha maximum possible needed (4 result bytes per Unicode character), and return the excess memory at the end. */ -PyObject * -PyUnicode_EncodeUTF8(const Py_UNICODE *s, +static PyObject * +encode_utf8(const Py_UNICODE *s, Py_ssize_t size, - const char *errors) + const char *errors, + int java) { #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ @@ -2976,8 +3024,15 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s Py_UCS4 ch = s[i++]; if (ch < 0x80) - /* Encode ASCII */ - *p++ = (char) ch; + if (ch == 0x00 && java) { + /* Encode U+0000 as 0xC0 0x80 */ + *p++ = (char)0xC0; + *p++ = (char)0x80; + } + else { + /* Encode ASCII */ + *p++ = (char) ch; + } else if (ch < 0x0800) { /* Encode Latin-1 */ @@ -3003,7 +3058,9 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s PyObject *rep; Py_ssize_t repsize, k; rep = unicode_encode_call_errorhandler - (errors, &errorHandler, "utf-8", "surrogates not allowed", + (errors, &errorHandler, + java ? "utf-8-java" : "utf-8", + "surrogates not allowed", s, size, &exc, i-1, i, &newpos); if (!rep) goto error; @@ -3050,7 +3107,9 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s for(k=0; k