Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(45987)

Side by Side Diff: Objects/unicodeobject.c

Issue 15027: Faster UTF-32 encoding
Patch Set: Created 5 years, 9 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« Objects/stringlib/codecs.h ('K') | « Objects/stringlib/codecs.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>. 4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 5
6 Major speed upgrades to the method implementations at the Reykjavik 6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 8
9 Copyright (c) Corporation for National Research Initiatives. 9 Copyright (c) Corporation for National Research Initiatives.
10 10
(...skipping 5082 matching lines...) Expand 10 before | Expand all | Expand 10 after
5093 Py_XDECREF(errorHandler); 5093 Py_XDECREF(errorHandler);
5094 Py_XDECREF(exc); 5094 Py_XDECREF(exc);
5095 return NULL; 5095 return NULL;
5096 } 5096 }
5097 5097
5098 PyObject * 5098 PyObject *
5099 _PyUnicode_EncodeUTF32(PyObject *str, 5099 _PyUnicode_EncodeUTF32(PyObject *str,
5100 const char *errors, 5100 const char *errors,
5101 int byteorder) 5101 int byteorder)
5102 { 5102 {
5103 int kind; 5103 enum PyUnicode_Kind kind;
5104 void *data; 5104 const void *data;
5105 Py_ssize_t len; 5105 Py_ssize_t len;
5106 PyObject *v; 5106 PyObject *v;
5107 unsigned char *p; 5107 PY_UINT32_T *out;
5108 Py_ssize_t nsize, i;
5109 /* Offsets from p for storing byte pairs in the right order. */
5110 #if PY_LITTLE_ENDIAN 5108 #if PY_LITTLE_ENDIAN
5111 int iorder[] = {0, 1, 2, 3}; 5109 int native_ordering = byteorder <= 0;
5112 #else 5110 #else
5113 int iorder[] = {3, 2, 1, 0}; 5111 int native_ordering = byteorder >= 0;
5114 #endif 5112 #endif
5115 const char *encoding; 5113 const char *encoding;
5114 Py_ssize_t nsize, pos;
5116 PyObject *errorHandler = NULL; 5115 PyObject *errorHandler = NULL;
5117 PyObject *exc = NULL; 5116 PyObject *exc = NULL;
5118 PyObject *rep = NULL; 5117 PyObject *rep = NULL;
5119 5118
5120 #define STORECHAR(CH) \
5121 do { \
5122 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5123 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5124 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5125 p[iorder[0]] = (CH) & 0xff; \
5126 p += 4; \
5127 } while(0)
5128
5129 if (!PyUnicode_Check(str)) { 5119 if (!PyUnicode_Check(str)) {
5130 PyErr_BadArgument(); 5120 PyErr_BadArgument();
5131 return NULL; 5121 return NULL;
5132 } 5122 }
5133 if (PyUnicode_READY(str) == -1) 5123 if (PyUnicode_READY(str) == -1)
5134 return NULL; 5124 return NULL;
5135 kind = PyUnicode_KIND(str); 5125 kind = PyUnicode_KIND(str);
5136 data = PyUnicode_DATA(str); 5126 data = PyUnicode_DATA(str);
5137 len = PyUnicode_GET_LENGTH(str); 5127 len = PyUnicode_GET_LENGTH(str);
5138 5128
5129 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5130 return PyErr_NoMemory();
5139 nsize = len + (byteorder == 0); 5131 nsize = len + (byteorder == 0);
5140 if (nsize > PY_SSIZE_T_MAX / 4)
5141 return PyErr_NoMemory();
5142 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5132 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5143 if (v == NULL) 5133 if (v == NULL)
5144 return NULL; 5134 return NULL;
5145 5135
5146 p = (unsigned char *)PyBytes_AS_STRING(v); 5136 /* output buffer is 4-bytes aligned */
5137 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5138 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
5147 if (byteorder == 0) 5139 if (byteorder == 0)
5148 STORECHAR(0xFEFF); 5140 *out++ = 0xFEFF;
5149 if (len == 0) 5141 if (len == 0)
5150 return v; 5142 goto done;
5151 5143
5152 if (byteorder == -1) { 5144 if (byteorder == -1)
5153 /* force LE */
5154 iorder[0] = 0;
5155 iorder[1] = 1;
5156 iorder[2] = 2;
5157 iorder[3] = 3;
5158 encoding = "utf-32-le"; 5145 encoding = "utf-32-le";
5159 } 5146 else if (byteorder == 1)
5160 else if (byteorder == 1) {
5161 /* force BE */
5162 iorder[0] = 3;
5163 iorder[1] = 2;
5164 iorder[2] = 1;
5165 iorder[3] = 0;
5166 encoding = "utf-32-be"; 5147 encoding = "utf-32-be";
5167 }
5168 else 5148 else
5169 encoding = "utf-32"; 5149 encoding = "utf-32";
5170 5150
5171 if (kind == PyUnicode_1BYTE_KIND) { 5151 if (kind == PyUnicode_1BYTE_KIND) {
5172 for (i = 0; i < len; i++) 5152 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5173 STORECHAR(PyUnicode_READ(kind, data, i)); 5153 goto done;
5174 return v;
5175 } 5154 }
5176 5155
5177 for (i = 0; i < len;) { 5156 pos = 0;
5157 while (pos < len) {
5178 Py_ssize_t repsize, moreunits; 5158 Py_ssize_t repsize, moreunits;
5179 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5159
5180 i++; 5160 if (kind == PyUnicode_2BYTE_KIND) {
5181 assert(ch <= MAX_UNICODE); 5161 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5182 if (!Py_UNICODE_IS_SURROGATE(ch)) { 5162 &out, native_ordering);
5183 STORECHAR(ch);
5184 continue;
5185 } 5163 }
5164 else {
5165 assert(kind == PyUnicode_4BYTE_KIND);
5166 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5167 &out, native_ordering);
5168 }
5169 if (pos == len)
5170 break;
5186 5171
5187 rep = unicode_encode_call_errorhandler( 5172 rep = unicode_encode_call_errorhandler(
5188 errors, &errorHandler, 5173 errors, &errorHandler,
5189 encoding, "surrogates not allowed", 5174 encoding, "surrogates not allowed",
5190 str, &exc, i-1, i, &i); 5175 str, &exc, pos, pos + 1, &pos);
5191
5192 if (!rep) 5176 if (!rep)
5193 goto error; 5177 goto error;
5194 5178
5195 if (PyBytes_Check(rep)) { 5179 if (PyBytes_Check(rep)) {
5196 repsize = PyBytes_GET_SIZE(rep); 5180 repsize = PyBytes_GET_SIZE(rep);
5197 if (repsize & 3) { 5181 if (repsize & 3) {
5198 raise_encode_exception(&exc, encoding, 5182 raise_encode_exception(&exc, encoding,
5199 str, i - 1, i, 5183 str, pos - 1, pos,
5200 "surrogates not allowed"); 5184 "surrogates not allowed");
5201 goto error; 5185 goto error;
5202 } 5186 }
5203 moreunits = repsize / 4; 5187 moreunits = repsize / 4;
5204 } 5188 }
5205 else { 5189 else {
5206 assert(PyUnicode_Check(rep)); 5190 assert(PyUnicode_Check(rep));
5207 if (PyUnicode_READY(rep) < 0) 5191 if (PyUnicode_READY(rep) < 0)
5208 goto error; 5192 goto error;
5209 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5193 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5210 if (!PyUnicode_IS_ASCII(rep)) { 5194 if (!PyUnicode_IS_ASCII(rep)) {
5211 raise_encode_exception(&exc, encoding, 5195 raise_encode_exception(&exc, encoding,
5212 str, i - 1, i, 5196 str, pos - 1, pos,
5213 "surrogates not allowed"); 5197 "surrogates not allowed");
5214 goto error; 5198 goto error;
5215 } 5199 }
5216 } 5200 }
5217 5201
5218 /* four bytes are reserved for each surrogate */ 5202 /* four bytes are reserved for each surrogate */
5219 if (moreunits > 1) { 5203 if (moreunits > 1) {
5220 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); 5204 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
5221 Py_ssize_t morebytes = 4 * (moreunits - 1); 5205 Py_ssize_t morebytes = 4 * (moreunits - 1);
5222 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5206 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5223 /* integer overflow */ 5207 /* integer overflow */
5224 PyErr_NoMemory(); 5208 PyErr_NoMemory();
5225 goto error; 5209 goto error;
5226 } 5210 }
5227 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5211 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5228 goto error; 5212 goto error;
5229 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; 5213 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
5230 } 5214 }
5231 5215
5232 if (PyBytes_Check(rep)) { 5216 if (PyBytes_Check(rep)) {
5233 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); 5217 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5234 p += repsize; 5218 out += moreunits;
5235 } else /* rep is unicode */ { 5219 } else /* rep is unicode */ {
5236 const Py_UCS1 *repdata;
5237 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5220 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5238 repdata = PyUnicode_1BYTE_DATA(rep); 5221 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5239 while (repsize--) { 5222 &out, native_ordering);
5240 Py_UCS4 ch = *repdata++;
5241 STORECHAR(ch);
5242 }
5243 } 5223 }
5244 5224
5245 Py_CLEAR(rep); 5225 Py_CLEAR(rep);
5246 } 5226 }
5247 5227
5248 /* Cut back to size actually needed. This is necessary, for example, 5228 /* Cut back to size actually needed. This is necessary, for example,
5249 when encoding a string containing isolated surrogates with the 'ignore' 5229 when encoding a string containing isolated surrogates with the 'ignore'
5250 handler. */ 5230 handler. */
5251 nsize = p - (unsigned char*) PyBytes_AS_STRING(v); 5231 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5252 if (nsize != PyBytes_GET_SIZE(v)) 5232 if (nsize != PyBytes_GET_SIZE(v))
5253 _PyBytes_Resize(&v, nsize); 5233 _PyBytes_Resize(&v, nsize);
5254 Py_XDECREF(errorHandler); 5234 Py_XDECREF(errorHandler);
5255 Py_XDECREF(exc); 5235 Py_XDECREF(exc);
5236 done:
5256 return v; 5237 return v;
5257 error: 5238 error:
5258 Py_XDECREF(rep); 5239 Py_XDECREF(rep);
5259 Py_XDECREF(errorHandler); 5240 Py_XDECREF(errorHandler);
5260 Py_XDECREF(exc); 5241 Py_XDECREF(exc);
5261 Py_XDECREF(v); 5242 Py_XDECREF(v);
5262 return NULL; 5243 return NULL;
5263 #undef STORECHAR
5264 } 5244 }
5265 5245
5266 PyObject * 5246 PyObject *
5267 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5247 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5268 Py_ssize_t size, 5248 Py_ssize_t size,
5269 const char *errors, 5249 const char *errors,
5270 int byteorder) 5250 int byteorder)
5271 { 5251 {
5272 PyObject *result; 5252 PyObject *result;
5273 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5253 PyObject *tmp = PyUnicode_FromUnicode(s, size);
(...skipping 10118 matching lines...) Expand 10 before | Expand all | Expand 10 after
15392 PyMODINIT_FUNC 15372 PyMODINIT_FUNC
15393 PyInit__string(void) 15373 PyInit__string(void)
15394 { 15374 {
15395 return PyModule_Create(&_string_module); 15375 return PyModule_Create(&_string_module);
15396 } 15376 }
15397 15377
15398 15378
15399 #ifdef __cplusplus 15379 #ifdef __cplusplus
15400 } 15380 }
15401 #endif 15381 #endif
OLD | NEW
« Objects/stringlib/codecs.h ('K') | « Objects/stringlib/codecs.h ('k') | no next file » | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+