Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(161427)

Delta Between Two Patch Sets: Objects/stringlib/codecs.h

Issue 15027: Faster UTF-32 encoding
Left Patch Set: Created 6 years, 9 months ago
Right Patch Set: Created 5 years, 7 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | Objects/unicodeobject.c » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 /* stringlib: codec implementations */ 1 /* stringlib: codec implementations */
2 2
3 #if STRINGLIB_IS_UNICODE 3 #if STRINGLIB_IS_UNICODE
4 4
5 /* Mask to quickly check whether a C 'long' contains a 5 /* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */ 6 non-ASCII, UTF8-encoded char. */
7 #if (SIZEOF_LONG == 8) 7 #if (SIZEOF_LONG == 8)
8 # define ASCII_CHAR_MASK 0x8080808080808080UL 8 # define ASCII_CHAR_MASK 0x8080808080808080UL
9 #elif (SIZEOF_LONG == 4) 9 #elif (SIZEOF_LONG == 4)
10 # define ASCII_CHAR_MASK 0x80808080UL 10 # define ASCII_CHAR_MASK 0x80808080UL
(...skipping 20 matching lines...) Expand all
31 if (ch < 0x80) { 31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8 32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII 33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking 34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain. 35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have 36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads. 37 a penalty for unaligned reads.
38 */ 38 */
39 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { 39 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
40 /* Help register allocation */ 40 /* Help register allocation */
41 register const char *_s = s; 41 const char *_s = s;
42 register STRINGLIB_CHAR *_p = p; 42 STRINGLIB_CHAR *_p = p;
43 while (_s < aligned_end) { 43 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes), 44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII 45 and do a fast unrolled copy if it only contains ASCII
46 characters. */ 46 characters. */
47 unsigned long value = *(unsigned long *) _s; 47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK) 48 if (value & ASCII_CHAR_MASK)
49 break; 49 break;
50 #if PY_LITTLE_ENDIAN 50 #if PY_LITTLE_ENDIAN
51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); 51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); 52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
84 break; 84 break;
85 ch = (unsigned char)*s; 85 ch = (unsigned char)*s;
86 } 86 }
87 if (ch < 0x80) { 87 if (ch < 0x80) {
88 s++; 88 s++;
89 *p++ = ch; 89 *p++ = ch;
90 continue; 90 continue;
91 } 91 }
92 } 92 }
93 93
94 if (ch < 0xC2) {
95 /* invalid sequence
96 \x80-\xBF -- continuation byte
97 \xC0-\xC1 -- fake 0000-007F */
98 goto InvalidStart;
99 }
100
101 if (ch < 0xE0) { 94 if (ch < 0xE0) {
102 /* \xC2\x80-\xDF\xBF -- 0080-07FF */ 95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
103 Py_UCS4 ch2; 96 Py_UCS4 ch2;
97 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
104 if (end - s < 2) { 103 if (end - s < 2) {
105 /* unexpected end of data: the caller will decide whether 104 /* unexpected end of data: the caller will decide whether
106 it's an error or not */ 105 it's an error or not */
107 break; 106 break;
108 } 107 }
109 ch2 = (unsigned char)s[1]; 108 ch2 = (unsigned char)s[1];
110 if (!IS_CONTINUATION_BYTE(ch2)) 109 if (!IS_CONTINUATION_BYTE(ch2))
111 /* invalid continuation byte */ 110 /* invalid continuation byte */
112 goto InvalidContinuation; 111 goto InvalidContinuation1;
113 ch = (ch << 6) + ch2 - 112 ch = (ch << 6) + ch2 -
114 ((0xC0 << 6) + 0x80); 113 ((0xC0 << 6) + 0x80);
115 assert ((ch > 0x007F) && (ch <= 0x07FF)); 114 assert ((ch > 0x007F) && (ch <= 0x07FF));
116 s += 2; 115 s += 2;
117 if (STRINGLIB_MAX_CHAR <= 0x007F || 116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
118 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) 117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
119 goto Overflow; 118 /* Out-of-range */
119 goto Return;
120 *p++ = ch; 120 *p++ = ch;
121 continue; 121 continue;
122 } 122 }
123 123
124 if (ch < 0xF0) { 124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ 125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3; 126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) { 127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether 128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */ 129 it's an error or not */
130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
130 break; 137 break;
131 } 138 }
132 ch2 = (unsigned char)s[1]; 139 ch2 = (unsigned char)s[1];
133 ch3 = (unsigned char)s[2]; 140 ch3 = (unsigned char)s[2];
134 if (!IS_CONTINUATION_BYTE(ch2) || 141 if (!IS_CONTINUATION_BYTE(ch2)) {
135 !IS_CONTINUATION_BYTE(ch3)) {
136 /* invalid continuation byte */ 142 /* invalid continuation byte */
137 goto InvalidContinuation; 143 goto InvalidContinuation1;
138 } 144 }
139 if (ch == 0xE0) { 145 if (ch == 0xE0) {
140 if (ch2 < 0xA0) 146 if (ch2 < 0xA0)
141 /* invalid sequence 147 /* invalid sequence
142 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ 148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
143 goto InvalidContinuation; 149 goto InvalidContinuation1;
144 } 150 } else if (ch == 0xED && ch2 >= 0xA0) {
145 else if (ch == 0xED && ch2 > 0x9F) {
146 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF 151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
147 will result in surrogates in range D800-DFFF. Surrogates are 152 will result in surrogates in range D800-DFFF. Surrogates are
148 not valid UTF-8 so they are rejected. 153 not valid UTF-8 so they are rejected.
149 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
150 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
151 goto InvalidContinuation; 156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
152 } 161 }
153 ch = (ch << 12) + (ch2 << 6) + ch3 - 162 ch = (ch << 12) + (ch2 << 6) + ch3 -
154 ((0xE0 << 12) + (0x80 << 6) + 0x80); 163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
155 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
156 s += 3; 165 s += 3;
157 if (STRINGLIB_MAX_CHAR <= 0x07FF || 166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
158 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) 167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
159 goto Overflow; 168 /* Out-of-range */
169 goto Return;
160 *p++ = ch; 170 *p++ = ch;
161 continue; 171 continue;
162 } 172 }
163 173
164 if (ch < 0xF5) { 174 if (ch < 0xF5) {
165 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ 175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
166 Py_UCS4 ch2, ch3, ch4; 176 Py_UCS4 ch2, ch3, ch4;
167 if (end - s < 4) { 177 if (end - s < 4) {
168 /* unexpected end of data: the caller will decide whether 178 /* unexpected end of data: the caller will decide whether
169 it's an error or not */ 179 it's an error or not */
180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
170 break; 192 break;
171 } 193 }
172 ch2 = (unsigned char)s[1]; 194 ch2 = (unsigned char)s[1];
173 ch3 = (unsigned char)s[2]; 195 ch3 = (unsigned char)s[2];
174 ch4 = (unsigned char)s[3]; 196 ch4 = (unsigned char)s[3];
175 if (!IS_CONTINUATION_BYTE(ch2) || 197 if (!IS_CONTINUATION_BYTE(ch2)) {
176 !IS_CONTINUATION_BYTE(ch3) ||
177 !IS_CONTINUATION_BYTE(ch4)) {
178 /* invalid continuation byte */ 198 /* invalid continuation byte */
179 goto InvalidContinuation; 199 goto InvalidContinuation1;
180 } 200 }
181 if (ch == 0xF0) { 201 if (ch == 0xF0) {
182 if (ch2 < 0x90) 202 if (ch2 < 0x90)
183 /* invalid sequence 203 /* invalid sequence
184 \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */ 204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
185 goto InvalidContinuation; 205 goto InvalidContinuation1;
186 } 206 } else if (ch == 0xF4 && ch2 >= 0x90) {
187 else if (ch == 0xF4 && ch2 > 0x8F) {
188 /* invalid sequence 207 /* invalid sequence
189 \xF4\x90\x80\x80- -- 110000- overflow */ 208 \xF4\x90\x80\x80- -- 110000- overflow */
190 goto InvalidContinuation; 209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
191 } 218 }
192 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
193 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); 220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
194 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); 221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
195 s += 4; 222 s += 4;
196 if (STRINGLIB_MAX_CHAR <= 0xFFFF || 223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
197 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) 224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
198 goto Overflow; 225 /* Out-of-range */
226 goto Return;
199 *p++ = ch; 227 *p++ = ch;
200 continue; 228 continue;
201 } 229 }
202 goto InvalidStart; 230 goto InvalidStart;
203 } 231 }
204 ch = 0; 232 ch = 0;
205 Overflow:
206 Return: 233 Return:
207 *inptr = s; 234 *inptr = s;
208 *outpos = p - dest; 235 *outpos = p - dest;
209 return ch; 236 return ch;
210 InvalidStart: 237 InvalidStart:
211 ch = 1; 238 ch = 1;
212 goto Return; 239 goto Return;
213 InvalidContinuation: 240 InvalidContinuation1:
214 ch = 2; 241 ch = 2;
215 goto Return; 242 goto Return;
243 InvalidContinuation2:
244 ch = 3;
245 goto Return;
246 InvalidContinuation3:
247 ch = 4;
248 goto Return;
216 } 249 }
217 250
218 #undef ASCII_CHAR_MASK 251 #undef ASCII_CHAR_MASK
219 #undef IS_CONTINUATION_BYTE
220 252
221 253
222 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow 254 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
223 PyUnicode_READ() macro. Delete some parts of the code depending on the kind: 255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
224 UCS-1 strings don't need to handle surrogates for example. */ 256 UCS-1 strings don't need to handle surrogates for example. */
225 Py_LOCAL_INLINE(PyObject *) 257 Py_LOCAL_INLINE(PyObject *)
226 STRINGLIB(utf8_encoder)(PyObject *unicode, 258 STRINGLIB(utf8_encoder)(PyObject *unicode,
227 STRINGLIB_CHAR *data, 259 STRINGLIB_CHAR *data,
228 Py_ssize_t size, 260 Py_ssize_t size,
229 const char *errors) 261 const char *errors)
(...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after
460 int ihi = !native_ordering, ilo = !!native_ordering; 492 int ihi = !native_ordering, ilo = !!native_ordering;
461 #endif 493 #endif
462 --e; 494 --e;
463 495
464 while (q < e) { 496 while (q < e) {
465 Py_UCS4 ch2; 497 Py_UCS4 ch2;
466 /* First check for possible aligned read of a C 'long'. Unaligned 498 /* First check for possible aligned read of a C 'long'. Unaligned
467 reads are more expensive, better to defer to another iteration. */ 499 reads are more expensive, better to defer to another iteration. */
468 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { 500 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
469 /* Fast path for runs of in-range non-surrogate chars. */ 501 /* Fast path for runs of in-range non-surrogate chars. */
470 register const unsigned char *_q = q; 502 const unsigned char *_q = q;
471 while (_q < aligned_end) { 503 while (_q < aligned_end) {
472 unsigned long block = * (unsigned long *) _q; 504 unsigned long block = * (unsigned long *) _q;
473 if (native_ordering) { 505 if (native_ordering) {
474 /* Can use buffer directly */ 506 /* Can use buffer directly */
475 if (block & FAST_CHAR_MASK) 507 if (block & FAST_CHAR_MASK)
476 break; 508 break;
477 } 509 }
478 else { 510 else {
479 /* Need to byte-swap */ 511 /* Need to byte-swap */
480 if (block & SWAB(FAST_CHAR_MASK)) 512 if (block & SWAB(FAST_CHAR_MASK))
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
557 IllegalSurrogate: 589 IllegalSurrogate:
558 ch = 3; 590 ch = 3;
559 goto Return; 591 goto Return;
560 } 592 }
561 #undef UCS2_REPEAT_MASK 593 #undef UCS2_REPEAT_MASK
562 #undef FAST_CHAR_MASK 594 #undef FAST_CHAR_MASK
563 #undef STRIPPED_MASK 595 #undef STRIPPED_MASK
564 #undef SWAB 596 #undef SWAB
565 597
566 598
567 Py_LOCAL_INLINE(void) 599 #if STRINGLIB_MAX_CHAR >= 0x80
568 STRINGLIB(utf16_encode)(unsigned short *out, 600 Py_LOCAL_INLINE(Py_ssize_t)
569 const STRINGLIB_CHAR *in, 601 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
570 Py_ssize_t len, 602 Py_ssize_t len,
603 unsigned short **outptr,
571 int native_ordering) 604 int native_ordering)
572 { 605 {
606 unsigned short *out = *outptr;
573 const STRINGLIB_CHAR *end = in + len; 607 const STRINGLIB_CHAR *end = in + len;
574 #if STRINGLIB_SIZEOF_CHAR == 1 608 #if STRINGLIB_SIZEOF_CHAR == 1
575 # define SWAB2(CH) ((CH) << 8)
576 #else
577 # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
578 #endif
579 #if STRINGLIB_MAX_CHAR < 0x10000
580 if (native_ordering) { 609 if (native_ordering) {
581 # if STRINGLIB_SIZEOF_CHAR == 2 610 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
582 Py_MEMCPY(out, in, 2 * len); 611 while (in < unrolled_end) {
583 # else 612 out[0] = in[0];
584 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); 613 out[1] = in[1];
585 # endif 614 out[2] = in[2];
615 out[3] = in[3];
616 in += 4; out += 4;
617 }
618 while (in < end) {
619 *out++ = *in++;
620 }
586 } else { 621 } else {
622 # define SWAB2(CH) ((CH) << 8) /* high byte is zero */
587 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 623 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
588 while (in < unrolled_end) { 624 while (in < unrolled_end) {
589 out[0] = SWAB2(in[0]); 625 out[0] = SWAB2(in[0]);
590 out[1] = SWAB2(in[1]); 626 out[1] = SWAB2(in[1]);
591 out[2] = SWAB2(in[2]); 627 out[2] = SWAB2(in[2]);
592 out[3] = SWAB2(in[3]); 628 out[3] = SWAB2(in[3]);
593 in += 4; out += 4; 629 in += 4; out += 4;
594 } 630 }
595 while (in < end) { 631 while (in < end) {
596 *out++ = SWAB2(*in); 632 Py_UCS4 ch = *in++;
597 ++in; 633 *out++ = SWAB2((Py_UCS2)ch);
598 } 634 }
599 } 635 #undef SWAB2
636 }
637 *outptr = out;
638 return len;
600 #else 639 #else
601 if (native_ordering) { 640 if (native_ordering) {
641 #if STRINGLIB_MAX_CHAR < 0x10000
642 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
643 while (in < unrolled_end) {
644 /* check if any character is a surrogate character */
645 if (((in[0] ^ 0xd800) &
646 (in[1] ^ 0xd800) &
647 (in[2] ^ 0xd800) &
648 (in[3] ^ 0xd800) & 0xf800) == 0)
649 break;
650 out[0] = in[0];
651 out[1] = in[1];
652 out[2] = in[2];
653 out[3] = in[3];
654 in += 4; out += 4;
655 }
656 #endif
602 while (in < end) { 657 while (in < end) {
603 Py_UCS4 ch = *in++; 658 Py_UCS4 ch;
604 if (ch < 0x10000) 659 ch = *in++;
660 if (ch < 0xd800)
605 *out++ = ch; 661 *out++ = ch;
606 else { 662 else if (ch < 0xe000)
663 /* reject surrogate characters (U+D800-U+DFFF) */
664 goto fail;
665 #if STRINGLIB_MAX_CHAR >= 0x10000
666 else if (ch >= 0x10000) {
607 out[0] = Py_UNICODE_HIGH_SURROGATE(ch); 667 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
608 out[1] = Py_UNICODE_LOW_SURROGATE(ch); 668 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
609 out += 2; 669 out += 2;
610 } 670 }
671 #endif
672 else
673 *out++ = ch;
611 } 674 }
612 } else { 675 } else {
676 #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
677 #if STRINGLIB_MAX_CHAR < 0x10000
678 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
679 while (in < unrolled_end) {
680 /* check if any character is a surrogate character */
681 if (((in[0] ^ 0xd800) &
682 (in[1] ^ 0xd800) &
683 (in[2] ^ 0xd800) &
684 (in[3] ^ 0xd800) & 0xf800) == 0)
685 break;
686 out[0] = SWAB2(in[0]);
687 out[1] = SWAB2(in[1]);
688 out[2] = SWAB2(in[2]);
689 out[3] = SWAB2(in[3]);
690 in += 4; out += 4;
691 }
692 #endif
613 while (in < end) { 693 while (in < end) {
614 Py_UCS4 ch = *in++; 694 Py_UCS4 ch = *in++;
615 if (ch < 0x10000) 695 if (ch < 0xd800)
616 *out++ = SWAB2((Py_UCS2)ch); 696 *out++ = SWAB2((Py_UCS2)ch);
617 else { 697 else if (ch < 0xe000)
698 /* reject surrogate characters (U+D800-U+DFFF) */
699 goto fail;
700 #if STRINGLIB_MAX_CHAR >= 0x10000
701 else if (ch >= 0x10000) {
618 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); 702 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
619 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); 703 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
620 out[0] = SWAB2(ch1); 704 out[0] = SWAB2(ch1);
621 out[1] = SWAB2(ch2); 705 out[1] = SWAB2(ch2);
622 out += 2; 706 out += 2;
623 } 707 }
624 } 708 #endif
625 } 709 else
626 #endif 710 *out++ = SWAB2((Py_UCS2)ch);
711 }
627 #undef SWAB2 712 #undef SWAB2
713 }
714 *outptr = out;
715 return len;
716 fail:
717 *outptr = out;
718 return len - (end - in + 1);
719 #endif
628 } 720 }
629 721
630 722 #if STRINGLIB_SIZEOF_CHAR == 1
631 Py_LOCAL_INLINE(void) 723 # define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
632 STRINGLIB(utf32_encode)(PY_UINT32_T *out, 724 #elif STRINGLIB_SIZEOF_CHAR == 2
633 const STRINGLIB_CHAR *in, 725 # define SWAB4(CH, tmp) (tmp = (CH), \
726 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
727 /* high bytes are zero */
728 #else
729 # define SWAB4(CH, tmp) (tmp = (CH), \
730 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
731 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
732 #endif
733 Py_LOCAL_INLINE(Py_ssize_t)
734 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
634 Py_ssize_t len, 735 Py_ssize_t len,
736 PY_UINT32_T **outptr,
635 int native_ordering) 737 int native_ordering)
636 { 738 {
739 PY_UINT32_T *out = *outptr;
637 const STRINGLIB_CHAR *end = in + len; 740 const STRINGLIB_CHAR *end = in + len;
638 if (native_ordering) { 741 if (native_ordering) {
639 #if STRINGLIB_SIZEOF_CHAR == 4 742 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
640 Py_MEMCPY(out, in, 4 * len); 743 while (in < unrolled_end) {
641 #else 744 #if STRINGLIB_SIZEOF_CHAR > 1
642 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, PY_UINT32_T, in, end, out); 745 /* check if any character is a surrogate character */
643 #endif 746 if (((in[0] ^ 0xd800) &
747 (in[1] ^ 0xd800) &
748 (in[2] ^ 0xd800) &
749 (in[3] ^ 0xd800) & 0xf800) == 0)
750 break;
751 #endif
752 out[0] = in[0];
753 out[1] = in[1];
754 out[2] = in[2];
755 out[3] = in[3];
756 in += 4; out += 4;
757 }
758 while (in < end) {
759 Py_UCS4 ch;
760 ch = *in++;
761 #if STRINGLIB_SIZEOF_CHAR > 1
762 if (Py_UNICODE_IS_SURROGATE(ch)) {
763 /* reject surrogate characters (U+D800-U+DFFF) */
764 goto fail;
765 }
766 #endif
767 *out++ = ch;
768 }
644 } else { 769 } else {
645 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 770 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
646 #define SWAB4(CH, tmp) (tmp = (CH), \
647 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
648 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
649 while (in < unrolled_end) { 771 while (in < unrolled_end) {
772 #if STRINGLIB_SIZEOF_CHAR > 1
650 Py_UCS4 ch1, ch2, ch3, ch4; 773 Py_UCS4 ch1, ch2, ch3, ch4;
gregory.p.smith 2013/12/12 00:04:23 i think you want this line above the #if as these
storchaka 2013/12/12 00:21:20 They are not used when STRINGLIB_SIZEOF_CHAR == 1
gregory.p.smith 2013/12/12 04:28:05 Oh I see. Because of the macro. OK.
774 /* check if any character is a surrogate character */
775 if (((in[0] ^ 0xd800) &
776 (in[1] ^ 0xd800) &
777 (in[2] ^ 0xd800) &
778 (in[3] ^ 0xd800) & 0xf800) == 0)
779 break;
780 #endif
651 out[0] = SWAB4(in[0], ch1); 781 out[0] = SWAB4(in[0], ch1);
652 out[1] = SWAB4(in[1], ch2); 782 out[1] = SWAB4(in[1], ch2);
653 out[2] = SWAB4(in[2], ch3); 783 out[2] = SWAB4(in[2], ch3);
654 out[3] = SWAB4(in[3], ch4); 784 out[3] = SWAB4(in[3], ch4);
655 in += 4; 785 in += 4; out += 4;
656 out += 4;
657 } 786 }
658 while (in < end) { 787 while (in < end) {
659 Py_UCS4 ch; 788 Py_UCS4 ch = *in++;
660 *out++ = SWAB4(*in++, ch); 789 #if STRINGLIB_SIZEOF_CHAR > 1
661 } 790 if (Py_UNICODE_IS_SURROGATE(ch)) {
791 /* reject surrogate characters (U+D800-U+DFFF) */
792 goto fail;
793 }
794 #endif
795 *out++ = SWAB4(ch, ch);
796 }
797 }
798 *outptr = out;
799 return len;
800 #if STRINGLIB_SIZEOF_CHAR > 1
801 fail:
802 *outptr = out;
803 return len - (end - in + 1);
804 #endif
805 }
662 #undef SWAB4 806 #undef SWAB4
663 } 807
664 } 808 #endif
809
665 #endif /* STRINGLIB_IS_UNICODE */ 810 #endif /* STRINGLIB_IS_UNICODE */
LEFTRIGHT

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+