Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(90248)

Side by Side Diff: Objects/stringlib/codecs.h

Issue 15027: Faster UTF-32 encoding
Patch Set: Created 5 years, 8 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | Objects/unicodeobject.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* stringlib: codec implementations */ 1 /* stringlib: codec implementations */
2 2
3 #if STRINGLIB_IS_UNICODE 3 #if STRINGLIB_IS_UNICODE
4 4
5 /* Mask to quickly check whether a C 'long' contains a 5 /* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */ 6 non-ASCII, UTF8-encoded char. */
7 #if (SIZEOF_LONG == 8) 7 #if (SIZEOF_LONG == 8)
8 # define ASCII_CHAR_MASK 0x8080808080808080UL 8 # define ASCII_CHAR_MASK 0x8080808080808080UL
9 #elif (SIZEOF_LONG == 4) 9 #elif (SIZEOF_LONG == 4)
10 # define ASCII_CHAR_MASK 0x80808080UL 10 # define ASCII_CHAR_MASK 0x80808080UL
(...skipping 700 matching lines...) Expand 10 before | Expand all | Expand 10 after
711 } 711 }
712 #undef SWAB2 712 #undef SWAB2
713 } 713 }
714 *outptr = out; 714 *outptr = out;
715 return len; 715 return len;
716 fail: 716 fail:
717 *outptr = out; 717 *outptr = out;
718 return len - (end - in + 1); 718 return len - (end - in + 1);
719 #endif 719 #endif
720 } 720 }
721
722 #if STRINGLIB_SIZEOF_CHAR == 1
723 # define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
724 #elif STRINGLIB_SIZEOF_CHAR == 2
725 # define SWAB4(CH, tmp) (tmp = (CH), \
726 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
727 /* high bytes are zero */
728 #else
729 # define SWAB4(CH, tmp) (tmp = (CH), \
730 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
731 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
732 #endif
733 Py_LOCAL_INLINE(Py_ssize_t)
734 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
735 Py_ssize_t len,
736 PY_UINT32_T **outptr,
737 int native_ordering)
738 {
739 PY_UINT32_T *out = *outptr;
740 const STRINGLIB_CHAR *end = in + len;
741 if (native_ordering) {
742 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
743 while (in < unrolled_end) {
744 #if STRINGLIB_SIZEOF_CHAR > 1
745 /* check if any character is a surrogate character */
746 if (((in[0] ^ 0xd800) &
747 (in[1] ^ 0xd800) &
748 (in[2] ^ 0xd800) &
749 (in[3] ^ 0xd800) & 0xf800) == 0)
750 break;
751 #endif
752 out[0] = in[0];
753 out[1] = in[1];
754 out[2] = in[2];
755 out[3] = in[3];
756 in += 4; out += 4;
757 }
758 while (in < end) {
759 Py_UCS4 ch;
760 ch = *in++;
761 #if STRINGLIB_SIZEOF_CHAR > 1
762 if (Py_UNICODE_IS_SURROGATE(ch)) {
763 /* reject surrogate characters (U+D800-U+DFFF) */
764 goto fail;
765 }
766 #endif
767 *out++ = ch;
768 }
769 } else {
770 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
771 while (in < unrolled_end) {
772 #if STRINGLIB_SIZEOF_CHAR > 1
773 Py_UCS4 ch1, ch2, ch3, ch4;
gregory.p.smith 2013/12/12 00:04:23 i think you want this line above the #if as these
storchaka 2013/12/12 00:21:20 They are not used when STRINGLIB_SIZEOF_CHAR == 1
gregory.p.smith 2013/12/12 04:28:05 Oh I see. Because of the macro. OK.
774 /* check if any character is a surrogate character */
775 if (((in[0] ^ 0xd800) &
776 (in[1] ^ 0xd800) &
777 (in[2] ^ 0xd800) &
778 (in[3] ^ 0xd800) & 0xf800) == 0)
779 break;
780 #endif
781 out[0] = SWAB4(in[0], ch1);
782 out[1] = SWAB4(in[1], ch2);
783 out[2] = SWAB4(in[2], ch3);
784 out[3] = SWAB4(in[3], ch4);
785 in += 4; out += 4;
786 }
787 while (in < end) {
788 Py_UCS4 ch = *in++;
789 #if STRINGLIB_SIZEOF_CHAR > 1
790 if (Py_UNICODE_IS_SURROGATE(ch)) {
791 /* reject surrogate characters (U+D800-U+DFFF) */
792 goto fail;
793 }
794 #endif
795 *out++ = SWAB4(ch, ch);
796 }
797 }
798 *outptr = out;
799 return len;
800 #if STRINGLIB_SIZEOF_CHAR > 1
801 fail:
802 *outptr = out;
803 return len - (end - in + 1);
804 #endif
805 }
806 #undef SWAB4
807
721 #endif 808 #endif
722 809
723 #endif /* STRINGLIB_IS_UNICODE */ 810 #endif /* STRINGLIB_IS_UNICODE */
OLDNEW
« no previous file with comments | « no previous file | Objects/unicodeobject.c » ('j') | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+