# 2011 Aug 14
# Copyright Terry Jan Reedy
# proof of concept for codepoint indexing on narrow builds
# by adjusting cp index to get corresponding code unit index

from bisect import bisect_left


class UTF16:
    """Wrap a string of UTF-16 code units and index it by code point.

    ``text`` is stored unchanged.  ``cpdex`` is a sorted sequence holding the
    code point index of every character that occupies two code units (a
    high/low surrogate pair).  A code point index is turned into a code unit
    index by adding the number of such characters that precede it.

    Defining ``__eq__`` without ``__hash__`` leaves instances unhashable.
    """

    def __init__(self, text, cpdex=None):
        """Store ``text`` and its surrogate-pair index.

        text  -- the underlying string of code units.
        cpdex -- optional precomputed index; when None it is built (and the
                 text validated) by make_cpdex().  __getitem__ passes
                 ``(0,)`` for a text that is exactly one surrogate pair.
        """
        self.text = text
        if cpdex is None:
            cpdex = self.make_cpdex()
        # else like text is surrogate pair and cpdex is (0,)
        self.cpdex = cpdex
        # Each surrogate pair uses two code units but counts as one char.
        self.len = len(text) - len(cpdex)

    def make_cpdex(self):
        """Return the list of code point indexes of surrogate pairs.

        Raises ValueError if a high surrogate is not immediately followed by
        a low surrogate (including at the end of the text), or if a low
        surrogate appears without a preceding high surrogate.
        """
        cpdex = []  # codepoint indexes of upper chars
        i = 0  # codepoint index, not incremented for 2nd surrogates
        t_iter = iter(self.text)
        for c in t_iter:
            oc = ord(c)
            if 0xD800 <= oc <= 0xDBFF:  # first surrogate
                # Consume the second code unit of the pair.  The None
                # default keeps a trailing high surrogate from leaking
                # StopIteration out of this method.
                nxt = next(t_iter, None)
                if nxt is not None and 0xDC00 <= ord(nxt) <= 0xDFFF:
                    cpdex.append(i)
                else:
                    raise ValueError("hi surrogate not paired by lo surrogate")
            elif 0xDC00 <= oc <= 0xDFFF:
                raise ValueError("lo surrogate not paired by hi surrogate")
            i += 1
        return cpdex

    def __eq__(self, other):  # temporary kludge
        """Compare the underlying text with another UTF16 or a plain string."""
        if isinstance(other, UTF16):
            other = other.text
        # otherwise, it better be a (unicode) string
        return self.text == other

    def __len__(self):
        """Return the number of code points (a surrogate pair counts once)."""
        return self.len

    def __getitem__(self, cp):
        """Return the character at code point index ``cp``.

        Negative indexes count from the end.  A character stored in one code
        unit is returned as a plain string; a surrogate pair is returned as a
        one-character UTF16 instance.  Raises IndexError when ``cp`` is out
        of range.
        """
        if cp < 0:
            # Normalize in code point space; using the negative value as a
            # code unit offset would land on the wrong unit once any
            # surrogate pair is present.
            cp += self.len
        if not 0 <= cp < self.len:
            raise IndexError("UTF16 index out of range")
        # Shift by the number of surrogate pairs that precede this index.
        cu = cp + bisect_left(self.cpdex, cp)
        c = self.text[cu]
        if 0xD800 <= ord(c) <= 0xDBFF:
            return UTF16(self.text[cu:cu + 2], (0,))
        return c

    def __repr__(self):
        """Return the repr of the underlying text."""
        return repr(self.text)

    __str__ = __repr__


# Self-test, run at import time.
# NOTE(review): on interpreters whose str is indexed by code point, these
# literals contain no surrogates, so only the plain-character path is
# exercised here.
tucs2 = 'A\U0001043cBC\U0001042f\U00010445DE\U00010428H'
tutf16 = UTF16(tucs2)
tlist = ['A', '\U0001043c', 'B', 'C', '\U0001042f', '\U00010445',
         'D', 'E', '\U00010428', 'H']
assert len(tutf16) == len(tlist)
tlis2 = [tutf16[i] for i in range(len(tlist))]
for c in tlis2:
    assert len(c) == 1
assert tlist == tlis2