# 2011 Aug 22
# Copyright Terry Jan Reedy
# proof of concept for codepoint indexing on narrow builds
# by adjusting cp index to get corresponding code unit index

from bisect import bisect_left


class UTF16:
    """Wrap a string of UTF-16 code units and index it by code point.

    ``text`` is a str whose non-BMP characters are stored as surrogate
    pairs (two code units each).  ``cpdex`` is the sorted list of code
    point indexes at which such extended characters occur; it is what
    lets a code point index be converted to a code unit index with one
    bisect.  ``len`` is the length in code points.
    """

    def __init__(self, text, cpdex=None):
        """Store *text*; compute *cpdex* by scanning unless it is supplied.

        A caller that already knows the positions of the extended chars
        (for instance, text is one surrogate pair and cpdex is [0]) can
        pass *cpdex* to skip the scan.

        Raises ValueError (from make_cpdex) if *cpdex* is None and *text*
        contains an unpaired surrogate.
        """
        self.text = text
        if cpdex is None:
            cpdex = self.make_cpdex()
        self.cpdex = cpdex
        # Every extended char occupies two code units but is one code point.
        self.len = len(text) - len(cpdex)

    # No __iter__ is defined: iter() falls back to the sequence protocol,
    # calling self.__getitem__ with 0, 1, 2, ... until IndexError is raised.

    def make_cpdex(self):
        """Return the code point indexes of the extended chars in self.text.

        Raises ValueError if a hi surrogate is not immediately followed by
        a lo surrogate (including when it is the last code unit), or if a
        lo surrogate appears without a preceding hi surrogate.
        """
        cpdex = []  # codepoint indexes of extended non_BMP chars
        i = 0  # codepoint index, not incremented for 2nd surrogates
        t_iter = iter(self.text)
        for c in t_iter:
            oc = ord(c)
            if 0xD800 <= oc <= 0xDBFF:  # first (hi) surrogate
                # Consume the 2nd surrogate here so the loop never sees it.
                # The None default keeps a trailing hi surrogate from
                # leaking StopIteration out of this method.
                follower = next(t_iter, None)
                if follower is not None and 0xDC00 <= ord(follower) <= 0xDFFF:
                    cpdex.append(i)
                else:
                    raise ValueError(
                        "hi surrogate not followed by lo surrogate")
            elif 0xDC00 <= oc <= 0xDFFF:
                raise ValueError("lo surrogate not preceded by hi surrogate")
            i += 1
        return cpdex

    def __eq__(self, other):  # temporary kludge
        """Compare with another UTF16 (text and cpdex) or with a plain str."""
        if type(other) is type(self):
            # testing cpdex is at least partly for internal testing
            return self.text == other.text and self.cpdex == other.cpdex
        # otherwise, it better be a (unicode) string
        return self.text == other

    def __len__(self):
        """Return the length in code points."""
        return self.len

    def __getitem__(self, cpi):
        """Return the code point, or slice of code points, selected by *cpi*.

        *cpi* is a codepoint index or a slice thereof.

        For an int: a BMP char is returned as a str of length 1; an
        extended char is returned as a UTF16 wrapping its surrogate pair,
        so its len() is 1 as well.  For a slice: a UTF16 is returned when
        the result contains extended chars, otherwise a plain str.

        Raises IndexError for an int out of range and TypeError for any
        other kind of index.
        """
        if isinstance(cpi, int):
            if cpi < 0:
                cpi += self.len
            if cpi < 0:
                raise IndexError("string index out of range")
            # Number of extended chars strictly before cpi; each one shifts
            # the code unit position by one.
            i = bisect_left(self.cpdex, cpi)
            cu = cpi + i
            if i < len(self.cpdex) and self.cpdex[i] == cpi:
                # indexing extended char
                return UTF16(self.text[cu:cu+2], [0])
            else:
                # indexing BMP char; raises IndexError if cpi is too large
                return self.text[cu]
            # alternative implementation of above block, also tested
            ## cu = cpi + bisect_left(self.cpdex, cpi)
            ## c = self.text[cu]  # will raise if cpi and hence cu is too large
            ## if 0xD800 <= ord(c) <= 0xDBFF:  # indexing extended char
            ##     return UTF16(self.text[cu:cu+2], [0])
            ## else:
            ##     return c
        elif isinstance(cpi, slice):
            start, stop, step = cpi.indices(self.len)
            if step != 1:
                # A stride over code units would split surrogate pairs, and
                # a negative stride's stop of -1 cannot be fed back into str
                # slicing, so select code point by code point instead.
                chars = [self[k] for k in range(start, stop, step)]
                ret = ''.join(str(ch) for ch in chars)
                if any(isinstance(ch, UTF16) for ch in chars):
                    ret = UTF16(ret)
                return ret
            delta = stop - start
            start += bisect_left(self.cpdex, start)
            stop += bisect_left(self.cpdex, stop)
            ret = self.text[start:stop]
            if stop - start > delta:  # slice contains extended chars
                ret = UTF16(ret)
                # could potentially compute ret.cpdex from self.cpdex
                # and bisect returns
            return ret
        else:
            raise TypeError("string indices must be integers, not "
                            + type(cpi).__name__)

    def __repr__(self):
        """Return the repr of the underlying code unit string."""
        return repr(self.text)

    def __str__(self):
        """Return the underlying code unit string."""
        return self.text


# Self-test.  The extended chars are spelled as explicit surrogate pairs
# (U+1043C, U+1042F, U+10445, U+10428) so that the data is a sequence of
# UTF-16 code units on every build, not only on narrow ones.
tucs2 = 'A\ud801\udc3cBC\ud801\udc2f\ud801\udc45DE\ud801\udc28H'
tutf16 = UTF16(tucs2)
tlist = ['A', '\ud801\udc3c', 'B', 'C', '\ud801\udc2f', '\ud801\udc45',
         'D', 'E', '\ud801\udc28', 'H']
assert len(tutf16) == len(tlist)
tlis2 = [tutf16[i] for i in range(len(tlist))]
for c in tlis2:
    assert len(c) == 1
assert tlis2 == tlist
assert list(tutf16) == tlist  # test iteration, which uses indexing
tlis3 = [tutf16[i] for i in range(-len(tlist), 0)]
assert tlis3 == tlist
assert tutf16[:3] == UTF16(tucs2[:4])
assert tutf16[1:5] == UTF16(tucs2[1:7])
assert tutf16[5:] == UTF16(tucs2[7:])
assert tutf16[2:4] == tucs2[3:5]