from itertools import product def tryone(tag, xs): import sys hashbits = sys.maxsize.bit_length() + 1 assert hashbits in (32, 64) nballs = len(xs) # When nballs is much less than nbins, the mean expected number # of collisions is closely approximated by # combinations(nballs, 2) / nbins. cn2 = nballs * (nballs - 1) / 2.0 hashes = list(map(hash, xs)) nbins = 1 << hashbits mean = cn2 / nbins collisions = nballs - len(set(hashes)) msg = f"{tag}; {hashbits}-bit hash codes; mean {mean:.2f} got {collisions:,}" print(msg) if hashbits > 32: mean = cn2 / 2**32 mask = (1 << 32) - 1 collisions = nballs - len(set(h & mask for h in hashes)) msg = f"{tag}; 32-bit lower hash codes; mean {mean:.2f} got {collisions:,}" print(msg) shift = hashbits - 32 collisions = nballs - len(set(h >> shift for h in hashes)) msg = f"{tag}; 32-bit upper hash codes; mean {mean:.2f} got {collisions:,}" print(msg) tryone("range(100) by 3", list(product(range(100), repeat=3))) cands = list(range(-10, -1)) + list(range(9)) tryone("-10 .. 8 by 4", list(product(cands, repeat=4))) del cands cands = list(range(-50, 51)) cands.remove(-1) # hashes to -2 tryone("-50 .. 50 less -1 by 3", list(product(cands, repeat=3))) del cands L = [n << 60 for n in range(100)] tryone("0..99 << 60 by 3", list(product(L, repeat=3))) del L tryone("[-3, 3] by 20", list(product([-3, 3], repeat=20))) tryone("[0.5, 0.25] by 20", list(product([0.5, 0.25], repeat=20))) if 1: def test_hash(): # See SF bug 942952: Weakness in tuple hash # The hash should: # be non-commutative # should spread-out closely spaced values # should not exhibit cancellation in tuples like (x,(x,y)) # should be distinct from element hashes: hash(x)!=hash((x,)) # This test exercises those cases. # For a pure random hash and N=50, the expected number of occupied # buckets when tossing 252,600 balls into 2**32 buckets # is 252,592.6, or about 7.4 expected collisions. The # standard deviation is 2.73. On a box with 64-bit hash # codes, no collisions are expected. Here we accept no # more than 15 collisions. Any worse and the hash function # is sorely suspect. N=50 base = list(range(N)) xp = [(i, j) for i in base for j in base] inps = base + [(i, j) for i in base for j in xp] + \ [(i, j) for i in xp for j in base] + xp + list(zip(base)) tryone("old tuple test", inps) test_hash() if 1: # https://github.com/python/cpython/pull/9534/files n = 5 A = [x for x in range(-n, n+1) if x != -1] B = A + [(a,) for a in A] L2 = [(a,b) for a in A for b in A] L3 = L2 + [(a,b,c) for a in A for b in A for c in A] L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A] # T = list of testcases. These consist of all (possibly nested # at most 2 levels deep) tuples containing at most 4 items from # the set A. T = A T += [(a,) for a in B + L4] T += [(a,b) for a in L3 for b in B] T += [(a,b) for a in L2 for b in L2] T += [(a,b) for a in B for b in L3] T += [(a,b,c) for a in B for b in B for c in L2] T += [(a,b,c) for a in B for b in L2 for c in B] T += [(a,b,c) for a in L2 for b in B for c in B] T += [(a,b,c,d) for a in B for b in B for c in B for d in B] assert len(T) == 345130 tryone("new tuple test", T) ## self.assertLessEqual(collisions, 20)