M64 = (1 << 64) - 1 def phash(obj, M=M64): # hash(obj) as uint64 return hash(obj) & M def spray(nbits, objs, *, used=None, shift=5): building = used is None nslots = 1 << nbits mask = nslots - 1 if building: used = [0] * nslots ns = [] for o in objs: n = 1 p = phash(o) i = p & mask while used[i]: n += 1 p >>= shift i = (5*i + p + 1) & mask if building: used[i] = 1 ns.append(n) return ns, used def strgen(i=1): while True: yield str(i) i += 1 def pstats(ns): import statistics print(f"min {min(ns)} " f"max {max(ns)} " f"mean {statistics.mean(ns):.2f} " f"median {statistics.median(ns):.2f} " f"psdev {statistics.pstdev(ns):.2f}") def drive(nbits): import itertools import math import sys nslots = 1 << nbits dlen = nslots * 2 // 3 assert (sys.getsizeof({i: i for i in range(dlen)}) < sys.getsizeof({i: i for i in range(dlen + 1)})) alpha = dlen / nslots # actual load factor of max dict print("bits", nbits, f"nslots {nslots:,} dlen {dlen:,} alpha {alpha:.2f}") print(f"theoretical avg probes for uniform hashing " f"when found {math.log(1/(1-alpha))/alpha:.2f} " f"not found {1/(1-alpha):.2f}") for shift in range(1, 65): print(" shift", shift) strs = strgen() objs = itertools.islice(strs, dlen) ns, used = spray(nbits, objs, shift=shift) print(" " * 8 + "found ", end="") pstats(ns) objs = itertools.islice(strs, 100_000) ns, used = spray(nbits, objs, used=used, shift=shift) print(" " * 8 + "fail ", end="") pstats(ns) drive(10) drive(20)