diff -r ca78b9449e04 Lib/random.py --- a/Lib/random.py Thu Jul 16 00:24:48 2015 -0500 +++ b/Lib/random.py Thu Jul 16 22:22:15 2015 +0300 @@ -351,6 +351,84 @@ class Random(_random.Random): return self.sample(tuple(population), k) return result + def sample2(self, population, k): + n = len(population) + if not 0 <= k <= n: + raise ValueError("sample larger than population") + random = self.random + _int = int + result = [None] * k + setsize = 21 # size of a small set minus size of an empty list + if k > 5: + setsize += 4 ** _ceil(_log(k * 3, 4)) # table size for big sets + if n <= setsize or hasattr(population, "keys"): + # An n-length list is smaller than a k-length set, or this is a + # mapping type so the other algorithm wouldn't work. + pool = list(population) + for i in xrange(k): # invariant: non-selected at [0,n-i) + limit = n - i + j = _int(random() * limit) + if j == limit: + j = limit - 1 + result[i] = pool[j] + pool[j] = pool[limit-1] # move non-selected item into vacancy + else: + try: + selected = set() + selected_add = selected.add + for i in xrange(k): + j = _int(random() * n) + if j == n: + j = n - 1 + while j in selected: + j = _int(random() * n) + if j == n: + j = n - 1 + selected_add(j) + result[i] = population[j] + except (TypeError, KeyError): # handle (at least) sets + if isinstance(population, list): + raise + return self.sample(tuple(population), k) + return result + + def sample3(self, population, k): + n = len(population) + if not 0 <= k <= n: + raise ValueError("sample larger than population") + if not n: + return [] + random = self.random + _int = int + result = [None] * k + setsize = 21 # size of a small set minus size of an empty list + if k > 5: + setsize += 4 ** _ceil(_log(k * 3, 4)) # table size for big sets + if n <= setsize or hasattr(population, "keys"): + # An n-length list is smaller than a k-length set, or this is a + # mapping type so the other algorithm wouldn't work. + pool = list(population) + pool.append(pool[-1]) + for i in xrange(k): # invariant: non-selected at [0,n-i) + j = _int(random() * (n-i)) + result[i] = pool[j] + pool[j] = pool[n-i-1] # move non-selected item into vacancy + else: + try: + selected = {n} + selected_add = selected.add + for i in xrange(k): + j = _int(random() * n) + while j in selected: + j = _int(random() * n) + selected_add(j) + result[i] = population[j] + except (TypeError, KeyError): # handle (at least) sets + if isinstance(population, list): + raise + return self.sample(tuple(population), k) + return result + ## -------------------- real-valued distributions ------------------- ## -------------------- uniform distribution ------------------- $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample; k = 10; p = list(range(k))' -- 'sample(p, k)' 100000 loops, best of 11: 13.9 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample2; k = 10; p = list(range(k))' -- 'sample(p, k)' 100000 loops, best of 11: 14.1 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample3; k = 10; p = list(range(k))' -- 'sample(p, k)' 100000 loops, best of 11: 14.1 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample; k = 100; p = list(range(k))' -- 'sample(p, k)' 10000 loops, best of 11: 94.4 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample2; k = 100; p = list(range(k))' -- 'sample(p, k)' 10000 loops, best of 11: 98.1 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample3; k = 100; p = list(range(k))' -- 'sample(p, k)' 10000 loops, best of 11: 97.1 usec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample; k = 10000; p = list(range(k))' -- 'sample(p, k)' 100 loops, best of 11: 9.38 msec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample2; k = 10000; p = list(range(k))' -- 'sample(p, k)' 100 loops, best of 11: 9.73 msec per loop $ ./python -m timeit -r11 -s 'import random; sample = random.Random(12345).sample3; k = 10000; p = list(range(k))' -- 'sample(p, k)' 100 loops, best of 11: 9.42 msec per loop