diff -r 9923b81a1d34 perf.py --- a/perf.py Wed Feb 03 14:19:18 2016 -0600 +++ b/perf.py Thu Feb 25 14:10:03 2016 +0200 @@ -2160,6 +2160,11 @@ def BM_regex_compile(base_python, change bm_path = "performance/bm_regex_compile.py" return RegexBenchmark(base_python, changed_python, options, bm_path) +@VersionRange() +def BM_regex_dna(base_python, changed_python, options): + bm_path = "performance/bm_regex_dna.py" + return RegexBenchmark(base_python, changed_python, options, bm_path) + def MeasureThreading(python, options, bm_name): """Test the performance of Python's threading support. @@ -2370,7 +2375,8 @@ BENCH_GROUPS = {"default": ["2to3", "cha "regex_v8", "json_dump_v2", "json_load"], "startup": ["normal_startup", "startup_nosite", "bzr_startup", "hg_startup"], - "regex": ["regex_v8", "regex_effbot", "regex_compile"], + "regex": ["regex_v8", "regex_effbot", "regex_compile", + "regex_dna"], "threading": ["threaded_count", "iterative_count"], "serialize": ["slowpickle", "slowunpickle", # Not for Python 3 "fastpickle", "fastunpickle", diff -r 9923b81a1d34 performance/bm_regex_dna.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/performance/bm_regex_dna.py Thu Feb 25 14:10:03 2016 +0200 @@ -0,0 +1,214 @@ +#!/usr/bin/env python + +# The Computer Language Benchmarks Game +# http://benchmarksgame.alioth.debian.org/ +# +# regex-dna Python 3 #5 program: +# contributed by Dominique Wahli +# 2to3 +# modified by Justin Peel +# +# fasta Python 3 #3 program: +# modified by Ian Osgood +# modified again by Heinrich Acker +# modified by Justin Peel +# Modified by Christopher Sean Forgeron + +# Python imports +import bisect +import optparse +import os.path +import re +import time + +# Local imports +import util +from compat import xrange + + +alu = ( + 'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG' + 'GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA' + 'CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT' + 'ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA' + 'GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG' + 'AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC' + 'AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA') + +iub = list(zip('acgtBDHKMNRSVWY', [0.27, 0.12, 0.12, 0.27] + [0.02] * 11)) + +homosapiens = [ + ('a', 0.3029549426680), + ('c', 0.1979883004921), + ('g', 0.1975473066391), + ('t', 0.3015094502008), +] + + +def make_cumulative(table): + P = [] + C = [] + prob = 0. + for char, p in table: + prob += p + P += [prob] + C += [ord(char)] + return (P, C) + + +def repeat_fasta(src, n, nprint): + width = 60 + + is_trailing_line = False + count_modifier = 0.0 + + len_of_src = len(src) + ss = src + src + src[:n % len_of_src] + # CSF - It's faster to work with a bytearray than a string + s = bytearray(ss, encoding='utf8') + + if n % width: + # We don't end on a 60 char wide line + is_trailing_line = True + count_modifier = 1.0 + + # CSF - Here we are stuck with using an int instead of a float for the loop, + # but testing showed it still to be faster than a for loop + count = 0 + end = (n / float(width)) - count_modifier + while count < end: + i = count*60 % len_of_src + nprint(s[i:i+60] + b'\n') + count += 1 + if is_trailing_line: + nprint(s[-(n % width):] + b'\n') + + +def random_fasta(table, n, seed, nprint): + width = 60 + r = range(width) + bb = bisect.bisect + + # If we don't have a multiple of the width, then we will have a trailing + # line, which needs a slightly different approach + is_trailing_line = False + count_modifier = 0.0 + + line = bytearray(width + 1) # Width of 60 + 1 for the \n char + + probs, chars = make_cumulative(table) + + # pRNG Vars + im = 139968.0 + #seed = 42.0 + + if n % width: + # We don't end on a 60 char wide line + is_trailing_line = True + count_modifier = 1.0 + + # CSF - Loops with a high iteration count run faster as a while/float loop. + count = 0.0 + end = (n / float(width)) - count_modifier + while count < end: + # CSF - Low iteration count loops may run faster as a for loop. + for i in r: + # CSF - Python is faster for all float math than it is for int, on my + # machine at least. + seed = (seed * 3877.0 + 29573.0) % 139968.0 + # CSF - While real values, not variables are faster for most things, on my + # machine, it's faster to have 'im' already in a var + line[i] = chars[bb(probs, seed / im)] + + line[60] = 10 # End of Line + nprint(line) + count += 1.0 + + if is_trailing_line: + for i in range(n % width): + seed = (seed * 3877.0 + 29573.0) % 139968.0 + line[i] = chars[bb(probs, seed / im)] + + nprint(line[:i+1] + b"\n") + + return seed + + +def init_benchmarks(n): + result = bytearray() + nprint = result.extend + nprint(b'>ONE Homo sapiens alu\n') + repeat_fasta(alu, n * 2, nprint=nprint) + + # We need to keep track of the state of 'seed' so we pass it in, and return + # it back so our output can pass the diff test + nprint(b'>TWO IUB ambiguity codes\n') + seed = random_fasta(iub, n * 3, seed=42.0, nprint=nprint) + + nprint(b'>THREE Homo sapiens frequency\n') + random_fasta(homosapiens, n * 5, seed, nprint=nprint) + + return bytes(result) + + +variants = ( + b'agggtaaa|tttaccct', + b'[cgt]gggtaaa|tttaccc[acg]', + b'a[act]ggtaaa|tttacc[agt]t', + b'ag[act]gtaaa|tttac[agt]ct', + b'agg[act]taaa|ttta[agt]cct', + b'aggg[acg]aaa|ttt[cgt]ccct', + b'agggt[cgt]aa|tt[acg]accct', + b'agggta[cgt]a|t[acg]taccct', + b'agggtaa[cgt]|[acg]ttaccct', +) + +subst = ( + (b'B', b'(c|g|t)'), (b'D', b'(a|g|t)'), (b'H', b'(a|c|t)'), + (b'K', b'(g|t)'), (b'M', b'(a|c)'), (b'N', b'(a|c|g|t)'), + (b'R', b'(a|g)'), (b'S', b'(c|g)'), (b'V', b'(a|c|g)'), + (b'W', b'(a|t)'), (b'Y', b'(c|t)'), +) + +def run_benchmarks(seq): + ilen = len(seq) + + seq = re.sub(b'>.*\n|\n', b'', seq) + clen = len(seq) + + results = [] + for f in variants: + results.append(len(re.findall(f, seq))) + + for f, r in subst: + seq = re.sub(f, r, seq) + + return results, ilen, clen, len(seq) + + +def test_regex_dna(iterations, timer): + seq = init_benchmarks(100000) + assert len(seq) == 1016745 + + # Warm up. + res = run_benchmarks(seq) + assert res == ([6, 26, 86, 58, 113, 31, 31, 32, 43], 1016745, 1000000, 1336326) + + times = [] + for i in xrange(iterations): + t0 = timer() + run_benchmarks(seq) + t1 = timer() + times.append(t1 - t0) + return times + + +if __name__ == '__main__': + parser = optparse.OptionParser( + usage="%prog [options]", + description=("Test the performance of regexps using benchmarks " + "from The Computer Language Benchmarks Game.")) + util.add_standard_options_to(parser) + options, args = parser.parse_args() + + util.run_benchmark(options, options.num_runs, test_regex_dna)