#!python3
"""Investigation of how useful it is to optimize Python powers.
See longobject.c, lines 4245:
        /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
        /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
        for (i = Py_SIZE(b) - 1; i >= 0; --i) {
            digit bi = b->ob_digit[i];

            for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) {
                MULT(z, z, z);
                if (bi & j)
                    MULT(z, a, z);

and for the 5-ary (which is actually 32-ary);
note that each digit contains 15 or 30 bits of a number.
        /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */
        Py_INCREF(z);           /* still holds 1L */
        table[0] = z;
        for (i = 1; i < 32; ++i)
            MULT(table[i-1], a, table[i]);

        for (i = Py_SIZE(b) - 1; i >= 0; --i) {
            const digit bi = b->ob_digit[i];

            for (j = PyLong_SHIFT - 5; j >= 0; j -= 5) {
                const int index = (bi >> j) & 0x1f;
                for (k = 0; k < 5; ++k)
                    MULT(z, z, z);
                if (index)
                    MULT(z, table[index], z);
            }
        }
"""
# for phone ----------------------------------------
import os
try:
    os.chdir(os.path.dirname(__file__) or '.')
except NameError: pass

import re
from collections import namedtuple, defaultdict, Counter
from random import randrange, getrandbits, sample
from operator import methodcaller, itemgetter, add
from itertools import chain, product,islice
from functools import partial

# number of multiplications, squares and "empty" squares (of 1)
_Mults = namedtuple("_Mults", "squares mults empty chunkSize")
class Mults(namedtuple("Mults", "squares mults empty chunkSize")):
    """Make sensible ordering in Mults"""
    def __int__(self): return self.mults + self.squares
    def __eq__(self, other): return int(self) == int(other)
    def __lt__(self, other): return int(self) < int(other)
    def __gt__(self, other): return int(self) > int(other)
    def __le__(self, other): return int(self) <= int(other)
    def __ge__(self, other): return int(self) >= int(other)
    def __ne__(self, other): return int(self) != int(other)

# auxiliary functions ----------------------------------------
def digits(n, chunkSize=30):
    """Split n into 'big digits' of chunkSize bits each
    >>> digits(10000, 3)
    [0, 2, 4, 3, 2]
    """
    mask = (1 << chunkSize) - 1
    return [n >> i & mask
            for i in range(0, n.bit_length(), chunkSize)]

def hamming(n):
    return bin(n).count('1')

# original functions ----------------------------------------
def mul_bin(n, ws=30):
    """Compute number of multiplications using basic binary method
    As close to python code as possible
    counts squares, multiplications, empty squares
    >>> mul_bin(10000, 8)
    Mults(squares=13, mults=5, empty=3, chunkSize=None)
    """
    oneSquares = squares = mults = 0
    resultPower = 0
    for digit in reversed(digits(n, ws)):
        for bitPos in range(ws - 1, -1, -1):
            if resultPower: squares += 1
            else: oneSquares += 1
            resultPower *= 2
            if digit >> bitPos & 1:
                mults += 1
                resultPower += 1
    if resultPower != n: raise AssertionError
    return Mults(squares, mults, oneSquares, None)

def est_mul_bin(n, ws=30):
    """Compute counts of mul_bin directly
    >>> est_mul_bin(10000, 8)
    Mults(squares=13, mults=5, empty=3, chunkSize=None)
    """
    if not n: return Mults(0, 0, 0, None)
    bl = n.bit_length()
    return Mults(bl - 1, hamming(n), -bl % ws + 1, None)

def mulP(n, ws=30, chunkSize=5):
    """Compute number of multiplications with chunkSize bits at a time
    As close to python code as possible
    counts squares, multiplications, empty squares
    >>> mulP(10000, 8, 4)
    Mults(squares=13, mults=17, empty=4, chunkSize=4)
    """
    # make table
    if ws % chunkSize: raise AssertionError
    mask = (1 << chunkSize) - 1
    oneSquares, squares, mults = 0, 1, (1 << chunkSize) - 2
    resultPower = 0
    for digit in reversed(digits(n, ws)):
        for bitPos in range(ws - chunkSize, -chunkSize, -chunkSize):
            if resultPower: squares += chunkSize
            else: oneSquares += chunkSize
            resultPower <<= chunkSize
            d = digit >> bitPos & mask
            if d:
                mults += 1
                resultPower += d
    if resultPower != n: raise AssertionError
    return Mults(squares, mults, oneSquares, chunkSize)

def est_mulP(n, ws=30, chunkSize=5):
    """Compute counts of mulP directly
    >>> est_mulP(10000, 8, 4)
    Mults(squares=13, mults=17, empty=4, chunkSize=4)
    """
    if not n: return Mults(1, (1 << chunkSize) - 2, 0, chunkSize)
    bl = n.bit_length()
    dg = digits(n, chunkSize)
    return Mults((len(dg) - 1) * chunkSize + 1,
                 sum(map(bool, dg)) + (1 << chunkSize) - 2,
                 (chunkSize - 1 - len(dg) * chunkSize) % ws + 1,
                 chunkSize)

# proposed function --------------------------------------------
"""C:
static const unsigned char[8] CHUNKSIZE5TO8 = {2,3,2,3,2,3,2,2};
static const unsigned char[8] MAXLENGTHFOR4 = {
static const unsigned char[8] MAXLENGTHFOR5 = {
int goodChunkSize(PyLongPtr* n)
{
    int length = n -> length;
    if (length == 1) {
        firstDigit = n -> digits [0];
        if (firstDigit < 16)
            return (2);
    } else {
        firstDigit = n[-1]<<ws|n[-1]
    firstDigitLength = firstDigit.bit_length()
    prefix = firstDigit > firstDigitLength - 4
    if length == 1:
        if firstDigitLength < 8:
            return [2, 2, 3, 3, 2, 2, 3, 2][prefix - 8]
        moreDigits = 0
    else:
        moreDigits = length - 2
    fullLength = firstDigitLength + moreDigits * ws
    if moreDigits:
        <* h2 free length checks *>
    h2 = (firstDigit >> 2).bit_count()
    switch (prefix) {
    case 8:
    case 12:
        if (fullLength <= 14) return 2;
        if (fullLength <= 14) return 2;
        return ...
    case 9:
        return ...
    case 10:
    case 14:
        return 3;
    case 11:
        return ...
    case 13:
        return...
    case 15:
        return ...
    }
    
"""
def otfR(n, ws=30, chunkSize=2, special=False):
    """On the fly powering.
    Using a table of powers, this routine multplies multiple bits
    at a time.
    Optimizations:
      - only store odd numbers in the table, and search for the locations to
        apply the power.
      - do not compute table entries until needed
      - use the square that is needed to construct the table where useful
      - special case for numbers that start with 1001
    Results:
    chunkSize = 2 is never worse than binary (!)
    chunkSize = 3 costs at most 1 (8% of numbers <100, 1.4% under a million)
    The best value for the chunkSize parameter is computed by otfR_good
    >>> otfR(27, 8, 2)
    Mults(squares=4, mults=2, empty=0, chunkSize=2)
    """
    if n < 2: return Mults(0, 0, 0, chunkSize)
    squares, mults, maxTable = 1, 0, 1
    def tableFetch(p):
        nonlocal maxTable, mults
        assert p & 1
        while maxTable < p:
            mults += 1
            maxTable += 2
    resultPower = 0
    myDigits = list(digits(n, ws))
    if len(myDigits) > 1:
        myDigits[-2] |= myDigits[-1] << ws
        del myDigits[-1]
    for digit in reversed(myDigits):
        resultShift = ws
        while digit:
            # compute active position
            bitPos = max(digit.bit_length(), chunkSize) - chunkSize
            head = digit >> bitPos
            if not resultPower:
                # start here: first part of first digit
                if head == 2:
                    pass # from variable
                elif head == 4 and bitPos > 1 and digit >> bitPos - 1 == 9:
                    # head can be extended to 9 (we know chunkSize==3 now)
                    bitPos -= 1
                    head = 9
                    tableFetch(head)
                else:
                    # make digit >> bitPos odd by moving to the left
                    bitPos += (head ^ head - 1).bit_length() - 1
                    head = digit >> bitPos
                    if head == 1 and bitPos and digit >> bitPos - 1 == 2:
                        # we can move to the right for free from the square
                        bitPos -= 1
                        head = 2
                    else:
                        tableFetch(head)
            else:
                # make digit >> bitPos odd
                bitPos += (head ^ head - 1).bit_length() - 1
                head = digit >> bitPos
                # shift to bitPos, multiply with table entry
                squares += resultShift - bitPos
                resultPower <<= resultShift - bitPos
                tableFetch(head)
                mults += 1
            resultShift = bitPos
            resultPower += head
            # remove chunk
            digit &= (1 << resultShift) - 1
        squares += resultShift
        resultPower <<= resultShift
    if resultPower != n: raise AssertionError
    return Mults(squares, mults, 0, chunkSize)

def goodChunksize(n):
    """Determine a very good chunk size for n.
    Roughly, using chunk size n costs 1<<n-1 multiplications to build the table
    and n+2 multiplications per n+1 bits of exponent.
    It is easy to verify that the crossover points are roughly:
    2-3: 2*3*4 = 24 bits
    3-4: 4*4*5 = 80 bits
    4-5: 8*5*6 = 240 bits
    5-6: 16*6*7 = 672 bits
    If fact, for shorter numbers it depends heavily on the first few bits of
    the exponent. So we look at the first four, and use this to make a better
    guess. This saves a fraction of a multiplication on average, which still
    makes it worth the effort spendind a few instructions.
    """
    # let's do the easy to see cases first
    length = n.bit_length()
    if length < 5: return 2
    # first four bits of n to optimize the boundaries
    prefix = n >> length - 4
    if length > 100:
        if length > [647, 669, 657, 671, 647, 670, 657, 671][prefix - 8]: return 6
        if length > [197, 230, 215, 230, 197, 230, 214, 231][prefix - 8]: return 5
        return 4
    # now n has between 8 and 100 bits
    # the optimal chunksize depends heavily on bit pattern subtleties in n
    # we use the two values below to get an idea on the best approach
    if length < 8:
        return [2, 2, 3, 3, 2, 2, 3, 2][prefix - 8]
    # number of times there are two ones with distance 2 apart in n
    # note this is at least 1 for prefix 10,11,13,14 and at least two for 15
    h2 = hamming(n & n >> 2)
    # with these two numbers, we can combine a value for chunksize
    # that is (in average) a fraction of a multiplication from optimal
    # for lengths up to 17, it is optimal
    if prefix == 8 or prefix == 12:
        if length <= 14: return 2
        if length > 83: return 4
        return 2 if h2 < 4 + 25 // (length - 14) else 3
    if prefix == 9:
        # this is the most complicated one
        if h2 < (31 - length) // 8: return 2
        if length >= 60: return 4
        if h2 < 3 + 110 // (64 - length) >> 1: return 4
        return 3
    if prefix == 10:
        return 3 if length + h2 < 105 else 4
    if prefix == 11:
        # return 3 if length < 46 else 4
        # there's a very complicated region around 45 to be handled here
        return 3 if length + (h2 - 18) ** 2 // 19 < 50 else 4
        # this saves .08 multiplications for numbers of 46 bits
    if prefix == 13:
        if length <= 14 or h2 < 9 + 40 // (length - 14) >> 1:
            return 2
        return 3 if length < 28 else 4
    if prefix == 14:
        return 3 if length < 84 else 4
    if prefix == 15:
        # the complicated region around 57 bits costs about .01 multiplication:
        # we leave that since there's no benefit anymore
        return 2 if h2 < 4 else (3 if length + h2 < 71 else 4)
    raise ValueError

def fastChunksize(n):
    """Determine a very good chunk size for n,
    but don't look further than the first two digits of n.
    Roughly, using chunk size n costs 1<<n-1 multiplications to build the table
    and n+2 multiplications per n+1 bits of exponent.
    It is easy to verify that the crossover points are roughly:
    2-3: 2*3*4 = 24 bits
    3-4: 4*4*5 = 80 bits
    4-5: 8*5*6 = 240 bits
    5-6: 16*6*7 = 672 bits
    If fact, for shorter numbers it depends heavily on the first few bits of
    the exponent. So we look at the first four, and use this to make a better
    guess. This saves a fraction of a multiplication on average, which still
    makes it worth the effort spending a few instructions.
    """
    #TODO: for 10,12,14,15 we have to be careful for the big numbers
    # let's do the easy to see cases first
    length = n.bit_length()
    if length <= 4: return 2
    # first four bits of n to optimize the boundaries
    prefix = n >> length - 4
    # very short numbers
    if length < 8:
        return [2, 2, 3, 3, 2, 2, 3, 2][prefix - 8]
    if length > 60:
    # we don't want to compute h2 for such long numbers
        # for odd prefix we don't need 3
        if length < 84 and ~prefix & 1: return 3
        if length < [198, 231, 216, 231, 198, 231, 215, 232][prefix - 8]: return 4
        # in the C code we compare length >> 2
        if length < [648, 670, 658, 672, 648, 671, 658, 672][prefix - 8]: return 5
        return 6
    # now n has between 8 and 60 bits
    # the optimal chunksize depends heavily on bit pattern subtleties in n
    # we use the h2 below to get an idea on the best approach
    # number of times there are two ones with distance 2 apart in n
    # note this is at least 1 for prefix 10,11,13,14 and at least two for 15
    h2 = hamming(n & n >> 2) if length <= 60 else length // 4
    # with length  prefix and h2, we can combine a value for chunksize
    # that is (in average) a fraction of a multiplication from optimal
    # for lengths up to 17, it is optimal
    if prefix == 8 or prefix == 12:
        if length <= 14: return 2
        if length > 83: return 4
        return 2 if h2 < 4 + 25 // (length - 14) else 3
    if prefix == 9:
        # this is the most complicated one
        if h2 < (31 - length) // 8: return 2
        if h2 < 3 + 110 // (64 - length) >> 1: return 4
        return 3
    if prefix == 10 or prefix == 14:
        return 3
    if prefix == 11:
        # return 3 if length < 46 else 4
        # there's a very complicated region around 45 to be handled here
        return 3 if length + (h2 - 18) ** 2 // 19 < 50 else 4
        # this saves .08 multiplications for numbers of 46 bits
    if prefix == 13:
        if length <= 14 or h2 < 9 + 40 // (length - 14) >> 1: return 2
        return 3 if length < 28 else 4
    if prefix == 15:
        # the complicated region around 57 bits costs about .01 multiplication:
        # we leave that since there's no benefit anymore
        return 2 if h2 < 4 else (3 if length + h2 < 71 else 4)
    raise ValueError

#goodChunksize = fastChunksize

def otfR_good(n, ws=30):
    """This optimal choice for chunk size gives pretty nice results.
    Here's a comparison with respect to optimal parameter choice.
    Method  Up to 10**3   10**4    10**5     10**6      10**7       10**8
    Binary        12910  178226  2283954  27836418  327657410  3780229378
    Optimal       11032  154298  1970878  23957755  282553573
    Good          11039  154553  1974797  24024764  283496827  3254408073
    Difference   -14.5%  -13.3%   -13.5%    -13.7%     -13.5%      -13.9%
    """
    return otfR(n, 32, goodChunksize(n))

# analytic stuff -------------------------------------------
def otfR_optimal(n, ws=30):
    return min((otfR(n, ws, k) for k in range(2, 6)), key=int)

def chooseBigChunk():
    """Give statistical information for very big numbers
    Apparently, we get something like this:
    - 30: what otfR_good says
    30-70: 4 appears to work nicely for prefixes 13, 9, 11, 15 too
    70-250: most prefer 4, some 5
    250-: all numbers use 5
    750: all numbers use 66
    """
    for p in range(100):
        if p % 10 == 0:
            print("length  8  9 10 11 12 13 14 15   h2")
        length = int(20 * 1.05 ** p)
        n = getrandbits(length - 1) | 1 << length - 1
        mults = [int(otfR(n, 30, k)) for k in range(2, 7)]
        opt = min(mults)
        opts = [i for i,m in enumerate(mults, start=2) if m==opt]
        prefix = n >> length - 4
        mask = (1 << length) - 1
        h2 = hamming(mask & n & n >> 2)
        print(f"{length:4d}:", "   " * (prefix - 8),
            str(min(opts))+str(max(opts)), "   " * (15 - prefix), h2)

def showSummary(m=9):
    """Print the text in the header of otfR_good
    """
    print(end="Method To  ")
    print(*(" " * (d-2) + f"10**{d:d}" for d in range(3, m)))
    binary = [sum(int(est_mul_bin(n, 30)) for n in range(10 ** d))
              for d in range(3, m)]
    print(end="Binary     ")
    print(*(f"{binary[d-3]:{d+3}d}" for d in range(3, m)))
    optimal = [sum(int(otfR_optimal(n, 30)) for n in range(10 ** d))
              for d in range(3, m-1)]
    print(end="Optimal    ")
    print(*(f"{optimal[d-3]:{d+3}d}" for d in range(3, m-1)))
    good = [sum(int(otfR_good(n, 30)) for n in range(10 ** d))
              for d in range(3, m)]
    print(end="Good       ")
    print(*(f"{good[d-3]:{d+3}d}" for d in range(3, m)))
    print(end="Difference ")
    print(*(f"{(good[d-3]-binary[d-3])/binary[d-3]:{d+3}.1%}"
            for d in range(3, m)))

# If you look at the output of this routine, you see the patterns
# that allow to guess the optimal chunk size
# for more, see showOverviewOutput.txt
def showOverview(margin=.01, ws=30, lengths=None, prefixes=None, verbose=False):
    """Shows a very concise overview of te best choices to make for
    the otfR parameter.
    Parameters: accuracy margin (take e.g. .001 for high accuracy (slow!)
    ws: word size of Python's integers
    lengths: iterable of bit lengths: default 6 to 100 in increasing steps
    prefixs: iterable of prefixes: default is all
    Lines are like this:
    [22,9:2<1;3>4]24 4 4 4 4 34 3 s 3 3 3 3 3 3 3 3 3
      | |  |   |  ^ choices for chunkSize for h2=0 and up that give value
      | |  |   |    that uses almost optimal multiplications
      | |  |   |    e.g. 34 means that for h2=5, both s and 4 are almost optimal
      | |  |   ^ if h2>4, you can safely use s (! means: use always)
      | |  ^ if h2<1, you can safely use 2
      | ^ prefix (first four bits of number in hex)
      ^ length of number in bits
    If the line is near optimal for 2 (prefix 89cd) or 3 (prefix abef)
    irrespective of h2, is it omitted.
    Also notice the wordsize artifacts for numbers just above 30, 60.
    Lower margin is more accurate, but slower of course
    """
    samplesize = int(1 / margin**2)
    if prefixes is None: prefixes = range(8, 16)
    if lengths is None: lengths = chain(
            range(6,35),
            range(35, 65, 2),
            range(65, 100, 5))
    for length in lengths:
        for prefix in prefixes:
            m2 = defaultdict(int)
            m3 = defaultdict(int)
            m4 = defaultdict(int)
            m5 = defaultdict(int)
            ns = range(prefix << length-4, prefix+1 << length-4)
            if samplesize >> length - 4: pass
            elif length < 30: ns = sample(ns, samplesize)
            else: ns = islice(iter(
                lambda:getrandbits(length-4)|prefix<<length-4,
                None),samplesize)
            for n in ns:
                h2 = hamming(n&n>>2)
                m2[h2] += int(otfR(n,ws,2))
                m3[h2] += int(otfR(n,ws,3))
                m4[h2] += int(otfR(n,ws,4))
                m5[h2] += int(otfR(n,ws,5))
            line = []
            irrelevant = sum(m2.values()) * margin
            for k in range(max(m2)+1):
                m = m2[k],m3[k],m4[k],m5[k]
                b = min(m) * (margin + 1)
                line.append((''.join(
                    str(i)
                    for i,v in enumerate(m, start=2)
                    if irrelevant <= v <= b)) or '-')
            if not verbose:
                prediction = str((prefix >> 1 & 1) + 2)
                if all(prediction in w or w=='-' for w in line): continue
            while line[-1] == '-': del line[-1]
            head = ""
            try:
                non2 = min(i
                           for i,w in enumerate(line)
                           if "2" not in w and w != '-')
                head += f"2<{non2}"
            except ValueError: head += "2!"
            try:
                hi3 = max(i
                          for i,w in enumerate(line)
                          if "3" not in w and w != '-')
                head += f";3>{hi3}"
            except ValueError: head += ";3!"
            try:
                hi4 = max(i
                          for i,w in enumerate(line)
                          if "4" not in w and w != '-')
                if hi4 < len(line) - 1: head += f";4>{hi4}"
            except ValueError: head += ";4!"
            line = ']' + ' '.join(line)
            if prefix == 9: line = line.replace("3", "s")
            print(f"[{length},{prefix:x}:" + head + line)
        print()
        #if length>15: input("?")

def showOverview2(margin=.01, ws=30, lengths=None, prefixes=None):
    """Checks out if goodChunksize works.
    Parameters: accuracy margin (take e.g. .001 for high accuracy (slow!)
    ws: word size of Python's integers
    lengths: iterable of bit lengths: default 6 to 100 in increasing steps
    prefixs: iterable of prefixes: default is all
    [22,9:0.100]24 4 4 4 4 s4 s s s s s s s s s s s
      ^ ^  ^    ^ choices for chunkSize for h2=0 and up that give value
      | |  |      that uses almost optimal multiplications
      | |  |      e.g. 34 means that for h2=5, both s and 4 are almost optimal
      | |  ^  number of multiplications lost per call to goodChunksize
      | ^ prefix (first four bits of number in hex)
      ^ length of number in bits
    """
    if prefixes is None: prefixes = range(8, 16)
    if lengths is None: lengths = chain(
            range(6,35),
            range(35, 65, 2),
            range(65, 100, 5))
    totalLoss = 0
    samplesize = int(1 / margin**2)
    for length in lengths:
        for prefix in prefixes:
            m2 = defaultdict(int)
            m3 = defaultdict(int)
            m4 = defaultdict(int)
            m5 = defaultdict(int)
            m6 = defaultdict(int)
            choice = {}
            ns = range(prefix << length-4, prefix+1 << length-4)
            if samplesize >> length - 4: pass
            elif length < 30: ns = sample(ns, samplesize)
            else: ns = islice(iter(
                lambda:getrandbits(length-4)|prefix<<length-4,
                None),samplesize)
            samples = 0
            for n in ns:
                samples += 1
                h2 = hamming(n&n>>2)
                m2[h2] += int(otfR(n,ws,2))
                m3[h2] += int(otfR(n,ws,3))
                m4[h2] += int(otfR(n,ws,4))
                m5[h2] += int(otfR(n,ws,5))
                m6[h2] += int(otfR(n,ws,6))
                if h2 not in choice: choice[h2] = goodChunksize(n)
            line = []
            irrelevant = sum(m2.values()) * margin
            loss = 0
            for k in range(max(m2)+1):
                m = m2[k],m3[k],m4[k],m5[k],m6[k]
                b = min(m) * (margin + 1)
                line.append((''.join(
                    str(i)
                    for i,v in enumerate(m, start=2)
                    if irrelevant <= v <= b)) or '-')
                if k in choice:
                    loss += m[choice[k] - 2] - min(m)
            while line and line[-1] == '-': del line[-1]
            if line[:3] == ['-', '-', '-']:
                firstNonDash = 0
                while line[firstNonDash] == '-': firstNonDash += 1
                line[:firstNonDash] = [str(firstNonDash) + '*-']
            line = ']' + ' '.join(line)
            print(f"[{length},{prefix:x}:{loss/samples:.3f}"  + line)
            totalLoss += loss/samples
        if len(prefixes) != 1: print()
    return totalLoss

# old stuff, no longer used -------------------------------------
def genWithDoubles(length, doubles, includeZero=True):
    """generate binary tuples up to length m
    with given number of consecutive ones
    overlap is allowed
    set includeZero to False if you want to start with a 1
    >>> for t in genWithDoubles(5, 2):
    ...    print(*t)
    0 0 1 1 1
    0 1 1 1 0
    1 0 1 1 1
    1 1 0 1 1
    1 1 1 0 0
    1 1 1 0 1
    """
    lenMdbl = length - doubles
    if doubles < 0 or lenMdbl < bool(length):
        raise ValueError(length, doubles)
    if length < 3:
        table = {
            (0,0): ((),),
            (1,0): ((0,), (1,)),
            (2,0): ((0,0), (0,1), (1,0)),
            (2,1): ((1,1),),
        }
        if not includeZero:
            table[1,0] = ()
            table[2,0] = (1,0),
        yield from table[length, doubles]
        return
    # only one way possible: all ones
    if lenMdbl == 1:  
        yield (1,) * length
        return
    # we know 0 <= doubles and lenMdbl > 1
    # put a 0 in front
    if includeZero:
        yield from map(
            partial(add, (0,)),
            genWithDoubles(length - 1, doubles))
    # put 1..1 0 in front
    if doubles and lenMdbl > 2:
        for i in range(doubles):
            yield from map(
                partial(add, (1,) * (i + 1) + (0,)),
                genWithDoubles(length - i - 2, doubles - i))
    # put, 1..1 0 in front, no doubles left
    yield from map(
        partial(add, (1,) * (doubles + 1) + (0,)),
        genWithDoubles(lenMdbl - 2, 0))

def genH2numbers(length, doubles, prefix):
    """generate numbers of length bits
    with given value for hamming(n & n >> 2)
    and four bit prefix (n >> n.bit_length() - 4)
    I wrote very complicated and nice code for this, but it is not faster, alas.
    """
    mask = (1 << length - 4) - 1
    if length < 30:
        biglist = range(prefix << length - 4, prefix + 1 << length - 4)
        if len(biglist) > 1000000:
            biglist = sample(biglist, 1000000)
    else:
        biglist = iter(
            lambda:getrandbits(length - 4) | prefix << length - 4,
            None)
    return islice(
        (n
         for n in biglist
         if hamming(mask & n & n >> 2) == doubles),
        10000)

def genWithInitial(length, doubles, prefix):
    """Like genWithDoubles, but given the initial few bits.
    """
    if not prefix:
        return genWithDoubles(length, doubles)
    # adjust doubles for the doubles we already have
    initialPatValue = int(''.join(map(str, prefix)), 2)
    doubles -= hamming(initialPatValue & initialPatValue >> 1)
    # compute new length
    newLength = length - len(prefix)
    # allow all results for now; see below
    includeZero = True
    if prefix[-1]:
        # to make sure the pairs are counted correctly
        # we let the generator fix the initial bit to 1
        # and clip it from the prefix
        newLength += 1
        includeZero = False
        prefix = prefix[:-1]
    # check if there is any result to be expected
    if not 0 <= doubles < newLength + (newLength==0):
        # new length is too short; there is no solution
        return ()
    # now build result
    return map(partial(add, prefix),
                       genWithDoubles(newLength,
                                      doubles,
                                      includeZero))

def genH2numbers(length, doubles, prefix):
    """generate numbers of length bits
    with given value for hamming(n & n >> 2)
    and four bit prefix (n >> n.bit_length() - 4)
    Not the most readable code, but it works.
    >>> successes = 0
    >>> for length in range(4, 12):
    ...    mask = (1 << length - 4) - 1
    ...    for prefix in range(8, 16):
    ...      for doubles in range(length - 4 >> 1):
    ...        s = set(
    ...            n
    ...            for n in range(prefix << length - 4, prefix + 1 << length - 4)
    ...            if hamming(mask & n & n >> 2) == doubles)
    ...        if s ^ set(genH2numbers(length, doubles, prefix)):
    ...           print(length, doubles, prefix)
    ...        else: successes += 1
    >>> successes
    96
    """
    finalPrefix = prefix >> 2 << length - 2
    # Generate even and odd bits separately, so that
    # the "1.1" pattern turns into consecutive bits
    # the first bit of the pattern is given by prefix.
    patLen = (length - 3) // 2 + 1
    # determine upper bits of patterns, determined by prefix
    if length & 1:
        # make sure the even generator produces an extra bit
        # so we can zip later
        evenHiBits = (prefix >> 2 & 1, prefix & 1)
        oddHiBits = (1,) if prefix & 2 else (0,)
    else:
        evenHiBits = (1,) if prefix & 2 else (0,)
        oddHiBits = (1,) if prefix & 1 else (0,)
    # Distribute the doubles over both parts in all ways
    maxDoubles = patLen - 1
    for evenDoubles in range(max(0, doubles - maxDoubles),
                             min(doubles, maxDoubles) + 1):
        # there's an extra double in evenHiBits that doesn't count
        evenGen = genWithInitial(patLen,
                                 evenDoubles + (evenHiBits == (1,1)),
                                 evenHiBits)
        oddGen = genWithInitial(patLen, doubles - evenDoubles, oddHiBits)
        for even, odd in product(evenGen, oddGen):
            # weave the bits, build a string, convert to binary
            yield int(
                ''.join(map(
                    str,
                    chain.from_iterable(zip(even, odd))
                    )),
                2) | finalPrefix

def mulJ(n, ws=30, chunkSize=5):
    """My way of doing chunkSize bits at a time
    Bit pattern: 00100111 00010000
          sqr mult oneSquare
    table  1    3     0
    001    0    1     3
    00111  5    1     0
    0001   4    1     0
    0000   4    0     0
    >>> mulJ(10000, 8, 3)
    Mults(squares=12, mults=6, empty=0, chunkSize=3)
    """
    # make table
    squares, mults = 0, (1 << chunkSize - 1) - 1
    if chunkSize > 1: squares += 1
    resultPower = 0
    for digit in reversed(digits(n, ws)):
        resultShift = ws
        while digit:
            # compute active position
            bitPos = max(digit.bit_length(), chunkSize) - chunkSize
            if not resultPower:
                # first part of first digit
                resultPower = digit >> bitPos
                if resultPower & 1:
                    pass # from table
                elif resultPower == 2:
                    pass # from variable
                else:
                    mults += 1 # from previous table entry
                resultShift = bitPos
            else:
                # make digit >> bitPos odd
                while digit >> bitPos & 1 == 0:
                    bitPos += 1
                # shift to bitPos, multiply with table entry
                squares += resultShift - bitPos
                resultPower <<= resultShift - bitPos
                resultShift = bitPos
                mults += 1
                resultPower += digit >> resultShift
            digit &= (1 << resultShift) - 1
        squares += resultShift
        resultPower <<= resultShift
    if resultPower != n: raise AssertionError
    return Mults(squares, mults, 0, chunkSize)

def est_mulJ(n, ws=30, chunkSize=5):
    """Estimate output of mulJ
    >>> est_mulJ(10000, 8, 3)
    Mults(squares=12, mults=6, empty=0, chunkSize=3)
    """
    squares, mults = (chunkSize>1), (1 << chunkSize - 1) - 1
    if not n: return Mults(squares, mults, 0, chunkSize)
    dg = digits(n, ws)
    pat = "0*(.{,%d}1)" % (chunkSize - 1)
    mults  += sum(
        len(re.findall(pat, format(digit, 'b')))
        for digit in dg)
    squares += ws * (len(dg) - 1)
    first = format(dg[-1], 'b')
    m = first[:chunkSize]
    if m[-1] == '1' or m == '10':
        mults -= 1
    squares += len(first) - len(m)
    return Mults(squares, mults, 0, chunkSize)

# extended tests ---------------------------------------------------
__test__ = {
'mul_bin_small': '''
    >>> for i in range(256):
    ...   if est_mul_bin(i,4) != mul_bin(i, 4):
    ...      print(i, est_mul_bin(i,4), mul_bin(i, 4))
''',
'mul_bin_big': '''
    >>> for b in range(8, 100):
    ...   r = getrandbits(b)
    ...   if est_mul_bin(r, 15) != mul_bin(r, 15):
    ...      print(i, est_mul_bin(i,15), mul_bin(i, 15))
    ...   if est_mul_bin(r, 30) != mul_bin(r, 30):
    ...      print(i, est_mul_bin(i,30), mul_bin(i, 30))
''',
'mulP_small': '''
    >>> for i in range(256):
    ...   if est_mulP(i,4,2) != mulP(i, 4, 2):
    ...      print(i, est_mulP(i,4, 2), mulP(i, 4, 2))
''',
'mulP_big': '''
    >>> for b in range(8, 100):
    ...   r = getrandbits(b)
    ...   if est_mulP(r, 15, 3) != mulP(r, 15, 3):
    ...      print(b, est_mulP(r, 15, 3), mulP(r, 15, 3))
    ...   if est_mulP(r, 30, 5) != mulP(r, 30, 5):
    ...      print(b, est_mulP(r, 30, 5), mulP(r, 30, 5))
''',
'mulJ_small': '''
    >>> mulJ(0, 15, 1)
    Mults(squares=0, mults=0, empty=0, chunkSize=1)
    >>> mulJ(0, 15, 3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    >>> mulJ(0, 15, 5)
    Mults(squares=1, mults=15, empty=0, chunkSize=5)
    >>> for i in range(12):
    ...    mulJ(i, 8, 3) #doctest:  +REPORT_NDIFF
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=1, mults=4, empty=0, chunkSize=3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=1, mults=4, empty=0, chunkSize=3)
    Mults(squares=1, mults=3, empty=0, chunkSize=3)
    Mults(squares=2, mults=4, empty=0, chunkSize=3)
    Mults(squares=2, mults=5, empty=0, chunkSize=3)
    Mults(squares=2, mults=3, empty=0, chunkSize=3)
    Mults(squares=2, mults=4, empty=0, chunkSize=3)
''',
'mulJ_big': '''
    >>> for b in range(8, 20):
    ...   r = getrandbits(b)
    ...   if est_mulJ(r, 15, 3) != mulJ(r, 15, 3):
    ...      print(b, r, est_mulJ(r, 15, 3), mulJ(r, 15, 3))
    ...   if est_mulJ(r, 30, 5) != mulJ(r, 30, 5):
    ...      print(b, r, est_mulJ(r, 30, 5), mulJ(r, 30, 5))
    ...  # doctest: 
''',
'otfR_small': '''
    >>> otfR(0x1b, 15, 2)
    Mults(squares=4, mults=2, empty=0, chunkSize=2)
    >>> otfR(0x27, 15, 3)
    Mults(squares=3, mults=5, empty=0, chunkSize=3)
    >>> otfR(0x73, 15, 3)
    Mults(squares=5, mults=4, empty=0, chunkSize=3)
    >>> for i in range(10):
    ...    print (i, otfR(i, 8, 2 + i % 2))   #doctest:   +REPORT_NDIFF
    0 Mults(squares=0, mults=0, empty=0, chunkSize=2)
    1 Mults(squares=0, mults=0, empty=0, chunkSize=3)
    2 Mults(squares=1, mults=0, empty=0, chunkSize=2)
    3 Mults(squares=1, mults=1, empty=0, chunkSize=3)
    4 Mults(squares=2, mults=0, empty=0, chunkSize=2)
    5 Mults(squares=1, mults=2, empty=0, chunkSize=3)
    6 Mults(squares=2, mults=1, empty=0, chunkSize=2)
    7 Mults(squares=1, mults=3, empty=0, chunkSize=3)
    8 Mults(squares=3, mults=0, empty=0, chunkSize=2)
    9 Mults(squares=3, mults=1, empty=0, chunkSize=3)
''',
'quality': '''
    >>> sum(int(est_mulP(n, 30, 5)) for n in range(1024))
    38688
    >>> sum(int(est_mulJ(n, 30, 5)) for n in range(1024))
    21934
    >>> sum(int(est_mulP(n, 30, 3)) for n in range(1024))
    17832
    >>> sum(int(est_mulP(n, 30, 2)) for n in range(1024))
    14424
    >>> sum(int(est_mul_bin(n, 30)) for n in range(1024))
    13314
    >>> sum(int(est_mulJ(n, 30, 3)) for n in range(1024))
    12482
    >>> sum(int(est_mulJ(n, 30, 2)) for n in range(1024))
    11724
    >>> sum(int(otfR(n, 30, 3)) for n in range(1024))
    11581
    >>> sum(int(otfR(n, 30, 2)) for n in range(1024))
    11578
    >>> sum(int(otfR(n, 30, 2)) > int(est_mul_bin(n, 30)) for n in range(1024))
    0
''',
}

def setupDoctest():
    import ast
    parsed = ast.parse(open(__file__).read(), "doctest")
    doctypes = ast.Module, ast.FunctionDef, ast.ClassDef
    for node in ast.walk(parsed):
        if isinstance(node, doctypes):
            d = ast.get_docstring(node, True)
            if d:
                if getattr(node, "name", "module") in __test__:
                    print("Doubly defined:", node.name)
                __test__[getattr(node, "name", "module")] = d

if __name__ == "__main__":
    try:
        assert False
        setupDoctest()
    except AssertionError: pass
    print("Testing...")
    import doctest
    print(doctest.testmod())

if False:
    while True:
        prefix = 4
        l = int(input("length:"))
        n = getrandbits(l-3)|prefix<<l-3
        print(hex(n))
        print(est_mulJ(n,30,1))
        print(est_mulJ(n,30,2))