"""
A script to merge Doc/ACKS.txt into Misc/ACKS.

Usage: python merge-acks.py

This script overwrites Misc/ACKS with a new file that adds all new names
from Doc/ACKS.txt.  The order of names in Misc/ACKS is preserved.  New
names are inserted in order relative to Misc/ACKS using the last name,
and then the first.

The script must be run from the source root.

Because the same name can be spelled differently in the two files, the
script prefixes possible duplicates with the string ">>> " so that the
names can be reviewed manually.  Currently, the script prefixes any name
whose last name matches an existing last name.

The script maintains a translation table of non-ascii characters
for alphabetization purposes.  The script exits prematurely if
this table needs to be updated (in case new names have been added
since the script was written).  Before exiting, the script prints the
new characters that need to be added to simplify cutting and pasting.

No attempt was made to make the script performant, as it runs in a few
seconds.

"""

# Approximations obtained from:
#   http://www.fileformat.info/info/unicode/char/search.htm
#
# з: Cyrillic ze
# и: Cyrillic i
# л: Cyrillic el
# м: Cyrillic em
# у: Cyrillic u
# х: Cyrillic ha
# ы: Cyrillic yeru
# ь: Cyrillic "soft sign"
NON_ASCII = "ÅÉØáäåæçéëíимñóôöùúüćęŁлńŽКМСабгекнхорушызь“”"
ASCII_SUB = 'AEOaaaaceeiimnooouuuceLlnZKMCabrekhhopuwyzz""'

# Translation table mapping each character of NON_ASCII to the character at
# the same position in ASCII_SUB.  The two strings are therefore parallel and
# must stay the same length (str.maketrans() rejects unequal lengths).
# _normalize() applies this table when building sort keys.
CHARACTER_TRANS = str.maketrans(NON_ASCII, ASCII_SUB)

# Names in Misc/ACKS that are not in sorted position.  main() skips them
# when searching for the place to insert a new name, so that they are never
# used as a reference point.
MISC_NAMES_OUT_OF_ORDER = [
    'Jan-Hein Bührman',
    'Jeff McNeil',
    'Hugo Lopes Tavares'
]

# Ending strings that should not be used for sorting purposes.
ENDINGS_TO_IGNORE = [' IV', ', Jr.']

# Both paths are relative, so the script must be run from the source root.
# DOC_PATH is only read; MISC_PATH is read and then overwritten.
DOC_PATH = 'Doc/ACKS.txt'
MISC_PATH = 'Misc/ACKS'
# The 0-based line index of the first name in Misc/ACKS.
MISC_INDEX = 13


def _get_nonascii_chars_from_string(s):
    """Return as a set the non-ascii characters in a string."""
    chars = set()
    try:
        s.encode('ascii')
        return chars
    except UnicodeEncodeError:
        pass
    for c in s:
        try:
            c.encode('ascii')
        except UnicodeEncodeError:
            chars.add(c)
    return chars

def get_nonascii_chars(strings):
    """Return the distinct non-ascii characters found in a list of strings.

    Each string containing at least one such character is printed to
    stdout as it is encountered.  The result is one string whose characters
    are ordered by their ord() value.

    """
    found = set()
    for text in strings:
        extras = _get_nonascii_chars_from_string(text)
        if not extras:
            continue
        # Echo the offending string so it can be located in the input files.
        print(text)
        found |= extras

    return "".join(sorted(found, key=ord))

def remove_endings_to_ignore(name):
    """Return *name* with any trailing ENDINGS_TO_IGNORE entry cut off.

    The endings are tried one after another in list order, each against the
    result of the previous step, so more than one ending may be removed.

    """
    trimmed = name
    for suffix in ENDINGS_TO_IGNORE:
        if not trimmed.endswith(suffix):
            continue
        trimmed = trimmed[:-len(suffix)]
    return trimmed

def get_last_name(name):
    """Return the last space-separated word of *name*.

    Endings handled by remove_endings_to_ignore() are dropped first, so they
    are never reported as the last name.

    """
    words = remove_endings_to_ignore(name).split(" ")
    # str.split(" ") always yields at least one item, so [-1] is safe.
    return words[-1]

def get_last_names(names):
    """Return the set of last names (see get_last_name) of all *names*."""
    return {get_last_name(full_name) for full_name in names}

def _normalize(name):
    """Build the sort key for a name.

    The key is a list holding the last word of the name followed by the
    remaining words (if any) as one string, after ignorable endings are
    removed, non-ascii characters are replaced via CHARACTER_TRANS and the
    text is lower-cased.

    """
    key = remove_endings_to_ignore(name)
    # Glue "de Gaye" together so that it sorts as the last name "deGaye".
    key = key.replace("de Gaye", "deGaye").translate(CHARACTER_TRANS).lower()

    # Split off the last word only; a one-word name gives a one-item list.
    pieces = key.rsplit(" ", 1)
    pieces.reverse()  # last name first, then the first names
    return pieces

def is_less_than(name1, name2):
    """Return whether *name1* sorts strictly before *name2*.

    Names are compared by their _normalize() keys, i.e. by last name and
    then by first names.

    """
    key1 = _normalize(name1)
    key2 = _normalize(name2)
    return key1 < key2

def read_lines(path):
    """Read the UTF-8 encoded file at *path* and return its lines.

    The lines are returned as a list of strings without line terminators.

    """
    with open(path, encoding='utf-8') as stream:
        return stream.read().splitlines()

def read_doc_names():
    """Return the names in Doc/ACKS.txt.

    The list of names is taken to start at the first line that begins with
    two spaces; everything before it is treated as introductory text.  From
    that line on, leading spaces and asterisks (the list markup) are removed
    from every line.  Lines that are empty after stripping are skipped so
    that a stray blank line is never merged as a name.

    Returns an empty list when the file contains no indented line at all.

    """
    lines = read_lines(DOC_PATH)
    for start, line in enumerate(lines):
        if line.startswith("  "):
            break
    else:
        # No indented line (or an empty file): there is nothing to merge.
        # Without this guard the loop index would be unbound or would point
        # at the last line, which is not a name.
        return []

    stripped = (line.lstrip(" *") for line in lines[start:])
    return [name for name in stripped if name]

def read_misc():
    """Return the lines of Misc/ACKS as an (intro, names) pair of lists.

    The first MISC_INDEX lines form the intro; all remaining lines are
    treated as names.

    """
    all_lines = read_lines(MISC_PATH)
    return all_lines[:MISC_INDEX], all_lines[MISC_INDEX:]

def check_nonascii(strings):
    """Verify that the translation table covers every non-ascii character.

    Scans *strings* for non-ascii characters (printing each string that
    contains one).  Returns normally when all of them are already present
    in NON_ASCII.  Otherwise prints the current table, the characters found
    and a per-character listing in which new characters are marked with
    "*", then terminates the script.

    Raises:
        SystemExit: with exit status 1 when a character is missing from
            NON_ASCII.

    """
    print("Scanning for all non-ascii characters...")
    found_non_ascii = get_nonascii_chars(strings)

    if set(found_non_ascii) <= set(NON_ASCII):
        return

    print("New non-ascii characters found.\n"
          "You need to add them to the translation table.")
    print("current: %s" % NON_ASCII)
    print("found:   %s" % found_non_ascii)

    for c in found_non_ascii:
        is_new = c not in NON_ASCII
        print("*" if is_new else " ", c, ord(c))
    # Raise SystemExit directly: the exit() builtin is injected by the site
    # module and is not available when the interpreter runs without it.
    raise SystemExit(1)

def main():
    """Merge the names of Doc/ACKS.txt into Misc/ACKS and rewrite the file.

    Steps:
      1. Read both files and make sure the translation table knows every
         non-ascii character (the script exits otherwise).
      2. For each doc name not already present, find the first in-order
         Misc/ACKS name that sorts after it and insert the doc name just
         before that one; append it if there is no such name.
      3. Prefix inserted names with ">>> " when their last name matches a
         last name that was already in Misc/ACKS, so they can be reviewed.
      4. Overwrite Misc/ACKS with the intro lines followed by the names.

    """
    doc_names = read_doc_names()
    misc_intro, misc_names = read_misc()

    check_nonascii(doc_names + misc_names)

    misc_names_set = set(misc_names)
    # Deliberately computed once: only last names that existed in Misc/ACKS
    # before the merge count towards the "possible duplicate" marker.
    misc_last_names = get_last_names(misc_names)

    for doc_name in doc_names:
        if doc_name in misc_names_set:
            # Then it is a duplicate.
            continue

        is_possible_duplicate = get_last_name(doc_name) in misc_last_names

        # Default to inserting at the end.  This also covers an empty
        # misc_names list, for which the loop body never runs.
        index = len(misc_names)
        for i, misc_name in enumerate(misc_names):
            if misc_name in MISC_NAMES_OUT_OF_ORDER:
                # Don't use out-of-order names as a reference when
                # inserting new names.
                continue
            if is_less_than(doc_name, misc_name):
                index = i
                break

        # Report the real neighbours of the insertion point.  At either end
        # of the list there is no neighbour, so show a placeholder instead
        # of wrapping around to an unrelated name.
        before = misc_names[index - 1] if index > 0 else "(start)"
        after = misc_names[index] if index < len(misc_names) else "(end)"
        print("insert: %s < %s < %s" % (before, doc_name, after))

        prefix = '>>> ' if is_possible_duplicate else ''
        misc_names.insert(index, prefix + doc_name)
        # Remember the name so that a repeated entry in Doc/ACKS.txt is not
        # inserted a second time.
        misc_names_set.add(doc_name)
    print()

    misc_lines = misc_intro + misc_names
    misc_text = "\n".join(misc_lines) + '\n'

    print("Overwriting: " + MISC_PATH)
    with open(MISC_PATH, mode='w', encoding='utf-8') as f:
        f.write(misc_text)

# Run the merge only when executed as a script, not when imported.
if __name__ == '__main__':
    main()