""" A script to merge Doc/ACKS.txt into Misc/ACKS. Usage: python merge-acks.py This script overwrites Misc/ACKS with a new file that adds all new names from Doc/ACKS.txt. The order of names in Misc/ACKS is preserved. New names are inserted in order relative to Misc/ACKS using the last name, and then the first. The script must be run from the source root. Because the same name can be spelled differently in the two files, the script prefixes possible duplicates with the string ">>> " so that the names can be reviewed manually. Currently, the script prefixes any name whose last name matches an existing last name. The script maintains a translation table of non-ascii characters for alphabetization purposes. The script exits prematurely if this table needs to be updated (in case new names have been added since the script was written). Before exiting, the script prints the new characters that need to be added to simplify cutting and pasting. No attempt was made to make the script performant, as it runs in a few seconds. """ # Approximations obtained from: # http://www.fileformat.info/info/unicode/char/search.htm # # з: Cyrillic ze # и: Cyrillic i # л: Cyrillic el # м: Cyrillic em # у: Cyrillic u # х: Cyrillic ha # ы: Cyrillic yeru # ь: Cyrillic "soft sign" NON_ASCII = "ÅÉØáäåæçéëíимñóôöùúüćęŁлńŽКМСабгекнхорушызь“”" ASCII_SUB = 'AEOaaaaceeiimnooouuuceLlnZKMCabrekhhopuwyzz""' CHARACTER_TRANS = str.maketrans(NON_ASCII, ASCII_SUB) MISC_NAMES_OUT_OF_ORDER = [ 'Jan-Hein Bührman', 'Jeff McNeil', 'Hugo Lopes Tavares' ] # Ending strings that should not be used for sorting purposes. ENDINGS_TO_IGNORE = [' IV', ', Jr.'] DOC_PATH = 'Doc/ACKS.txt' MISC_PATH = 'Misc/ACKS' # The 0-based line index of the first name in Misc/ACKS. MISC_INDEX = 13 def _get_nonascii_chars_from_string(s): """Return as a set the non-ascii characters in a string.""" chars = set() try: s.encode('ascii') return chars except UnicodeEncodeError: pass for c in s: try: c.encode('ascii') except UnicodeEncodeError: chars.add(c) return chars def get_nonascii_chars(strings): """Return as a string the non-ascii characters in a list of strings. The characters in the return value are sorted by their ord() value. """ chars = set() for s in strings: new_chars = _get_nonascii_chars_from_string(s) if new_chars: print(s) chars = chars.union(new_chars) chars = sorted(chars, key=ord) return "".join(chars) def remove_endings_to_ignore(name): for ending in ENDINGS_TO_IGNORE: if name.endswith(ending): name = name[:-len(ending)] return name def get_last_name(name): name = remove_endings_to_ignore(name) return name.split(" ").pop() def get_last_names(names): last_names = set() for name in names: last_name = get_last_name(name) last_names.add(last_name) return last_names def _normalize(name): """Normalize a name for sorting purposes.""" name = remove_endings_to_ignore(name) # "de Gaye" is ordered as if "deGaye" is the last name. name = name.replace("de Gaye", "deGaye") name = name.translate(CHARACTER_TRANS) name = name.lower() parts = name.rsplit(" ", 1) # last name followed by first names, if any. return list(reversed(parts)) def is_less_than(name1, name2): return _normalize(name1) < _normalize(name2) def read_lines(path): """Return the lines in a UTF-8 encoded file.""" with open(path, encoding='utf-8') as f: text = f.read() return text.splitlines() def read_doc_names(): """Return the names in Doc/ACKS.txt.""" lines = read_lines(DOC_PATH) for i, line in enumerate(lines): if line.startswith(" "): break return [line.lstrip(" *") for line in lines[i:]] def read_misc(): lines = read_lines(MISC_PATH) intro, names = lines[:MISC_INDEX], lines[MISC_INDEX:] return intro, names def check_nonascii(strings): print("Scanning for all non-ascii characters...") found_non_ascii = get_nonascii_chars(strings) if set(found_non_ascii) <= set(NON_ASCII): return print("New non-ascii characters found.\n" "You need to add them to the translation table.") print("current: %s" % NON_ASCII) print("found: %s" % found_non_ascii) for c in found_non_ascii: is_new = c not in NON_ASCII print("*" if is_new else " ", c, ord(c)) exit(1) def main(): doc_names = read_doc_names() misc_intro, misc_names = read_misc() check_nonascii(doc_names + misc_names) misc_names_set = set(misc_names) misc_last_names = get_last_names(misc_names) for doc_name in doc_names: if doc_name in misc_names_set: # Then it is a duplicate. continue doc_last_name = get_last_name(doc_name) is_possible_duplicate = doc_last_name in misc_last_names for i, misc_name in enumerate(misc_names): if misc_name in MISC_NAMES_OUT_OF_ORDER: # Don't use out-of-order names as a reference when # inserting new names. continue if is_less_than(doc_name, misc_name): break else: # Insert at the end. i += 1 print("insert: %s < %s < %s" % (misc_names[i - 1], doc_name, misc_name)) prefix = '>>> ' if is_possible_duplicate else '' misc_names.insert(i, prefix + doc_name) print() misc_lines = misc_intro + misc_names misc_text = "\n".join(misc_lines) + '\n' print("Overwriting: " + MISC_PATH) with open(MISC_PATH, mode='w', encoding='utf-8') as f: f.write(misc_text) if __name__=='__main__': main()