#!/usr/bin/env python3.2
# -*- coding: UTF-8 -*-
"""Exercise Python's Unicode casemapping and case-insensitive matching.

Each row of ``data`` holds an original string followed by its expected
lowercase / titlecase / uppercase forms (simple and full mappings) and its
expected casefolds (simple, Turkic, full).  For every row the script can run
three families of subtests, selected by the ``RUN_*`` flags below, and prints
a per-row verdict plus a final success/failure tally.

Run it as a script with PYTHONIOENCODING set to utf8; importing the module
only defines the table and helpers and runs nothing.
"""

import os
import re
import sys

# Lord, they do make these things difficult in python, don't they?!
try:
    # Matthew Barnett's third-party engine; only needed if USE_MRAB_REGEX is set.
    import regex
except ImportError:
    regex = None

# python is only capable of simple casemapping (which seems an error in a string lang)
# and of simple casefolding (which is rather more debatable)
ONLY_SIMPLES = 1

# which tests to run, can be any or all
RUN_MAPPING_TESTS = 1    # whether tolower&c returns what we expected
RUN_AGREEMENT_TESTS = 1  # whether tolower&c returns islower&c
RUN_FOLDING_TESTS = 1    # whether original ci-matches lower/title/upper/fold data

# determines which pattern matching library to use for folding tests:
#   when true uses Matthew Barnett's "regex"
#   otherwise uses std "re"
USE_MRAB_REGEX = 0


def str_matched_rx(string, pattern):
    """Case-insensitively match the whole of *string* against *pattern*.

    *pattern* is used as a regular expression as-is (it is NOT escaped) and is
    wrapped in ``^...$``.  The engine is chosen by ``USE_MRAB_REGEX``.

    Returns the engine's match object on success, or None on failure.
    Raises ImportError if ``USE_MRAB_REGEX`` is set but ``regex`` is missing.
    """
    anchor_pattern = '^' + pattern + '$'
    if USE_MRAB_REGEX:
        if regex is None:
            raise ImportError("USE_MRAB_REGEX is set but the 'regex' module is not installed")
        return regex.search(anchor_pattern, string, regex.IGNORECASE | regex.UNICODE)
    return re.search(anchor_pattern, string, re.I | re.U)


# this is the only part that differs in the p2/p3 versions; too annoying
# to massage the data conditionally, and can't use proper unicode strings
# in p2 without the ugly u'' hack
#
# NOTE(review): a few rows (e.g. "fi", "filesystem" -> "fiLESYSTEM", "dz",
# "poſt", "ʼn groot") only make sense if the originals were single-codepoint
# ligatures/digraphs; they look as if a text-normalisation step expanded
# them.  Confirm against the upstream copy of this table before trusting
# failures reported for those rows.
data = [
    # ORIG  LC_SIMPLE  TC_SIMPLE  UC_SIMPLE  LC_FULL  TC_FULL  UC_FULL  FC_SIMPLE  FC_TURKIC  FC_FULL
    [ "þǽr rihtes", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "þǽr rihtes", "þǽr rihtes", ],
    [ "duȝeðlice", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "duȝeðlice", "duȝeðlice", ],
    [ "Ævar Arnfjörð Bjarmason", "ævar arnfjörð bjarmason", "Ævar Arnfjörð Bjarmason", "ÆVAR ARNFJÖRÐ BJARMASON", "ævar arnfjörð bjarmason", "Ævar Arnfjörð Bjarmason", "ÆVAR ARNFJÖRÐ BJARMASON", "ævar arnfjörð bjarmason", "ævar arnfjörð bjarmason", "ævar arnfjörð bjarmason", ],
    [ "Кириллица", "кириллица", "Кириллица", "КИРИЛЛИЦА", "кириллица", "Кириллица", "КИРИЛЛИЦА", "кириллица", "кириллица", "кириллица", ],
    [ "Van Dijke", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "van dijke", "van dijke", ],
    [ "fi", "fi", "fi", "fi", "fi", "Fi", "FI", "fi", "fi", "fi", ],
    [ "filesystem", "filesystem", "filesystem", "fiLESYSTEM", "filesystem", "Filesystem", "FILESYSTEM", "filesystem", "filesystem", "filesystem", ],
    [ "efficient", "efficient", "Efficient", "EffiCIENT", "efficient", "Efficient", "EFFICIENT", "efficient", "efficient", "efficient", ],
    [ "flour and water", "flour and water", "flour And Water", "flOUR AND WATER", "flour and water", "Flour And Water", "FLOUR AND WATER", "flour and water", "flour and water", "flour and water", ],
    [ "dz", "dz", "Dz", "DZ", "dz", "Dz", "DZ", "dz", "dz", "dz", ],
    [ "dzur mountain", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "dzur mountain", "dzur mountain", ],
    [ "poſt", "poſt", "Poſt", "POST", "poſt", "Poſt", "POST", "post", "post", "post", ],
    [ "poſt", "poſt", "Poſt", "POſt", "poſt", "Poſt", "POST", "poſt", "post", "post", ],
    [ "ſtop", "ſtop", "ſtop", "ſtOP", "ſtop", "Stop", "STOP", "ſtop", "stop", "stop", ],
    [ "tschüß", "tschüß", "Tschüß", "TSCHÜß", "tschüß", "Tschüß", "TSCHÜSS", "tschüß", "tschüss", "tschüss", ],
    [ "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "tschüss", "tschüss", ],
    [ "rußland", "rußland", "Rußland", "RUßLAND", "rußland", "Rußland", "RUSSLAND", "rußland", "russland", "russland", ],
    [ "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "russland", "russland", ],
    [ "weiß", "weiß", "Weiß", "WEIß", "weiß", "Weiß", "WEISS", "weiß", "weiss", "weiss", ],
    [ "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "weıss", "weiss", ],
    [ "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ssıew", "ssiew", ],
    [ "ͅ", "ͅ", "Ι", "Ι", "ͅ", "Ι", "Ι", "ι", "ι", "ι", ],
    [ "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι", ],
    [ "Ὰι", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ],
    [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ],
    [ "ᾲ", "ᾲ", "ᾲ", "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "ὰι", "ὰι", ],
    [ "Ὰͅ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι", ],
    [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ],
    [ "ᾲ στο διάολο", "ᾲ στο διάολο", "ᾲ Στο Διάολο", "ᾲ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο", ],
    [ "ᾲ στο διάολο", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ὰι στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο", ],
    [ "ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "ⅷ", "ⅷ", ],
    [ "henry ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "henry ⅷ", "henry ⅷ", ],
    [ "ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "ⓚ", "ⓚ", ],
    [ "i work at ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "i work at ⓚ", "i work at ⓚ", ],
    [ "istambul", "istambul", "Istambul", "ISTAMBUL", "istambul", "Istambul", "ISTAMBUL", "istambul", "istambul", "istambul", ],
    [ "i̇stanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "i̇stanbul", "i̇stanbul", ],
    [ "İstanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "ı̇stanbul", "i̇stanbul", ],
    [ "İSTANBUL", "istanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "İstanbul", "istanbul", "i̇stanbul", ],
    [ "στιγμας", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ],
    [ "στιγμασ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ],
    [ "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ],
    [ "ʀᴀʀᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", ],
    [ "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", ],
    [ "Ԧԧ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "ԧԧ", "ԧԧ", ],
    [ "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "Մնﬔﬕﬖﬗ", "ՄՆՄԵՄԻՎՆՄԽ", "ﬓﬔﬕﬖﬗ", "մնմեմիվնմխ", "մնմեմիվնմխ", ],
    [ "ʼn groot", "ʼn groot", "ʼn Groot", "ʼn GROOT", "ʼn groot", "ʼN Groot", "ʼN GROOT", "ʼn groot", "ʼn groot", "ʼn groot", ],
    [ "ẚ", "ẚ", "ẚ", "ẚ", "ẚ", "Aʾ", "Aʾ", "ẚ", "aʾ", "aʾ", ],
    [ "ff", "ff", "ff", "ff", "ff", "Ff", "FF", "ff", "ff", "ff", ],
    [ "ǰ", "ǰ", "ǰ", "ǰ", "ǰ", "J̌", "J̌", "ǰ", "ǰ", "ǰ", ],
    [ "550 nm or Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 nm or å", "550 nm or å", ],
]


def check_encoding_env():
    """Exit with status 1 unless PYTHONIOENCODING is set to utf8 / utf-8."""
    encoding = os.environ.get("PYTHONIOENCODING", "")
    if not re.search("^utf-?8$", encoding, re.I):
        sys.stderr.write(sys.argv[0] + ": set your PYTHONIOENCODING envariable to utf8 and rerun\n")
        sys.exit(1)


def run_row_tests(mapping_row):
    """Run every enabled subtest for one row of ``data``.

    *mapping_row* is a 10-item sequence: the original string, then the
    simple lower/title/upper forms, the full lower/title/upper forms, and the
    simple, Turkic and full casefolds.

    Prints one line per failed subtest and returns ``(good, bad)`` counts.
    """
    (orig,
     lower_simple, title_simple, upper_simple,
     lower_full, title_full, upper_full,
     fold_simple, fold_turkic, fold_full) = mapping_row

    if ONLY_SIMPLES:
        wanted = (lower_simple, title_simple, upper_simple, fold_simple)
    else:
        wanted = (lower_full, title_full, upper_full, fold_full)
    lower_want, title_want, upper_want, fold_want = wanted

    lower_have = orig.lower()
    title_have = orig.title()
    upper_have = orig.upper()

    # (passed, failure message) pairs, kept in the order the subtests run.
    checks = []

    if RUN_AGREEMENT_TESTS:
        # Does str.lower()/title()/upper() produce something that
        # islower()/istitle()/isupper() agrees with?  Report the value that
        # was actually tested (the *_have string), not the table's expectation.
        for label, have, agrees in (
                ("lowercase", lower_have, lower_have.islower()),
                ("titlecase", title_have, title_have.istitle()),
                ("uppercase", upper_have, upper_have.isupper())):
            checks.append((agrees,
                           "wanted <" + have + "> to be " + label + " of <" + orig + "> but python disagrees"))

    if RUN_MAPPING_TESTS:
        for label, want, have in (
                ("lowercase", lower_want, lower_have),
                ("titlecase", title_want, title_have),
                ("uppercase", upper_want, upper_have)):
            checks.append((want == have,
                           "failed casemap: " + label + " of <" + orig + "> should be <" + want + "> not <" + have + ">"))

    if RUN_FOLDING_TESTS:
        for label, want in (
                ("lower", lower_want),
                ("title", title_want),
                ("upper", upper_want),
                ("fold", fold_want)):
            checks.append((bool(str_matched_rx(orig, want)),
                           "failed casefold: <" + orig + "> should match " + label + " <" + want + ">"))

        if not ONLY_SIMPLES:
            # Extra casefold checks.  These go through str_matched_rx like the
            # checks above so they are anchored at both ends and honour
            # USE_MRAB_REGEX.
            if fold_simple == fold_turkic and fold_simple == fold_full:
                extra = [("simple", fold_simple)]
            else:
                extra = [("full", fold_full)]
                if fold_full != fold_turkic:
                    extra.append(("turkic", fold_turkic))
            for label, want in extra:
                checks.append((bool(str_matched_rx(orig, want)),
                               "failed " + label + " casefold: <" + orig + "> should match "
                               + label + " casefold <" + want + ">"))

    good = 0
    bad = 0
    for passed, message in checks:
        if passed:
            good = good + 1
        else:
            print(message)
            bad = bad + 1
    return good, bad


def summary_lines(allgood, allbad):
    """Return the two "Total ..." summary lines for the given tallies.

    When no subtests ran at all (total of zero) both percentages are
    reported as 0 instead of dividing by zero.
    """
    total = allgood + allbad
    mask = "Total %-10s %3d / %3d (%3.0f%%)"

    def percent(count):
        return 100.0 * count / total if total else 0.0

    return [
        mask % ("failures:", allbad, total, percent(allbad)),
        mask % ("successes:", allgood, total, percent(allgood)),
    ]


def main():
    """Run the enabled subtests over every row of ``data`` and print a tally."""
    check_encoding_env()
    if USE_MRAB_REGEX and regex is None:
        sys.stderr.write(sys.argv[0] + ": USE_MRAB_REGEX is set but the regex module is not installed\n")
        sys.exit(1)

    allgood = 0
    allbad = 0
    for testno, mapping_row in enumerate(data):
        print("\ntest", testno, "for string", mapping_row[0])
        good, bad = run_row_tests(mapping_row)
        if bad != 0:
            print("test", testno, "failed", bad, "subtests")
        else:
            print("all subtests passed for test", testno)
        allgood = allgood + good
        allbad = allbad + bad

    print("")
    for line in summary_lines(allgood, allbad):
        print(line)


if __name__ == "__main__":
    main()

####################################################