#!/usr/bin/env python3.2
# -*- coding: UTF-8 -*-

import re
import regex

# Lord, they do make these things difficult in python, don't they?!
import sys
import os
# Bail out early unless PYTHONIOENCODING is set to some spelling of UTF-8
# ("utf8" or "utf-8", any letter case); a missing variable is treated like an
# empty value, which fails the check.
if re.search("^utf-?8$", os.environ.get("PYTHONIOENCODING", ""), re.I) is None:
    sys.stderr.write(sys.argv[0] + ": set your PYTHONIOENCODING envariable to utf8 and rerun\n")
    sys.exit(1)

# python is only capable of simple casemapping (which seems an error in a string lang)
#                    and of simple casefolding (which is rather more debatable)
#
# When true, results are compared against the *_SIMPLE columns of the data
# table; when false, they are compared against the *_FULL columns and the
# additional simple/full/turkic casefold checks are run as well.
ONLY_SIMPLES   = 1

# which tests to run, can be any or all (each is tested for truthiness)
RUN_MAPPING_TESTS   = 1   # whether lower()/title()/upper() return what we expected
RUN_AGREEMENT_TESTS = 1   # whether islower()/istitle()/isupper() agree with those results
RUN_FOLDING_TESTS   = 1   # whether original ci-matches lower/title/upper/fold data

# determines which pattern matching library to use for folding tests:
#       when true uses Matthew Barnett's "regex"
#       otherwise uses std "re"
USE_MRAB_REGEX = 0

def str_matched_rx(string, pattern):
    """Case-insensitively match the whole of *string* against *pattern*.

    *pattern* is taken as regular-expression source (it is not escaped) and is
    wrapped in ``^`` ... ``$`` before being searched for in *string*.  The
    search is done by the ``regex`` module when ``USE_MRAB_REGEX`` is true and
    by the standard ``re`` module otherwise, with the IGNORECASE and UNICODE
    flags set in both cases.

    Returns whatever the engine's ``search`` returns: a match object on
    success, ``None`` otherwise.
    """
    engine = regex if USE_MRAB_REGEX else re
    anchored = "^" + pattern + "$"
    return engine.search(anchored, string, engine.IGNORECASE | engine.UNICODE)

# This table is the only part that differs between the Python 2 and Python 3
# versions of this script: massaging the data conditionally is too annoying,
# and Python 2 cannot use proper unicode strings without the ugly u'' hack.
#
# Each row holds ten strings, in the column order given by the header comment
# inside the list: the original text, its expected simple lower/title/upper
# casemappings, its expected full lower/title/upper casemappings, and its
# expected simple, Turkic and full casefoldings.
data = [

    # ORIG LC_SIMPLE TC_SIMPLE UC_SIMPLE LC_FULL TC_FULL UC_FULL FC_SIMPLE FC_TURKIC FC_FULL
    [ "þǽr rihtes", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "þǽr rihtes", "þǽr rihtes",  ],
    [ "duȝeðlice", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "duȝeðlice", "duȝeðlice",  ],
    [ "Ævar Arnfjörð Bjarmason", "ævar arnfjörð bjarmason", "Ævar Arnfjörð Bjarmason", "ÆVAR ARNFJÖRÐ BJARMASON", "ævar arnfjörð bjarmason", "Ævar Arnfjörð Bjarmason", "ÆVAR ARNFJÖRÐ BJARMASON", "ævar arnfjörð bjarmason", "ævar arnfjörð bjarmason", "ævar arnfjörð bjarmason",  ],
    [ "Кириллица", "кириллица", "Кириллица", "КИРИЛЛИЦА", "кириллица", "Кириллица", "КИРИЛЛИЦА", "кириллица", "кириллица", "кириллица",  ],
    [ "Van Dĳke", "van dĳke", "Van Dĳke", "VAN DĲKE", "van dĳke", "Van Dĳke", "VAN DĲKE", "van dĳke", "van dĳke", "van dĳke",  ],
    [ "ﬁ", "ﬁ", "ﬁ", "ﬁ", "ﬁ", "Fi", "FI", "ﬁ", "fi", "fi",  ],
    [ "ﬁlesystem", "ﬁlesystem", "ﬁlesystem", "ﬁLESYSTEM", "ﬁlesystem", "Filesystem", "FILESYSTEM", "ﬁlesystem", "filesystem", "filesystem",  ],
    [ "eﬃcient", "eﬃcient", "Eﬃcient", "EﬃCIENT", "eﬃcient", "Eﬃcient", "EFFICIENT", "eﬃcient", "efficient", "efficient",  ],
    [ "ﬂour and water", "ﬂour and water", "ﬂour And Water", "ﬂOUR AND WATER", "ﬂour and water", "Flour And Water", "FLOUR AND WATER", "ﬂour and water", "flour and water", "flour and water",  ],
    [ "ǳ", "ǳ", "ǲ", "Ǳ", "ǳ", "ǲ", "Ǳ", "ǳ", "ǳ", "ǳ",  ],
    [ "ǳur mountain", "ǳur mountain", "ǲur Mountain", "ǱUR MOUNTAIN", "ǳur mountain", "ǲur Mountain", "ǱUR MOUNTAIN", "ǳur mountain", "ǳur mountain", "ǳur mountain",  ],
    [ "poſt", "poſt", "Poſt", "POST", "poſt", "Poſt", "POST", "post", "post", "post",  ],
    [ "poﬅ", "poﬅ", "Poﬅ", "POﬅ", "poﬅ", "Poﬅ", "POST", "poﬅ", "post", "post",  ],
    [ "ﬅop", "ﬅop", "ﬅop", "ﬅOP", "ﬅop", "Stop", "STOP", "ﬅop", "stop", "stop",  ],
    [ "tschüß", "tschüß", "Tschüß", "TSCHÜß", "tschüß", "Tschüß", "TSCHÜSS", "tschüß", "tschüss", "tschüss",  ],
    [ "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "tschüss", "tschüss",  ],
    [ "rußland", "rußland", "Rußland", "RUßLAND", "rußland", "Rußland", "RUSSLAND", "rußland", "russland", "russland",  ],
    [ "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "russland", "russland",  ],
    [ "weiß", "weiß", "Weiß", "WEIß", "weiß", "Weiß", "WEISS", "weiß", "weiss", "weiss",  ],
    [ "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "weıss", "weiss",  ],
    [ "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ssıew", "ssiew",  ],
    [ "ͅ", "ͅ", "Ι", "Ι", "ͅ", "Ι", "Ι", "ι", "ι", "ι",  ],
    [ "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
    [ "Ὰι", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
    [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
    [ "ᾲ", "ᾲ", "ᾲ", "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "ὰι", "ὰι",  ],
    [ "Ὰͅ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
    [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
    [ "ᾲ στο διάολο", "ᾲ στο διάολο", "ᾲ Στο Διάολο", "ᾲ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
    [ "ᾲ στο διάολο", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ὰι στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
    [ "ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "ⅷ", "ⅷ",  ],
    [ "henry ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "henry ⅷ", "henry ⅷ",  ],
    [ "ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "ⓚ", "ⓚ",  ],
    [ "i work at ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "i work at ⓚ", "i work at ⓚ",  ],
    [ "istambul", "istambul", "Istambul", "ISTAMBUL", "istambul", "Istambul", "ISTAMBUL", "istambul", "istambul", "istambul",  ],
    [ "i̇stanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "i̇stanbul", "i̇stanbul",  ],
    [ "İstanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "ı̇stanbul", "i̇stanbul",  ],
    [ "İSTANBUL", "istanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "İstanbul", "istanbul", "i̇stanbul",  ],
    [ "στιγμας", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
    [ "στιγμασ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
    [ "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
    [ "ʀᴀʀᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ",  ],
    [ "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻",  ],
    [ "Ԧԧ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "ԧԧ", "ԧԧ",  ],
    [ "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "Մնﬔﬕﬖﬗ", "ՄՆՄԵՄԻՎՆՄԽ", "ﬓﬔﬕﬖﬗ", "մնմեմիվնմխ", "մնմեմիվնմխ",  ],
    [ "ŉ groot", "ŉ groot", "ŉ Groot", "ŉ GROOT", "ŉ groot", "ʼN Groot", "ʼN GROOT", "ŉ groot", "ʼn groot", "ʼn groot",  ],
    [ "ẚ", "ẚ", "ẚ", "ẚ", "ẚ", "Aʾ", "Aʾ", "ẚ", "aʾ", "aʾ",  ],
    [ "ﬀ", "ﬀ", "ﬀ", "ﬀ", "ﬀ", "Ff", "FF", "ﬀ", "ff", "ff",  ],
    [ "ǰ", "ǰ", "ǰ", "ǰ", "ǰ", "J̌", "J̌", "ǰ", "ǰ", "ǰ",  ],
    [ "550 nm or Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 nm or å", "550 nm or å",  ],

]

def _subtest(passed, failure_message):
    """Report the outcome of one subtest and return it as a bool.

    Nothing is printed when *passed* is truthy; otherwise *failure_message*
    is printed.  The outcome is reported immediately so that the order of the
    messages follows the order in which the subtests are evaluated.
    """
    if passed:
        return True
    print(failure_message)
    return False


# Running totals of passed/failed subtests over every row of the data table.
allgood = 0
allbad  = 0

testno = 0
for mapping_row in data:
    # Each row is: original text, three simple casemappings, three full
    # casemappings, then the simple, Turkic and full casefoldings.
    (orig, lower_simple, title_simple, upper_simple,
           lower_full,   title_full,   upper_full,
           fold_simple,  fold_turkic,  fold_full,
    ) = mapping_row

    print("\ntest", testno, "for string", orig)

    # Choose which set of expected values the results are compared against.
    if ONLY_SIMPLES:
        (lower_want,   title_want,   upper_want,   fold_want  ) = (
         lower_simple, title_simple, upper_simple, fold_simple)
    else:
        (lower_want,   title_want,   upper_want,   fold_want  ) = (
         lower_full,   title_full,   upper_full,   fold_full)

    lower_have = orig.lower()
    title_have = orig.title()
    upper_have = orig.upper()

    # (case name, expected mapping, actual mapping, predicate for that case)
    casemaps = (
        ("lower", lower_want, lower_have, str.islower),
        ("title", title_want, title_have, str.istitle),
        ("upper", upper_want, upper_have, str.isupper),
    )

    # One bool per subtest that was run for this row.
    outcomes = []

    if RUN_AGREEMENT_TESTS:
        # The string produced by lower()/title()/upper() must be accepted by
        # the corresponding islower()/istitle()/isupper() predicate.
        for case, want, have, is_case in casemaps:
            outcomes.append(_subtest(
                is_case(have),
                "wanted <"+want+"> to be "+case+"case of <"+orig+"> but python disagrees"))

    if RUN_MAPPING_TESTS:
        # The produced string must equal the expected one from the table.
        for case, want, have, is_case in casemaps:
            outcomes.append(_subtest(
                want == have,
                "failed casemap: "+case+"case of <"+orig+"> should be <"+want+"> not <"+have+">"))

    if RUN_FOLDING_TESTS:
        # The original must match each expected form case-insensitively.
        for label, want in (("lower", lower_want), ("title", title_want),
                            ("upper", upper_want), ("fold",  fold_want)):
            outcomes.append(_subtest(
                str_matched_rx(orig, want),
                "failed casefold: <"+orig+"> should match "+label+" <"+want+">"))

        if not ONLY_SIMPLES:
            # When all three foldings coincide a single check suffices;
            # otherwise check the full folding, plus the Turkic one when it
            # differs from the full folding.
            if fold_simple == fold_turkic and fold_simple == fold_full:
                foldings = (("simple", fold_simple),)
            else:
                foldings = (("full", fold_full),)
                if fold_full != fold_turkic:
                    foldings = foldings + (("turkic", fold_turkic),)

            # These go through str_matched_rx like every other folding check,
            # so they are anchored at both ends, honour USE_MRAB_REGEX and
            # treat the table entry (not the original text) as the pattern.
            for label, want in foldings:
                outcomes.append(_subtest(
                    str_matched_rx(orig, want),
                    "failed "+label+" casefold: <"+orig+"> should match "+label+" casefold <"+want+">"))

    good = outcomes.count(True)
    bad  = len(outcomes) - good

    if bad != 0:
        print("test", testno, "failed", bad, "subtests")
    else:
        print("all subtests passed for test", testno)

    testno = testno + 1

    allgood = allgood + good
    allbad  = allbad  + bad

# Final summary: absolute counts and percentages of failed and passed subtests.
print("")

total = allgood + allbad

mask = "Total %-10s %3d / %3d (%3.0f%%)"
for label, count in (("failures:", allbad), ("successes:", allgood)):
    # No subtests at all (every RUN_* switch off, or an empty data table)
    # would otherwise divide by zero; report 0% in that case.
    percent = 100.0 * count / total if total else 0.0
    print(mask % (label, count, total, percent))

####################################################