#!/usr/bin/env python2.7 # -*- coding: UTF-8 -*- from __future__ import print_function from __future__ import unicode_literals import re import regex import sys import os if not (("PYTHONIOENCODING" in os.environ) and re.search("^utf-?8$", os.environ["PYTHONIOENCODING"], re.I)): sys.stderr.write(sys.argv[0] + ": Please set your PYTHONIOENCODING envariable to utf8\n") sys.exit(1) import unicodedata if unicodedata.unidata_version < "6.0.0": print("WARNING: Your old UCD is out of date, expected 6.0.0 but got", unicodedata.unidata_version) wide_enough = (sys.maxunicode > 65536) if not wide_enough: print("WARNING: Narrow build detected, your Python lacks full Unicode support!!") pass_data = [ "po\N{LATIN SMALL LETTER LONG S}t", "caf\N{LATIN SMALL LETTER E WITH ACUTE}", "cafe\N{COMBINING ACUTE ACCENT}", # marks are word chars!! "\N{CIRCLED LATIN CAPITAL LETTER K}", # Other_Uppercase, GC=So "\N{COMBINING GREEK YPOGEGRAMMENI}", # Other_Lowercase "\N{HEBREW POINT SHEVA}", "\N{MATHEMATICAL DOUBLE-STRUCK DIGIT ZERO}", # astral digit # wide digits "123", "\N{ROMAN NUMERAL TWELVE}", # GC=Nl "\N{RUNIC TVIMADUR SYMBOL}", # GC=Nl "\N{GOTHIC LETTER NINETY}", # astral and GC=Nl "𝔘𝔫𝔦𝔠𝔬𝔡𝔢", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", # astral "\u0526\u0527", # Unicode 6 # all connectors are valid word chars "under_score", "connector‿punctuation", # lc/tc/uc "ᾲ_στο_διάολο", "Ὰͅ_Στο_Διάολο", "ᾺΙ_ΣΤΟ_ΔΙΆΟΛΟ", # go goth "ATTA_UNSAR_ÞU_IN_HIMINAM", "𐌰𐍄𐍄𐌰‿𐌿𐌽𐍃𐌰𐍂‿𐌸𐌿‿𐌹𐌽‿𐌷𐌹𐌼𐌹𐌽𐌰𐌼", ] fail_data = [ # these are all GC=No, not GC=Nd or GC=Nl, so *not* word chars per UTS#18 RL1.2a: "¹²³", "₁₂₃", "¼½¾", "\N{PARENTHESIZED DIGIT THREE}", ] re_pass, re_fail = 0, 0 regex_pass, regex_fail = 0, 0 for str in pass_data: if re.search(r'^\w+$', str, re.IGNORECASE + re.UNICODE): print("pass lib re found all alphanumeric string", str) re_pass = re_pass + 1 else: print("FAIL lib re found non alphanumeric string", str) re_fail = re_fail + 1 if regex.search(r'^\w+$', str, regex.IGNORECASE + regex.UNICODE): print("pass lib regex found non alphanumeric string", str) regex_pass = regex_pass + 1 else: print("FAIL lib regex found non alphanumeric string", str) regex_fail = regex_fail + 1 print("") for str in fail_data: if not re.search(r'^\w+$', str, re.IGNORECASE + re.UNICODE): print("pass lib re found non alphanumeric string", str) re_pass = re_pass + 1 else: print("FAIL lib re found all alphanumeric string", str) re_fail = re_fail + 1 if not regex.search(r'^\w+$', str, regex.IGNORECASE + regex.UNICODE): print("pass lib regex found non alphanumeric string", str) regex_pass = regex_pass + 1 else: print("FAIL lib regex found all alphanumeric string", str) regex_fail = regex_fail + 1 print("") re_total = re_pass + re_fail regex_total = regex_pass + regex_fail print("") print("re lib passed", re_pass, "of", re_total, "tests") print("regex lib passed", regex_pass, "of", regex_total, "tests") ########################################################## # WARNING: Narrow build detected, your Python lacks full Unicode support!! # pass lib re found all alphanumeric string poſt # pass lib regex found non alphanumeric string poſt # # pass lib re found all alphanumeric string café # pass lib regex found non alphanumeric string café # # FAIL lib re found non alphanumeric string café # pass lib regex found non alphanumeric string café # # FAIL lib re found non alphanumeric string Ⓚ # pass lib regex found non alphanumeric string Ⓚ # # FAIL lib re found non alphanumeric string ͅ # pass lib regex found non alphanumeric string ͅ # # FAIL lib re found non alphanumeric string ְ # pass lib regex found non alphanumeric string ְ # # FAIL lib re found non alphanumeric string 𝟘 # FAIL lib regex found non alphanumeric string 𝟘 # # pass lib re found all alphanumeric string 123 # pass lib regex found non alphanumeric string 123 # # pass lib re found all alphanumeric string Ⅻ # pass lib regex found non alphanumeric string Ⅻ # # pass lib re found all alphanumeric string ᛯ # pass lib regex found non alphanumeric string ᛯ # # FAIL lib re found non alphanumeric string 𐍁 # FAIL lib regex found non alphanumeric string 𐍁 # # FAIL lib re found non alphanumeric string 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 # FAIL lib regex found non alphanumeric string 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 # # FAIL lib re found non alphanumeric string 𐐔𐐯𐑅𐐨𐑉𐐯𐐻 # FAIL lib regex found non alphanumeric string 𐐔𐐯𐑅𐐨𐑉𐐯𐐻 # # pass lib re found all alphanumeric string Ԧԧ # pass lib regex found non alphanumeric string Ԧԧ # # pass lib re found all alphanumeric string under_score # pass lib regex found non alphanumeric string under_score # # FAIL lib re found non alphanumeric string connector‿punctuation # pass lib regex found non alphanumeric string connector‿punctuation # # pass lib re found all alphanumeric string ᾲ_στο_διάολο # pass lib regex found non alphanumeric string ᾲ_στο_διάολο # # FAIL lib re found non alphanumeric string Ὰͅ_Στο_Διάολο # pass lib regex found non alphanumeric string Ὰͅ_Στο_Διάολο # # pass lib re found all alphanumeric string ᾺΙ_ΣΤΟ_ΔΙΆΟΛΟ # pass lib regex found non alphanumeric string ᾺΙ_ΣΤΟ_ΔΙΆΟΛΟ # # pass lib re found all alphanumeric string ATTA_UNSAR_ÞU_IN_HIMINAM # pass lib regex found non alphanumeric string ATTA_UNSAR_ÞU_IN_HIMINAM # # FAIL lib re found non alphanumeric string 𐌰𐍄𐍄𐌰‿𐌿𐌽𐍃𐌰𐍂‿𐌸𐌿‿𐌹𐌽‿𐌷𐌹𐌼𐌹𐌽𐌰𐌼 # FAIL lib regex found non alphanumeric string 𐌰𐍄𐍄𐌰‿𐌿𐌽𐍃𐌰𐍂‿𐌸𐌿‿𐌹𐌽‿𐌷𐌹𐌼𐌹𐌽𐌰𐌼 # # FAIL lib re found all alphanumeric string ¹²³ # pass lib regex found non alphanumeric string ¹²³ # # FAIL lib re found all alphanumeric string ₁₂₃ # pass lib regex found non alphanumeric string ₁₂₃ # # FAIL lib re found all alphanumeric string ¼½¾ # pass lib regex found non alphanumeric string ¼½¾ # # FAIL lib re found all alphanumeric string ⑶ # pass lib regex found non alphanumeric string ⑶ # # # re lib passed 10 of 25 tests # regex lib passed 20 of 25 tests