#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Name: create-unicodedata-dicts-prop-value-alias.py
# Goal: Create dictionaries for unicodedata package containing property value
#       aliases in terms of abbreviated names and long names.
# Fixes: http://bugs.python.org/issue16684
# Date: 2012-12-23
# Author: Pander
# License: Python license
#
# Usage: run as a script; the generated dictionaries are written to stdout.
# The code runs unchanged on Python 2 and Python 3.

import contextlib

try:
    import urllib2  # Python 2
    urlopen = urllib2.urlopen
except ImportError:
    from urllib.request import urlopen  # Python 3

ALIASES_URL = 'http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt'

# (section header line in PropertyValueAliases.txt, name of the generated
# dict), listed in the order the dicts are printed.
SECTIONS = (
    ('# Bidi_Class (bc)', 'bcdict'),
    ('# General_Category (gc)', 'gcdict'),
    ('# East_Asian_Width (ea)', 'eawdict'),
)


def fetch_lines(url=ALIASES_URL):
    """Download *url* and return its content as a list of text lines.

    The connection is always closed, even if reading fails.
    """
    # closing() is used because the Python 2 response object is not a
    # context manager on its own.
    with contextlib.closing(urlopen(url)) as response:
        data = response.read()
    # Python 3 returns bytes; on Python 2 the data is already a str.
    if not isinstance(data, str):
        data = data.decode('utf-8')
    return data.splitlines()


def parse_aliases(lines):
    """Collect abbreviated -> long value names for every entry of SECTIONS.

    *lines* is any iterable of text lines.  A section starts at its header
    line and ends at the first blank line that follows at least one parsed
    entry (the blank line between header and entries is skipped).

    Returns a dict mapping each generated dict name ('bcdict', 'gcdict',
    'eawdict') to a dict of {abbreviated name: long name}.
    """
    headers = dict(SECTIONS)
    tables = dict((name, {}) for _header, name in SECTIONS)
    current = None  # dict being filled, or None when outside a section

    for raw in lines:
        line = raw.strip()

        # Header test comes first: headers are themselves '#' lines.
        if line in headers:
            current = tables[headers[line]]
            continue
        if current is None:
            continue

        if not line:
            if current:
                # Blank line after entries: passed all lines of the section.
                current = None
            continue

        # Full-line comments inside a section carry no alias data; indexing
        # or storing their fields would crash or add a bogus entry.
        if line.startswith('#'):
            continue

        # Drop a trailing comment, then split "prop ; abbr ; long [; ...]".
        fields = [field.strip() for field in line.split('#', 1)[0].split(';')]
        if len(fields) < 3:
            continue  # malformed line: nothing usable
        current[fields[1]] = fields[2]

    return tables


def format_dict(name, aliases):
    """Return Python source text for a dict literal called *name*.

    Entries are sorted by key so the output is reproducible, and keys are
    padded to the longest key so that all colons line up.
    """
    width = max(len(abbr) for abbr in aliases) if aliases else 0
    out = ['%s = {' % name]
    for abbr in sorted(aliases):
        padding = ' ' * (width - len(abbr))
        out.append(" '%s'%s: '%s'," % (abbr, padding, aliases[abbr]))
    out.append('}')
    return '\n'.join(out)


def main():
    """Fetch the alias file and print the three dictionaries to stdout."""
    tables = parse_aliases(fetch_lines())
    for _header, name in SECTIONS:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(format_dict(name, tables[name]))
        print('')


if __name__ == '__main__':
    main()