#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    http://dev.w3.org/html5/spec/entities.json

Written by Ezio Melotti.

Command line behaviour:

    (no option)  print the differences between html.entities.html5 and
                 the dict built from the downloaded definitions
    --create     print the source code of the new html5 dict
    --patch      rewrite the html5 dict inside Lib/html/entities.py
                 (path relative to the current directory)

If both --create and --patch are given, --create wins.

"""
import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

entities_url = 'http://dev.w3.org/html5/spec/entities.json'


def get_json(url):
    """Download the json file from the url and return a decoded object.

    The response body is decoded as UTF-8 before being parsed.
    """
    with urlopen(url) as f:
        data = f.read().decode('utf-8')
    return json.loads(data)


def create_dict(entities):
    """Create the html5 dict from the decoded json object.

    *entities* maps names such as '&amp;' to a mapping that has a
    'characters' item.  The returned dict maps each name, with its
    leading '&' removed, to that 'characters' value.
    """
    return {name.lstrip('&'): value['characters']
            for name, value in entities.items()}


def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences.

    Added, removed and modified entities are reported in three separate
    sections, each sorted by name.  A section is printed only when it
    is not empty.
    """
    added = new.keys() - old.keys()
    if added:
        print('{} entitie(s) have been added:'.format(len(added)))
        for name in sorted(added):
            print(' {!r}: {!r}'.format(name, new[name]))
    removed = old.keys() - new.keys()
    if removed:
        print('{} entitie(s) have been removed:'.format(len(removed)))
        for name in sorted(removed):
            print(' {!r}: {!r}'.format(name, old[name]))
    changed = {(name, old[name], new[name])
               for name in old.keys() & new.keys()
               if old[name] != new[name]}
    if changed:
        print('{} entitie(s) have been modified:'.format(len(changed)))
        for item in sorted(changed):
            print(' {!r}: {!r} -> {!r}'.format(*item))


def _format_items(entities):
    """Yield one 'name: value,' source line (no newline) per entity."""
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # To do this we first sort in a case-sensitive way (so all the
    # uppercase chars come first) and then sort with key=str.lower.
    # Since the sorting is stable the uppercase keys will eventually
    # come before their equivalent lowercase version.
    keys = sorted(entities.keys())
    keys = sorted(keys, key=str.lower)
    for name in keys:
        # {!a} keeps the generated source pure ASCII.
        # NOTE(review): the indentation inside this literal is copied
        # from a whitespace-mangled paste -- confirm that it matches the
        # entry indentation used in Lib/html/entities.py.
        yield ' {!r}: {!a},'.format(name, entities[name])


def _patch_lines(lines, entities):
    """Yield *lines* with the body of the html5 dict literal regenerated.

    Everything from a line starting with 'html5 = {' up to (but not
    including) the next line starting with '}' is replaced by the
    entries built from *entities*; every other line, including the
    closing '}' line, is passed through unchanged.
    """
    skip = False
    for line in lines:
        if line.startswith('html5 = {'):
            yield 'html5 = {\n'
            for item in _format_items(entities):
                yield item + '\n'
            skip = True
            continue
        if skip:
            if not line.startswith('}'):
                # still inside the old dict body: drop the line
                continue
            skip = False
        yield line


def main(argv):
    """Download the entity definitions and act on the options in *argv*."""
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        print('html5 = {')
        for item in _format_items(new_html5):
            print(item)
        print('}')
    elif '--patch' in argv:
        target = 'Lib/html/entities.py'
        scratch = 'Lib/html/entities1.py'
        # Python source files are UTF-8 by default, so do not depend on
        # the locale encoding when reading or writing them.
        with open(target, encoding='utf-8') as f1, \
                open(scratch, 'w', encoding='utf-8') as f2:
            f2.writelines(_patch_lines(f1, new_html5))
        # os.replace() overwrites the destination in a single step, so
        # there is no moment at which the target file does not exist.
        os.replace(scratch, target)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {} --create" to create the new dict and '
                  'copy the output in Lib/html/entities.py.'.format(__file__))


if __name__ == '__main__':
    main(sys.argv)