def htmlcharrefreplace_errors(exception):
    """Codec error handler that emits HTML character references.

    Implements the 'htmlcharrefreplace' error handling (encoding only).
    Every character of the unencodable span
    ``exception.object[exception.start:exception.end]`` is replaced by a
    named reference (``&name;``) when its code point is a key of
    ``html.entities.codepoint2name``, and by a decimal numeric reference
    (``&#N;``) otherwise.

    The handler is meant to be registered by the caller, e.g.
    ``codecs.register_error('htmlcharrefreplace', htmlcharrefreplace_errors)``.

    Args:
        exception: The error object handed over by the codec machinery.

    Returns:
        A ``(replacement, position)`` tuple: the replacement text and the
        index in the input at which encoding resumes (``exception.end``).

    Raises:
        The ``exception`` argument itself, unchanged, when it is not a
        ``UnicodeEncodeError`` (this handler cannot be used for decoding
        or translating).
    """
    if not isinstance(exception, UnicodeEncodeError):
        raise exception
    unencodable = exception.object[exception.start:exception.end]
    # Prefer the readable named entity; fall back to the decimal numeric
    # form for code points that have no name in the table.
    replacement = ''.join(
        '&%s;' % _codepoint2name[code] if code in _codepoint2name
        else '&#%d;' % code
        for code in map(ord, unencodable))
    return replacement, exception.end
""" +import codecs import html import unittest from test.support import run_unittest @@ -99,6 +100,14 @@ 'ÉricÉric&alphacentauriαcentauri') check('&co;', '&co;') + def test_htmlcharrefreplace(self): + codecs.register_error('htmlcharrefreplace', + html.htmlcharrefreplace_errors) + self.assertEqual( + '[$\xa5\u20a3\u20ac\U0001d56b]'.encode('latin1', + 'htmlcharrefreplace'), + b'[$\xa5₣€𝕫]') + if __name__ == '__main__': unittest.main()