Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(4)

Side by Side Diff: Lib/html/parser.py

Issue 21047: html.parser.HTMLParser: convert_charrefs should become True by default
Patch Set: Created 5 years, 10 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Doc/whatsnew/3.5.rst ('k') | Lib/test/test_htmlparser.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 """A parser for HTML and XHTML.""" 1 """A parser for HTML and XHTML."""
2 2
3 # This file is based on sgmllib.py, but the API is slightly different. 3 # This file is based on sgmllib.py, but the API is slightly different.
4 4
5 # XXX There should be a way to distinguish between PCDATA (parsed 5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character 6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special) 7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). 8 # and CDATA (character data -- only end tags are special).
9 9
10 10
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after
116 True the character references are converted automatically to the 116 True the character references are converted automatically to the
117 corresponding Unicode character (and self.handle_data() is no 117 corresponding Unicode character (and self.handle_data() is no
118 longer split in chunks), otherwise they are passed by calling 118 longer split in chunks), otherwise they are passed by calling
119 self.handle_entityref() or self.handle_charref() with the string 119 self.handle_entityref() or self.handle_charref() with the string
120 containing respectively the named or numeric reference as the 120 containing respectively the named or numeric reference as the
121 argument. 121 argument.
122 """ 122 """
123 123
124 CDATA_CONTENT_ELEMENTS = ("script", "style") 124 CDATA_CONTENT_ELEMENTS = ("script", "style")
125 125
126 def __init__(self, strict=_default_sentinel, *, 126 def __init__(self, strict=_default_sentinel, *, convert_charrefs=True):
127 convert_charrefs=_default_sentinel):
128 """Initialize and reset this instance. 127 """Initialize and reset this instance.
129 128
130 If convert_charrefs is True (default: False), all character references 129 If convert_charrefs is True (the default), all character references
131 are automatically converted to the corresponding Unicode characters. 130 are automatically converted to the corresponding Unicode characters.
132 If strict is set to False (the default) the parser will parse invalid 131 If strict is set to False (the default) the parser will parse invalid
133 markup, otherwise it will raise an error. Note that the strict mode 132 markup, otherwise it will raise an error. Note that the strict mode
134 and argument are deprecated. 133 and argument are deprecated.
135 """ 134 """
136 if strict is not _default_sentinel: 135 if strict is not _default_sentinel:
137 warnings.warn("The strict argument and mode are deprecated.", 136 warnings.warn("The strict argument and mode are deprecated.",
138 DeprecationWarning, stacklevel=2) 137 DeprecationWarning, stacklevel=2)
139 else: 138 else:
140 strict = False # default 139 strict = False # default
141 self.strict = strict 140 self.strict = strict
142 if convert_charrefs is _default_sentinel:
143 convert_charrefs = False # default
144 warnings.warn("The value of convert_charrefs will become True in "
145 "3.5. You are encouraged to set the value explicitly." ,
146 DeprecationWarning, stacklevel=2)
147 self.convert_charrefs = convert_charrefs 141 self.convert_charrefs = convert_charrefs
148 self.reset() 142 self.reset()
149 143
150 def reset(self): 144 def reset(self):
151 """Reset this instance. Loses all unprocessed data.""" 145 """Reset this instance. Loses all unprocessed data."""
152 self.rawdata = '' 146 self.rawdata = ''
153 self.lasttag = '???' 147 self.lasttag = '???'
154 self.interesting = interesting_normal 148 self.interesting = interesting_normal
155 self.cdata_elem = None 149 self.cdata_elem = None
156 _markupbase.ParserBase.reset(self) 150 _markupbase.ParserBase.reset(self)
(...skipping 384 matching lines...) Expand 10 before | Expand all | Expand 10 after
541 def unknown_decl(self, data): 535 def unknown_decl(self, data):
542 if self.strict: 536 if self.strict:
543 self.error("unknown declaration: %r" % (data,)) 537 self.error("unknown declaration: %r" % (data,))
544 538
545 # Internal -- helper to remove special character quoting 539 # Internal -- helper to remove special character quoting
546 def unescape(self, s): 540 def unescape(self, s):
547 warnings.warn('The unescape method is deprecated and will be removed ' 541 warnings.warn('The unescape method is deprecated and will be removed '
548 'in 3.5, use html.unescape() instead.', 542 'in 3.5, use html.unescape() instead.',
549 DeprecationWarning, stacklevel=2) 543 DeprecationWarning, stacklevel=2)
550 return unescape(s) 544 return unescape(s)
OLD | NEW
« no previous file with comments | « Doc/whatsnew/3.5.rst ('k') | Lib/test/test_htmlparser.py » ('j') | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+