Attached please find a script which takes on the order of 1 minute to parse even though the embedded message is reasonably trivial. The main flaw is that the Content-Type: header has a long string of redundant ;;;; which is something some spammers apparently use to bypass some content filters or analyzers.
The script is short enough to inline here for what I hope is convenience for most visitors.
from email import message_from_bytes
#from email.parser import BytesParser
from email.policy import default
##from cProfile import run as cprun
b = (b'''From: me <self@example.com>
To: you <recipient@example.net>
Subject: sample with ;;;;;;;;; stuffing in MIME Content-Type: header
Content-type: text/plain''' + b';' * 54 + b'\n' +
36 * (b' ' + b';' * 990 + b'\n') + b'''\
Content-transfer-encoding: 7bit
MIME-Version: 1.0
Hello.
''')
## cprun('message_from_bytes(b, policy=default)', sort=1)
m = message_from_bytes(b, policy=default)
#m = BytesParser(policy=default).parsebytes(b)
print(m.as_string())
I have commented out two sets of changes; the ones with a single # demonstrate that the same error happens with BytesParser, and the ones with ## are for profiling the code.
For what it's worth, profiling consistently indicates that it gets stuck in builtins.sum while attempting to parse the message. Here is a partial cProfile result from Python 3.7.2:
2148205 function calls (2004560 primitive calls) in 34.533 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
160/4 32.356 0.202 32.728 8.182 {built-in method builtins.sum}
4 0.560 0.140 1.661 0.415 _header_value_parser.py:2380(parse_mime_parameters)
142772 0.230 0.000 0.230 0.000 {method 'format' of 'str' objects}
142932 0.203 0.000 0.203 0.000 _header_value_parser.py:866(all_defects)
142772 0.197 0.000 0.288 0.000 errors.py:84(__init__)
143248/24 0.167 0.000 32.727 1.364 _header_value_parser.py:126(<genexpr>)
142772 0.151 0.000 0.456 0.000 _header_value_parser.py:2126(get_attribute)
285848 0.146 0.000 0.146 0.000 _header_value_parser.py:109(__init__)
142772 0.102 0.000 0.631 0.000 _header_value_parser.py:2241(get_parameter)
142772 0.091 0.000 0.091 0.000 errors.py:36(__init__)
143076 0.088 0.000 0.153 0.000 _header_value_parser.py:854(__new__)
143080 0.065 0.000 0.065 0.000 {built-in method __new__ of type object at 0x101c20640}
12 0.044 0.004 0.086 0.007 _header_value_parser.py:716(params)
285839 0.042 0.000 0.042 0.000 {method 'endswith' of 'str' objects}
3 0.030 0.010 27.512 9.171 message.py:588(get_content_maintype)
286482 0.029 0.000 0.029 0.000 {method 'append' of 'list' objects}
2 0.014 0.007 0.014 0.007 <frozen importlib._bootstrap_external>:914(get_data)
6 0.008 0.001 25.369 4.228 feedparser.py:218(_parsegen)
4 0.001 0.000 0.001 0.000 {method 'split' of 're.Pattern' objects}
160/4 0.001 0.000 32.728 8.182 _header_value_parser.py:124(all_defects)
288 0.001 0.000 0.002 0.000 _header_value_parser.py:1000(get_fws)
288 0.001 0.000 0.003 0.000 _header_value_parser.py:1217(get_cfws)
Starting the code and doing a KeyboardInterrupt after a few, or many, seconds tends to get a traceback like this, which also points to roughly the same culprit:
^CTraceback (most recent call last):
File "repro.py", line 18, in <module>
m = message_from_bytes(b, policy=default)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/__init__.py", line 46, in message_from_bytes
return BytesParser(*args, **kws).parsebytes(s)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/parser.py", line 124, in parsebytes
return self.parser.parsestr(text, headersonly)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/parser.py", line 68, in parsestr
return self.parse(StringIO(text), headersonly=headersonly)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/parser.py", line 57, in parse
feedparser.feed(data)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/feedparser.py", line 176, in feed
self._call_parse()
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/feedparser.py", line 180, in _call_parse
self._parse()
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/feedparser.py", line 256, in _parsegen
if self._cur.get_content_type() == 'message/delivery-status':
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/message.py", line 578, in get_content_type
value = self.get('content-type', missing)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/message.py", line 471, in get
return self.policy.header_fetch_parse(k, v)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/policy.py", line 162, in header_fetch_parse
return self.header_factory(name, value)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/headerregistry.py", line 589, in __call__
return self[name](name, value)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/headerregistry.py", line 197, in __new__
cls.parse(value, kwds)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/headerregistry.py", line 448, in parse
kwds['defects'].extend(parse_tree.all_defects)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/_header_value_parser.py", line 126, in all_defects
return sum((x.all_defects for x in self), self.defects)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/_header_value_parser.py", line 126, in <genexpr>
return sum((x.all_defects for x in self), self.defects)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/_header_value_parser.py", line 126, in all_defects
return sum((x.all_defects for x in self), self.defects)
File "/Users/eriker/.pyenv/versions/3.7.2/lib/python3.7/email/_header_value_parser.py", line 126, in <genexpr>
return sum((x.all_defects for x in self), self.defects)
KeyboardInterrupt
I have been able to repro on Python 3.9 and 3.10rc as well by way of the official Docker images rc-buster and 3.9-slim.
|
python -mtrace -t repro.py reveals a long string of these:
--- modulename: errors, funcname: __init__
errors.py(85): super().__init__(*args, **kw)
--- modulename: errors, funcname: __init__
errors.py(37): if line is not None:
errors.py(38): super().__init__(line)
errors.py(39): self.line = line
_header_value_parser.py(2417): if value and value[0] != ';':
_header_value_parser.py(2426): if value:
_header_value_parser.py(2428): mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
--- modulename: _header_value_parser, funcname: __new__
_header_value_parser.py(855): self = super().__new__(cls, value)
_header_value_parser.py(856): self.token_type = token_type
_header_value_parser.py(857): self.defects = []
_header_value_parser.py(858): return self
_header_value_parser.py(2429): value = value[1:]
_header_value_parser.py(2394): while value:
_header_value_parser.py(2395): try:
_header_value_parser.py(2396): token, value = get_parameter(value)
--- modulename: _header_value_parser, funcname: get_parameter
_header_value_parser.py(2252): param = Parameter()
--- modulename: _header_value_parser, funcname: __init__
_header_value_parser.py(110): super().__init__(*args, **kw)
_header_value_parser.py(111): self.defects = []
_header_value_parser.py(2253): token, value = get_attribute(value)
--- modulename: _header_value_parser, funcname: get_attribute
_header_value_parser.py(2135): attribute = Attribute()
--- modulename: _header_value_parser, funcname: __init__
_header_value_parser.py(110): super().__init__(*args, **kw)
_header_value_parser.py(111): self.defects = []
_header_value_parser.py(2136): if value and value[0] in CFWS_LEADER:
_header_value_parser.py(2139): if value and value[0] in ATTRIBUTE_ENDS:
_header_value_parser.py(2140): raise errors.HeaderParseError(
_header_value_parser.py(2141): "expected token but found '{}'".format(value))
_header_value_parser.py(2398): except errors.HeaderParseError as err:
_header_value_parser.py(2399): leader = None
_header_value_parser.py(2400): if value[0] in CFWS_LEADER:
_header_value_parser.py(2402): if not value:
_header_value_parser.py(2405): if value[0] == ';':
_header_value_parser.py(2406): if leader is not None:
_header_value_parser.py(2408): mime_parameters.defects.append(errors.InvalidHeaderDefect(
_header_value_parser.py(2409): "parameter entry with no content"))
... which would make sense as such, except at the end, the list of defects is empty. And, of course, it should not take so long.
|