diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2742,12 +2742,21 @@ if value and value[0] != "'": token, value = get_attrtext(value) appendto.append(token) - param.lang = token.value - if not value or value[0] != "'": - raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " - "delimiter, but found {}".format(value)) - appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) - value = value[1:] + if value and value[0] == "'": + param.lang = token.value + appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) + value = value[1:] + else: + # XXX this recovers for only a small subset of the possible + # bad data. Need to extend the error recovery. + if value: + param.defects.append(errors.InvalidHeaderDefect( + "Expected RFC2231 char/lang encoding " + "delimiter, but found {}".format(value))) + else: + param.defects.append(errors.InvalidHeaderDefect( + "Expected RFC2231 char/lang encoding " + "delimiter, but found end of parameter")) if remainder is not None: # Treat the rest of value as bare quoted string content. v = Value() @@ -2774,7 +2783,7 @@ the formal RFC grammar, but it is more convenient for us for the set of parameters to be treated as its own TokenList. - This is 'parse' routine because it consumes the reminaing value, but it + This is a 'parse' routine because it consumes the remaining value, but it would never be called to parse a full header. Instead it is called to parse everything after the non-parameter value of a specific MIME header. diff --git a/Lib/email/utils.py b/Lib/email/utils.py --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -337,6 +337,10 @@ # object. We do not want bytes() normal utf-8 decoder, we want a straight # interpretation of the string as character bytes. charset, language, text = value + if charset is None: + # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse + # the value, so use the fallback_charset. + charset = fallback_charset rawbytes = bytes(text, 'raw-unicode-escape') try: return str(rawbytes, charset, errors) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4956,6 +4956,26 @@ self.assertFalse(isinstance(param, tuple)) self.assertEqual(param, "us-ascii'en-us'Frank's Document") + def test_rfc2231_missing_tick(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="'This%20is%20broken"; +''' + msg = email.message_from_string(m) + self.assertEqual( + msg.get_filename(), + "'This is broken") + + def test_rfc2231_missing_tick_with_encoded_non_ascii(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="'This%20is%E2broken"; +''' + msg = email.message_from_string(m) + self.assertEqual( + msg.get_filename(), + "'This is\ufffdbroken") + # test_headerregistry.TestContentTypeHeader.rfc2231_single_quotes_inside_quotes def test_rfc2231_no_extended_values(self): eq = self.assertEqual diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -556,6 +556,26 @@ ' name*1=" Document"\n'), ), + # Issue 17369 + 'rfc2231_only_one_leading_single_quote': ( + 'text/plain; NAME*0*="\'This%20is%20broken"', + 'text/plain', + 'text', + 'plain', + {'name': "'This is broken"}, + [errors.InvalidHeaderDefect], + 'text/plain; NAME="\'This is broken"'), + + # Issue 17369 + 'rfc2231_only_one_single_quote_with_encoded_non_ascii': ( + 'text/plain; NAME*0*="\'This%20is%E2broken"', + 'text/plain', + 'text', + 'plain', + {'name': "'This is\ufffdbroken"}, + [errors.InvalidHeaderDefect], + 'text/plain; NAME="\'This is%E2broken"'), + 'rfc2231_no_language_or_charset': ( 'text/plain; NAME*0*=english_is_the_default.html', 'text/plain',