Index: Python/ast.c
===================================================================
--- Python/ast.c	(revision 55970)
+++ Python/ast.c	(working copy)
@@ -3166,7 +3166,7 @@
             return NULL;
         }
     }
-    if (!*bytesmode) {
+    if (!*bytesmode && !rawmode) {
         return decode_unicode(s, len, rawmode, encoding);
     }
     if (*bytesmode) {
@@ -3180,7 +3180,7 @@
             }
         }
     }
-    need_encoding = (!*bytesmode && encoding != NULL &&
+    need_encoding = (!*bytesmode && !rawmode && encoding != NULL &&
                      strcmp(encoding, "utf-8") != 0 &&
                      strcmp(encoding, "iso-8859-1") != 0);
    if (rawmode || strchr(s, '\\') == NULL) {
Index: Parser/tokenizer.c
===================================================================
--- Parser/tokenizer.c	(revision 55970)
+++ Parser/tokenizer.c	(working copy)
@@ -1156,20 +1156,25 @@
     }
 
     /* Identifier (most frequent token!) */
+    int is_raw_str = 0;
     if (is_potential_identifier_start(c)) {
         /* Process r"", u"" and ur"" */
         switch (c) {
         case 'r':
         case 'R':
             c = tok_nextc(tok);
-            if (c == '"' || c == '\'')
+            if (c == '"' || c == '\'') {
+                is_raw_str = 1;
                 goto letter_quote;
+            }
             break;
         case 'b':
         case 'B':
             c = tok_nextc(tok);
-            if (c == 'r' || c == 'R')
+            if (c == 'r' || c == 'R') {
+                is_raw_str = 1;
                 c = tok_nextc(tok);
+            }
             if (c == '"' || c == '\'')
                 goto letter_quote;
             break;
@@ -1317,54 +1322,60 @@
  letter_quote:
     /* String */
     if (c == '\'' || c == '"') {
-        Py_ssize_t quote2 = tok->cur - tok->start + 1;
         int quote = c;
-        int triple = 0;
-        int tripcount = 0;
-        for (;;) {
+        int is_triple_quote = 0;
+        int is_null_str = 0;
+        int escaped = 0;
+        /* Find start of string */
+        c = tok_nextc(tok);
+        if (c == quote) {
             c = tok_nextc(tok);
-            if (c == '\n') {
-                if (!triple) {
-                    tok->done = E_EOLS;
-                    tok_backup(tok, c);
-                    return ERRORTOKEN;
-                }
-                tripcount = 0;
-                tok->cont_line = 1; /* multiline string. */
+            if (c == quote) {
+                is_triple_quote = 1;
+                c = tok_nextc(tok);
             }
-            else if (c == EOF) {
-                if (triple)
+            else {
+                tok_backup(tok, c);
+                is_null_str = 1;
+            }
+        }
+        /* Get rest of string */
+        while (!is_null_str) {
+            if (c == EOF) {
+                if (is_triple_quote)
                     tok->done = E_EOFS;
                 else
                     tok->done = E_EOLS;
                 tok->cur = tok->inp;
                 return ERRORTOKEN;
             }
-            else if (c == quote) {
-                tripcount++;
-                if (tok->cur - tok->start == quote2) {
+            if (is_triple_quote) {
+                /* Detect end of string */
+                if (c == quote && (is_raw_str || !escaped)) {
+                    escaped = 0;
                     c = tok_nextc(tok);
-                    if (c == quote) {
-                        triple = 1;
-                        tripcount = 0;
-                        continue;
-                    }
-                    tok_backup(tok, c);
+                    if (c != quote)
+                        continue;
+                    c = tok_nextc(tok);
+                    if (c != quote)
+                        continue;
+                    break;
                 }
-                if (!triple || tripcount == 3)
-                    break;
             }
-            else if (c == '\\') {
-                tripcount = 0;
-                c = tok_nextc(tok);
-                if (c == EOF) {
+            /* Single quote string */
+            else {
+                /* Check for an unterminated string */
+                if (c == '\n' && !escaped) {
                     tok->done = E_EOLS;
-                    tok->cur = tok->inp;
                     return ERRORTOKEN;
                 }
+                /* Detect end of string */
+                if (c == quote && (is_raw_str || !escaped))
+                    break;
             }
-            else
-                tripcount = 0;
+            /* Detect whether the next character is escaped. */
+            escaped = (c == '\\' && !escaped);
+            c = tok_nextc(tok);
         }
         *p_start = tok->start;
         *p_end = tok->cur;
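
Note: with `is_raw_str` set, the rewritten tokenizer above no longer treats a
backslash as escaping a following quote, so a raw string ends at the first
closing quote character. A hedged before/after sketch of the intended
tokenization (illustrative, not part of the patch):

    x = r'\\'    # unchanged: two characters, backslash backslash
    x = r"\'"    # unchanged: backslash followed by a single quote
    x = r'\''    # before: one string whose text is \'
                 # after: the literal ends at the second ', and the
                 #        trailing ' opens a new, unterminated string
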
Index: Lib/sgmllib.py
===================================================================
--- Lib/sgmllib.py	(revision 55970)
+++ Lib/sgmllib.py	(working copy)
@@ -33,7 +33,7 @@
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+    r'''(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?''')
 
 
 class SGMLParseError(RuntimeError):
Index: Lib/markupbase.py
===================================================================
--- Lib/markupbase.py	(revision 55970)
+++ Lib/markupbase.py	(working copy)
@@ -9,7 +9,7 @@
 import re
 
 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
-_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
+_declstringlit_match = re.compile(r'''(\'[^\']*\'|"[^"]*")\s*''').match
 _commentclose = re.compile(r'--\s*>')
 _markedsectionclose = re.compile(r']\s*]\s*>')
Index: Lib/textwrap.py
===================================================================
--- Lib/textwrap.py	(revision 55970)
+++ Lib/textwrap.py	(working copy)
@@ -75,13 +75,13 @@
     wordsep_re = re.compile(
         r'(\s+|'                                  # any whitespace
         r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
-        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
+        r'''(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))''')   # em-dash
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
     sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                  r'[\.\!\?]'          # sentence-ending punct.
-                                 r'[\"\']?'           # optional end-of-quote
+                                 r'''[\"\']?'''       # optional end-of-quote
                                  % string.lowercase)
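
Note: the stdlib edits in this patch all follow one recipe. Any raw literal
that carries a quote character inside it is rewritten with triple quotes,
because under the new rule `\'` inside `r'...'` would terminate the literal
early. The regex text itself is unchanged, only the Python quoting differs.
A hedged sketch using the markupbase pattern above, evaluated under the
pre-patch rules (not part of the patch):

    import re

    # Pre-patch spelling: relies on \' escaping the quote in a raw string.
    old = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
    # Post-patch spelling: an embedded quote can never close the literal.
    new = re.compile(r'''(\'[^\']*\'|"[^"]*")\s*''')

    assert old.pattern == new.pattern  # identical regex text
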
Index: Lib/tokenize.py
===================================================================
--- Lib/tokenize.py	(revision 55970)
+++ Lib/tokenize.py	(working copy)
@@ -65,14 +65,28 @@
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
 # Tail end of " string.
 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+
+# Tail end of ' raw string.
+rSingle = r"[^']*(?:.[^']*)*'"
+# Tail end of " raw string.
+rDouble = r'[^"]*(?:.[^"]*)*"'
+
 # Tail end of ''' string.
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
+
+# Tail end of ''' raw string.
+rSingle3 = r"[^']*(?:(?:\\.|'(?!''))[^']*)*'''"
+# Tail end of """ raw string.
+rDouble3 = r'[^"]*(?:(?:\\.|"(?!""))[^"]*)*"""'
+
+
+Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
+
 # Single-line ' or " string.
-String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
 # Because of leftmost-then-longest match semantics, be sure to put the
 # longest operators first (e.g., if = came before ==, == would get
@@ -90,40 +104,58 @@
 Token = Ignore + PlainToken
 
 # First (or only) line of ' or " string.
-ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
+ContStr = group(
+    r"[bB]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r'\\\r?\n'),
+    r'[bB]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n'),
+    r"[bB]?[rR]+'[^\n']*(?:\\.[^\n']*)*" + group("'", r'\\\r?\n'),
+    r'[bB]?[rR]+"[^\n"]*(?:\\.[^\n"]*)*' + group('"', r'\\\r?\n'))
+
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
-tokenprog, pseudoprog, single3prog, double3prog = map(
-    re.compile, (Token, PseudoToken, Single3, Double3))
-endprogs = {"'": re.compile(Single), '"': re.compile(Double),
+(tokenprog, pseudoprog,
+ singleprog, doubleprog, rsingleprog, rdoubleprog,
+ single3prog, double3prog, rsingle3prog, rdouble3prog) \
+    = map(re.compile, (Token, PseudoToken,
+                       Single, Double,
+                       rSingle, rDouble,
+                       Single3, Double3,
+                       rSingle3, rDouble3))
+
+endprogs = {"'": singleprog, '"': doubleprog,
+            "r'": rsingleprog, 'r"': rdoubleprog,
+            "b'": singleprog, 'b"': doubleprog,
+            "br'": rsingleprog, 'br"': rdoubleprog,
+            "R'": rsingleprog, 'R"': rdoubleprog,
+            "B'": singleprog, 'B"': doubleprog,
+            "bR'": rsingleprog, 'bR"': rdoubleprog,
+            "Br'": rsingleprog, 'Br"': rdoubleprog,
+            "BR'": rsingleprog, 'BR"': rdoubleprog,
             "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            'r': None, 'R': None, 'u': None, 'U': None}
+            "r'''": rsingle3prog, 'r"""': rdouble3prog,
+            "b'''": single3prog, 'b"""': double3prog,
+            "br'''": rsingle3prog, 'br"""': rdouble3prog,
+            "R'''": rsingle3prog, 'R"""': rdouble3prog,
+            "B'''": single3prog, 'B"""': double3prog,
+            "bR'''": rsingle3prog, 'bR"""': rdouble3prog,
+            "Br'''": rsingle3prog, 'Br"""': rdouble3prog,
+            "BR'''": rsingle3prog, 'BR"""': rdouble3prog,
+            'r': None, 'R': None, 'b': None, 'B': None}
 
 triple_quoted = {}
 for t in ("'''", '"""',
           "r'''", 'r"""', "R'''", 'R"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""'):
+          "b'''", 'b"""', "B'''", 'B"""',
+          "br'''", 'br"""', "Br'''", 'Br"""',
+          "bR'''", 'bR"""', "BR'''", 'BR"""'):
     triple_quoted[t] = t
 single_quoted = {}
 for t in ("'", '"',
           "r'", 'r"', "R'", 'R"',
-          "u'", 'u"', "U'", 'U"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"' ):
+          "b'", 'b"', "B'", 'B"',
+          "br'", 'br"', "Br'", 'Br"',
+          "bR'", 'bR"', "BR'", 'BR"' ):
     single_quoted[t] = t
 
 tabsize = 8
Index: Lib/distutils/util.py
===================================================================
--- Lib/distutils/util.py	(revision 55970)
+++ Lib/distutils/util.py	(working copy)
@@ -260,7 +260,7 @@
 _wordchars_re = _squote_re = _dquote_re = None
 def _init_regex():
     global _wordchars_re, _squote_re, _dquote_re
-    _wordchars_re = re.compile(r'[^\\\'\"%s ]*' % string.whitespace)
+    _wordchars_re = re.compile(r'''[^\\\'\"%s ]*''' % string.whitespace)
     _squote_re = re.compile(r"'(?:[^'\\]|\\.)*'")
     _dquote_re = re.compile(r'"(?:[^"\\]|\\.)*"')
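
Note: in the new `endprogs` table above, the tail pattern is selected by the
exact prefix-plus-quote text, and the raw variants skip backslash-escape
handling. A hedged sketch of the lookup (hypothetical driver code, simplified
from tokenize's main loop; `initial` and `quote` are illustrative locals, not
names from the patch):

    initial, quote = 'bR', "'"
    endprog = endprogs[initial + quote]   # -> rsingleprog, the raw tail
    endprog = endprogs["b'"]              # -> singleprog, \. still escapes
    # Bare one-letter prefixes map to None so a lone r/R/b/B with no
    # following quote still tokenizes as a NAME:
    assert endprogs['r'] is None
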
-"""HTTP cookie handling for web clients. +r"""HTTP cookie handling for web clients. This module has (now fairly distant) origins in Gisle Aas' Perl module HTTP::Cookies, from the libwww-perl library. @@ -319,7 +319,7 @@ return match.string[:start]+match.string[end:] HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") -HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +HEADER_QUOTED_VALUE_RE = re.compile(r'''^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"''') HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") HEADER_ESCAPE_RE = re.compile(r"\\(.)") def split_header_words(header_values): @@ -407,7 +407,7 @@ if pairs: result.append(pairs) return result -HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") +HEADER_JOIN_ESCAPE_RE = re.compile(r'([\"\\])') def join_header_words(lists): """Do the inverse (almost) of the conversion done by split_header_words. @@ -1204,7 +1204,7 @@ """ non_word_re = re.compile(r"\W") - quote_re = re.compile(r"([\"\\])") + quote_re = re.compile(r'([\"\\])') strict_domain_re = re.compile(r"\.?[^.]*") domain_re = re.compile(r"[^.]*") dots_re = re.compile(r"^\.+") Index: Lib/pydoc.py =================================================================== --- Lib/pydoc.py (revision 55970) +++ Lib/pydoc.py (working copy) @@ -389,7 +389,7 @@ # Backslashes are only literal in the string and are never # needed to make any special characters, so show a raw string. return 'r' + testrepr[0] + self.escape(test) + testrepr[0] - return re.sub(r'((\\[\\abfnrtv\'"]|\\[0-9]..|\\x..|\\u....)+)', + return re.sub(r'''((\\[\\abfnrtv\'"]|\\[0-9]..|\\x..|\\u....)+)''', r'\1', self.escape(testrepr)) Index: Lib/doctest.py =================================================================== --- Lib/doctest.py (revision 55970) +++ Lib/doctest.py (working copy) @@ -638,7 +638,7 @@ # "#doctest:". Eliminating these false positives would require # actually parsing the string; but we limit them by ignoring any # line containing "#doctest:" that is *followed* by a quote mark. - _OPTION_DIRECTIVE_RE = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', + _OPTION_DIRECTIVE_RE = re.compile(r'''#\s*doctest:\s*([^\n\'"]*)$''', re.MULTILINE) def _find_options(self, source, name, lineno): Index: Lib/test/test_tokenize.py =================================================================== --- Lib/test/test_tokenize.py (revision 55970) +++ Lib/test/test_tokenize.py (working copy) @@ -194,6 +194,9 @@ print(' test_main still working, be patient...', file=sys.__stdout__) sys.__stdout__.flush() + # print the file name since it's a random selection and it will help + # diagnose any problems. + print(' ' + f, file=sys.__stdout__) test_roundtrip(f) # Test detecton of IndentationError. 
Index: Lib/test/tokenize_tests.txt
===================================================================
--- Lib/test/tokenize_tests.txt	(revision 55970)
+++ Lib/test/tokenize_tests.txt	(working copy)
@@ -101,7 +101,7 @@
 the \'lazy\' dog.\n\
 ';
 x = r'\\' + R'\\'
-x = r'\'' + ''
+x = r"\'" + ''
 y = r'''
 foo bar \\
 baz''' + R'''
@@ -110,19 +110,19 @@
 bar \\ baz
 """ + R'''spam
 '''
-x = u'abc' + U'ABC'
-y = u"abc" + U"ABC"
-x = ur'abc' + Ur'ABC' + uR'ABC' + UR'ABC'
-y = ur"abc" + Ur"ABC" + uR"ABC" + UR"ABC"
-x = ur'\\' + UR'\\'
-x = ur'\'' + ''
-y = ur'''
+x = b'abc' + B'ABC'
+y = b"abc" + B"ABC"
+x = br'abc' + Br'ABC' + bR'ABC' + BR'ABC'
+y = br"abc" + Br"ABC" + bR"ABC" + BR"ABC"
+x = br'\\' + BR'\\'
+x = br"\'" + ''
+y = br'''
 foo bar \\
-baz''' + UR'''
+baz''' + BR'''
 foo'''
-y = Ur"""foo
+y = Br"""foo
 bar \\ baz
-""" + uR'''spam
+""" + bR'''spam
 '''
 
 # Indentation
Index: Lib/test/output/test_tokenize
===================================================================
--- Lib/test/output/test_tokenize	(revision 55970)
+++ Lib/test/output/test_tokenize	(working copy)
@@ -324,7 +324,7 @@
 103,17-103,18: NEWLINE '\n'
 104,0-104,1: NAME 'x'
 104,2-104,3: OP '='
-104,4-104,9: STRING "r'\\''"
+104,4-104,9: STRING 'r"\\\'"'
 104,10-104,11: OP '+'
 104,12-104,14: STRING "''"
 104,14-104,15: NEWLINE '\n'
@@ -342,59 +342,59 @@
 112,3-112,4: NEWLINE '\n'
 113,0-113,1: NAME 'x'
 113,2-113,3: OP '='
-113,4-113,10: STRING "u'abc'"
+113,4-113,10: STRING "b'abc'"
 113,11-113,12: OP '+'
-113,13-113,19: STRING "U'ABC'"
+113,13-113,19: STRING "B'ABC'"
 113,19-113,20: NEWLINE '\n'
 114,0-114,1: NAME 'y'
 114,2-114,3: OP '='
-114,4-114,10: STRING 'u"abc"'
+114,4-114,10: STRING 'b"abc"'
 114,11-114,12: OP '+'
-114,13-114,19: STRING 'U"ABC"'
+114,13-114,19: STRING 'B"ABC"'
 114,19-114,20: NEWLINE '\n'
 115,0-115,1: NAME 'x'
 115,2-115,3: OP '='
-115,4-115,11: STRING "ur'abc'"
+115,4-115,11: STRING "br'abc'"
 115,12-115,13: OP '+'
-115,14-115,21: STRING "Ur'ABC'"
+115,14-115,21: STRING "Br'ABC'"
 115,22-115,23: OP '+'
-115,24-115,31: STRING "uR'ABC'"
+115,24-115,31: STRING "bR'ABC'"
 115,32-115,33: OP '+'
-115,34-115,41: STRING "UR'ABC'"
+115,34-115,41: STRING "BR'ABC'"
 115,41-115,42: NEWLINE '\n'
 116,0-116,1: NAME 'y'
 116,2-116,3: OP '='
-116,4-116,11: STRING 'ur"abc"'
+116,4-116,11: STRING 'br"abc"'
 116,12-116,13: OP '+'
-116,14-116,21: STRING 'Ur"ABC"'
+116,14-116,21: STRING 'Br"ABC"'
 116,22-116,23: OP '+'
-116,24-116,31: STRING 'uR"ABC"'
+116,24-116,31: STRING 'bR"ABC"'
 116,32-116,33: OP '+'
-116,34-116,41: STRING 'UR"ABC"'
+116,34-116,41: STRING 'BR"ABC"'
 116,41-116,42: NEWLINE '\n'
 117,0-117,1: NAME 'x'
 117,2-117,3: OP '='
-117,4-117,10: STRING "ur'\\\\'"
+117,4-117,10: STRING "br'\\\\'"
 117,11-117,12: OP '+'
-117,13-117,19: STRING "UR'\\\\'"
+117,13-117,19: STRING "BR'\\\\'"
 117,19-117,20: NEWLINE '\n'
 118,0-118,1: NAME 'x'
 118,2-118,3: OP '='
-118,4-118,10: STRING "ur'\\''"
+118,4-118,10: STRING 'br"\\\'"'
 118,11-118,12: OP '+'
 118,13-118,15: STRING "''"
 118,15-118,16: NEWLINE '\n'
 119,0-119,1: NAME 'y'
 119,2-119,3: OP '='
-119,4-121,6: STRING "ur'''\nfoo bar \\\\\nbaz'''"
+119,4-121,6: STRING "br'''\nfoo bar \\\\\nbaz'''"
 121,7-121,8: OP '+'
-121,9-122,6: STRING "UR'''\nfoo'''"
+121,9-122,6: STRING "BR'''\nfoo'''"
 122,6-122,7: NEWLINE '\n'
 123,0-123,1: NAME 'y'
 123,2-123,3: OP '='
-123,4-125,3: STRING 'Ur"""foo\nbar \\\\ baz\n"""'
+123,4-125,3: STRING 'Br"""foo\nbar \\\\ baz\n"""'
 125,4-125,5: OP '+'
-125,6-126,3: STRING "uR'''spam\n'''"
+125,6-126,3: STRING "bR'''spam\n'''"
 126,3-126,4: NEWLINE '\n'
 127,0-127,1: NL '\n'
 128,0-128,13: COMMENT '# Indentation'
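
Note: the expected-output file above appears to be the tokenizer's printed
trace of tokenize_tests.txt. It can be regenerated along these lines (a
hedged sketch, assuming the 2.x-style `tokenize.tokenize(readline)` default
printer that emits `row,col-row,col: TYPE TOKEN` lines is still present on
this branch):

    import tokenize
    with open('Lib/test/tokenize_tests.txt') as f:
        tokenize.tokenize(f.readline)   # prints one line per token
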
Index: Lib/xml/etree/ElementTree.py
===================================================================
--- Lib/xml/etree/ElementTree.py	(revision 55970)
+++ Lib/xml/etree/ElementTree.py	(working copy)
@@ -728,7 +728,7 @@
         return s  # 1.5.2: assume the string uses the right encoding
 
 if sys.version[:3] == "1.5":
-    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
+    _escape = re.compile(r'[&<>\"\x80-\xff]+') # 1.5.2
 else:
     _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
Index: Lib/HTMLParser.py
===================================================================
--- Lib/HTMLParser.py	(revision 55970)
+++ Lib/HTMLParser.py	(working copy)
@@ -26,7 +26,7 @@
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+    r'''(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?''')
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
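
Note: a quick smoke test for the HTMLParser rewrite above. The converted
`attrfind` should match attributes exactly as before, since only the
literal's quoting changed (hypothetical usage, not part of the patch):

    import HTMLParser
    m = HTMLParser.attrfind.match(' href="index.html"')
    assert m.group(1) == 'href'            # attribute name
    assert m.group(3) == '"index.html"'    # quoted attribute value
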