diff -r e390e60fcb20 Lib/base64.py --- a/Lib/base64.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/base64.py Sun Jun 29 03:19:04 2008 +0200 @@ -39,7 +39,7 @@ def _translate(s, altchars): return s.translate(translation) - + # Base64 encoding/decoding uses binascii def b64encode(s, altchars=None): @@ -126,7 +126,7 @@ def urlsafe_b64decode(s): return b64decode(s, b'-_') - + # Base32 encoding/decoding must be done in Python _b32alphabet = { 0: b'A', 9: b'J', 18: b'S', 27: b'3', @@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=N # characters because this will tell us how many null bytes to remove from # the end of the decoded string. padchars = 0 - mo = re.search('(?P<pad>[=]*)$', s) + mo = re.search(b'(?P<pad>[=]*)$', s) if mo: padchars = len(mo.group('pad')) if padchars > 0: @@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=N return b''.join(parts) - + # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns # lowercase. The RFC also recommends against accepting input case # insensitively. @@ -291,12 +291,12 @@ def b16decode(s, casefold=False): raise TypeError("expected bytes, not %s" % s.__class__.__name__) if casefold: s = s.upper() - if re.search('[^0-9A-F]', s): + if re.search(b'[^0-9A-F]', s): raise binascii.Error('Non-base16 digit found') return binascii.unhexlify(s) - + # Legacy interface. This code could be cleaned up since I don't believe # binascii has any line length limitations. It just doesn't seem worth it # though. The files should be opened in binary mode. @@ -353,7 +353,7 @@ def decodestring(s): return binascii.a2b_base64(s) - + # Usable as a script... def main(): """Small main program""" diff -r e390e60fcb20 Lib/encodings/idna.py --- a/Lib/encodings/idna.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/encodings/idna.py Sun Jun 29 03:19:04 2008 +0200 @@ -176,12 +176,10 @@ class Codec(codecs.Codec): return "", 0 # IDNA allows decoding to operate on Unicode strings, too. 
- if isinstance(input, bytes): - labels = dots.split(input) - else: - # Force to bytes + if not isinstance(input, bytes): + # XXX obviously wrong, see #3232 input = bytes(input) - labels = input.split(b".") + labels = input.split(b".") if labels and len(labels[-1]) == 0: trailing_dot = '.' diff -r e390e60fcb20 Lib/json/scanner.py --- a/Lib/json/scanner.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/json/scanner.py Sun Jun 29 03:19:04 2008 +0200 @@ -7,12 +7,14 @@ import sre_compile import sre_compile import sre_constants -from re import VERBOSE, MULTILINE, DOTALL +from re import VERBOSE, MULTILINE, DOTALL, UNICODE from sre_constants import BRANCH, SUBPATTERN __all__ = ['Scanner', 'pattern'] -FLAGS = (VERBOSE | MULTILINE | DOTALL) +# UNICODE must be specified explicitly as we build the Pattern object +# ourselves +FLAGS = (VERBOSE | MULTILINE | DOTALL | UNICODE) class Scanner(object): def __init__(self, lexicon, flags=FLAGS): diff -r e390e60fcb20 Lib/py_compile.py --- a/Lib/py_compile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/py_compile.py Sun Jun 29 03:19:04 2008 +0200 @@ -86,7 +86,7 @@ def read_encoding(file, default): line = f.readline() if not line: break - m = re.match(r".*\bcoding:\s*(\S+)\b", line) + m = re.match(br".*\bcoding:\s*(\S+)\b", line) if m: return m.group(1).decode("ascii") return default diff -r e390e60fcb20 Lib/re.py --- a/Lib/re.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/re.py Sun Jun 29 03:19:04 2008 +0200 @@ -294,6 +294,9 @@ class Scanner: p.append(sre_parse.SubPattern(s, [ (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), ])) + # XXX: this should be fixed properly by checking that all phrases + # are of compatible types, but does anyone care? 
+ s.flags = sre_parse.fix_flags(phrase, s.flags) s.groups = len(p)+1 p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p) diff -r e390e60fcb20 Lib/sre_parse.py --- a/Lib/sre_parse.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/sre_parse.py Sun Jun 29 03:19:04 2008 +0200 @@ -200,7 +200,7 @@ class Tokenizer: except IndexError: raise error("bogus escape (end of line)") if isinstance(self.string, bytes): - char = chr(c) + c = chr(c) char = char + c self.index = self.index + len(char) self.next = char @@ -672,9 +672,19 @@ def _parse(source, state): return subpattern +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + flags |= SRE_FLAG_UNICODE + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("can't use UNICODE flag with a bytes pattern") + return flags + def parse(str, flags=0, pattern=None): # parse 're' pattern into list of (opcode, argument) tuples + flags = fix_flags(str, flags) source = Tokenizer(str) if pattern is None: diff -r e390e60fcb20 Lib/tarfile.py --- a/Lib/tarfile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/tarfile.py Sun Jun 29 03:19:04 2008 +0200 @@ -1368,7 +1368,7 @@ class TarInfo(object): # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and # the newline. keyword and value are both UTF-8 encoded strings. 
- regex = re.compile(r"(\d+) ([^=]+)=", re.U) + regex = re.compile(br"(\d+) ([^=]+)=") pos = 0 while True: match = regex.match(buf, pos) diff -r e390e60fcb20 Lib/test/re_tests.py --- a/Lib/test/re_tests.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/re_tests.py Sun Jun 29 03:19:04 2008 +0200 @@ -661,14 +661,10 @@ 123""", SUCCEED, 'found', 'abc'), ('^([ab]*?)(?tp_as_buffer; - if (!buffer || !buffer->bf_getbuffer || + if (!buffer || !buffer->bf_getbuffer || (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); return NULL; @@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* if (PyBytes_Check(string) || bytes == size) charsize = 1; #if defined(HAVE_UNICODE) - else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) + else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) charsize = sizeof(Py_UNICODE); #endif else { @@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* *p_charsize = charsize; if (ptr == NULL) { - PyErr_SetString(PyExc_ValueError, + PyErr_SetString(PyExc_ValueError, "Buffer is NULL"); } return ptr; @@ -1753,6 +1753,17 @@ state_init(SRE_STATE* state, PatternObje ptr = getstring(string, &length, &charsize); if (!ptr) return NULL; + + if (charsize == 1 && pattern->flags & SRE_FLAG_UNICODE) { + PyErr_SetString(PyExc_TypeError, + "can't use a string pattern on a bytes-like object"); + return NULL; + } + if (charsize > 1 && !(pattern->flags & SRE_FLAG_UNICODE)) { + PyErr_SetString(PyExc_TypeError, + "can't use a bytes pattern on a string-like object"); + return NULL; + } /* adjust boundaries */ if (start < 0)