diff -r e390e60fcb20 Lib/base64.py --- a/Lib/base64.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/base64.py Sun Jun 29 03:19:04 2008 +0200 @@ -39,7 +39,7 @@ def _translate(s, altchars): return s.translate(translation) - + # Base64 encoding/decoding uses binascii def b64encode(s, altchars=None): @@ -126,7 +126,7 @@ def urlsafe_b64decode(s): return b64decode(s, b'-_') - + # Base32 encoding/decoding must be done in Python _b32alphabet = { 0: b'A', 9: b'J', 18: b'S', 27: b'3', @@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=N # characters because this will tell us how many null bytes to remove from # the end of the decoded string. padchars = 0 - mo = re.search('(?P<pad>[=]*)$', s) + mo = re.search(b'(?P<pad>[=]*)$', s) if mo: padchars = len(mo.group('pad')) if padchars > 0: @@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=N return b''.join(parts) - + # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns # lowercase. The RFC also recommends against accepting input case # insensitively. @@ -291,12 +291,12 @@ def b16decode(s, casefold=False): raise TypeError("expected bytes, not %s" % s.__class__.__name__) if casefold: s = s.upper() - if re.search('[^0-9A-F]', s): + if re.search(b'[^0-9A-F]', s): raise binascii.Error('Non-base16 digit found') return binascii.unhexlify(s) - + # Legacy interface. This code could be cleaned up since I don't believe # binascii has any line length limitations. It just doesn't seem worth it # though. The files should be opened in binary mode. @@ -353,7 +353,7 @@ def decodestring(s): return binascii.a2b_base64(s) - + # Usable as a script... def main(): """Small main program""" diff -r e390e60fcb20 Lib/encodings/idna.py --- a/Lib/encodings/idna.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/encodings/idna.py Sun Jun 29 03:19:04 2008 +0200 @@ -176,12 +176,10 @@ class Codec(codecs.Codec): return "", 0 # IDNA allows decoding to operate on Unicode strings, too. 
- if isinstance(input, bytes): - labels = dots.split(input) - else: - # Force to bytes + if not isinstance(input, bytes): + # XXX obviously wrong, see #3232 input = bytes(input) - labels = input.split(b".") + labels = input.split(b".") if labels and len(labels[-1]) == 0: trailing_dot = '.' diff -r e390e60fcb20 Lib/json/scanner.py --- a/Lib/json/scanner.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/json/scanner.py Sun Jun 29 03:19:04 2008 +0200 @@ -7,12 +7,14 @@ import sre_compile import sre_compile import sre_constants -from re import VERBOSE, MULTILINE, DOTALL +from re import VERBOSE, MULTILINE, DOTALL, UNICODE from sre_constants import BRANCH, SUBPATTERN __all__ = ['Scanner', 'pattern'] -FLAGS = (VERBOSE | MULTILINE | DOTALL) +# UNICODE must be specified explicitly as we build the Pattern object +# ourselves +FLAGS = (VERBOSE | MULTILINE | DOTALL | UNICODE) class Scanner(object): def __init__(self, lexicon, flags=FLAGS): diff -r e390e60fcb20 Lib/py_compile.py --- a/Lib/py_compile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/py_compile.py Sun Jun 29 03:19:04 2008 +0200 @@ -86,7 +86,7 @@ def read_encoding(file, default): line = f.readline() if not line: break - m = re.match(r".*\bcoding:\s*(\S+)\b", line) + m = re.match(br".*\bcoding:\s*(\S+)\b", line) if m: return m.group(1).decode("ascii") return default diff -r e390e60fcb20 Lib/re.py --- a/Lib/re.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/re.py Sun Jun 29 03:19:04 2008 +0200 @@ -294,6 +294,9 @@ class Scanner: p.append(sre_parse.SubPattern(s, [ (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), ])) + # XXX: this should be fixed properly by checking that all phrases + # are of compatible types, but does anyone care? 
+ s.flags = sre_parse.fix_flags(phrase, s.flags) s.groups = len(p)+1 p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p) diff -r e390e60fcb20 Lib/sre_parse.py --- a/Lib/sre_parse.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/sre_parse.py Sun Jun 29 03:19:04 2008 +0200 @@ -200,7 +200,7 @@ class Tokenizer: except IndexError: raise error("bogus escape (end of line)") if isinstance(self.string, bytes): - char = chr(c) + c = chr(c) char = char + c self.index = self.index + len(char) self.next = char @@ -672,9 +672,19 @@ def _parse(source, state): return subpattern +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + flags |= SRE_FLAG_UNICODE + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("can't use UNICODE flag with a bytes pattern") + return flags + def parse(str, flags=0, pattern=None): # parse 're' pattern into list of (opcode, argument) tuples + flags = fix_flags(str, flags) source = Tokenizer(str) if pattern is None: diff -r e390e60fcb20 Lib/tarfile.py --- a/Lib/tarfile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/tarfile.py Sun Jun 29 03:19:04 2008 +0200 @@ -1368,7 +1368,7 @@ class TarInfo(object): # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and # the newline. keyword and value are both UTF-8 encoded strings. 
- regex = re.compile(r"(\d+) ([^=]+)=", re.U) + regex = re.compile(br"(\d+) ([^=]+)=") pos = 0 while True: match = regex.match(buf, pos) diff -r e390e60fcb20 Lib/test/re_tests.py --- a/Lib/test/re_tests.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/re_tests.py Sun Jun 29 03:19:04 2008 +0200 @@ -661,14 +661,10 @@ 123""", SUCCEED, 'found', 'abc'), ('^([ab]*?)(?tp_as_buffer; - if (!buffer || !buffer->bf_getbuffer || + if (!buffer || !buffer->bf_getbuffer || (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); return NULL; @@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* if (PyBytes_Check(string) || bytes == size) charsize = 1; #if defined(HAVE_UNICODE) - else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) + else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) charsize = sizeof(Py_UNICODE); #endif else { @@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* *p_charsize = charsize; if (ptr == NULL) { - PyErr_SetString(PyExc_ValueError, + PyErr_SetString(PyExc_ValueError, "Buffer is NULL"); } return ptr; @@ -1753,6 +1753,17 @@ state_init(SRE_STATE* state, PatternObje ptr = getstring(string, &length, &charsize); if (!ptr) return NULL; + + if (charsize == 1 && pattern->flags & SRE_FLAG_UNICODE) { + PyErr_SetString(PyExc_TypeError, + "can't use a string pattern on a bytes-like object"); + return NULL; + } + if (charsize > 1 && !(pattern->flags & SRE_FLAG_UNICODE)) { + PyErr_SetString(PyExc_TypeError, + "can't use a bytes pattern on a string-like object"); + return NULL; + } /* adjust boundaries */ if (start < 0)