When using unicode characters inside of JSON strings, values retrieved via the JSON_EXTRACT SQLite function fail to be decoded by the sqlite3 driver if they include four-byte unicode characters.
Version information for my build, which is Fedora 30:
Python 3.7.4 (default, Jul 9 2019, 16:32:37)
[GCC 9.1.1 20190503 (Red Hat 9.1.1-1)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import sqlite3
>>> sqlite3.version
'2.6.0'
>>> sqlite3.sqlite_version
'3.26.0'
Demo as follows:
import json
import sqlite3
# two unicode strings, the second one has four byte character in it
good_data = "réve illé"
bad_data = "réve🐍 illé"
# create simple json structures
good_data_json = json.dumps({"foo": good_data})
bad_data_json = json.dumps({"foo": bad_data})
# all strings are valid utf-8
# data round trips correctly through json
assert json.loads(good_data_json.encode("utf-8").decode("utf-8")) == {
"foo": good_data
}
assert json.loads(bad_data_json.encode("utf-8").decode("utf-8")) == {
"foo": bad_data
}
conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute("CREATE TABLE some_data (id INT, data JSON)")
cursor.executemany(
"INSERT INTO some_data(id, data) VALUES(?, ?)",
[(1, good_data_json), (2, bad_data_json)],
)
# we can retrieve the JSON objects as a whole from the DB, no issue
cursor.execute("SELECT some_data.data FROM some_data ORDER BY id")
assert cursor.fetchall() == [(good_data_json, ), (bad_data_json, )]
# when we use JSON_EXTRACT, then full utf-8 support is lost
# extract good value from JSON object
cursor.execute("""
SELECT JSON_EXTRACT(some_data.data, '$."foo"')
FROM some_data WHERE id=1
""")
assert cursor.fetchone()[0] == good_data
# extract bad value from JSON object; utf-8 failure
# sqlite3.OperationalError: Could not decode to UTF-8 column
# 'JSON_EXTRACT(some_data.data, '$."foo"')' with text 'r��ve������ ill��'
cursor.execute("""
SELECT JSON_EXTRACT(some_data.data, '$."foo"')
FROM some_data WHERE id=2
""")
assert cursor.fetchone()[0] == bad_data
output:
Traceback (most recent call last):
File "test4.py", line 50, in <module>
""")
sqlite3.OperationalError: Could not decode to UTF-8 column 'JSON_EXTRACT(some_data.data, '$."foo"')' with text 'r��ve������ ill��'
surprising to say the least as the SQLite driver has always been completely solid with all unicode, but there you go.
|