From 68069719cf86a9e0d76d86485a24d2e1f27f8ed5 Mon Sep 17 00:00:00 2001 From: Oliver Seemann Date: Thu, 31 Aug 2023 21:35:28 +0200 Subject: [PATCH] Avoid UnicodeDecodeError for non-utf8 QueryEvents Query strings in QueryEvents that appear in the binlog stream are not necessarily utf-8 encoded, but the current implementation handles only utf-8. This commit adds the `errors="backslashreplace"` kwarg to decode(), which avoids a runtime error and inserts \xNN escape sequences in place of byte sequences that are not valid utf-8. It includes a test that generates a QueryEvent with latin-1 encoding, which fails without the fix. --- pymysqlreplication/event.py | 5 +++-- pymysqlreplication/tests/base.py | 4 ++-- pymysqlreplication/tests/test_basic.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pymysqlreplication/event.py b/pymysqlreplication/event.py index 12db2915..580d099b 100644 --- a/pymysqlreplication/event.py +++ b/pymysqlreplication/event.py @@ -324,8 +324,9 @@ def __init__(self, from_packet, event_size, table_map, ctl_connection, **kwargs) self.schema = self.packet.read(self.schema_length) self.packet.advance(1) - self.query = self.packet.read(event_size - 13 - self.status_vars_length - - self.schema_length - 1).decode("utf-8") + query = self.packet.read(event_size - 13 - self.status_vars_length + - self.schema_length - 1) + self.query = query.decode("utf-8", errors='backslashreplace') #string[EOF] query def _dump(self): diff --git a/pymysqlreplication/tests/base.py b/pymysqlreplication/tests/base.py index fd18cb3d..301ee3e9 100644 --- a/pymysqlreplication/tests/base.py +++ b/pymysqlreplication/tests/base.py @@ -18,7 +18,7 @@ class PyMySQLReplicationTestCase(base): def ignoredEvents(self): return [] - def setUp(self): + def setUp(self, charset="utf8"): # default self.database = { "host": os.environ.get("MYSQL_5_7") or "localhost", @@ -26,7 +26,7 @@ def setUp(self): "passwd": "", "port": 3306, "use_unicode": True, - "charset": "utf8", 
+ "charset": charset, "db": "pymysqlreplication_test" } diff --git a/pymysqlreplication/tests/test_basic.py b/pymysqlreplication/tests/test_basic.py index cb27dada..f31cbcfa 100644 --- a/pymysqlreplication/tests/test_basic.py +++ b/pymysqlreplication/tests/test_basic.py @@ -1190,6 +1190,24 @@ def test_rows_query_log_event(self): event = self.stream.fetchone() self.assertIsInstance(event, RowsQueryLogEvent) +class TestLatin1(base.PyMySQLReplicationTestCase): + + def setUp(self): + super().setUp(charset='latin1') + + def test_query_event_latin1(self): + """ + Ensure query events with a non-utf8 encoded query are parsed without errors. + """ + self.stream = BinLogStreamReader(self.database, server_id=1024, only_events=[QueryEvent]) + self.execute("CREATE TABLE test_latin1_ÖÆÛ (a INT)") + self.execute("COMMIT") + assert "ÖÆÛ".encode('latin-1') == b'\xd6\xc6\xdb' + + event = self.stream.fetchone() + assert event.query.startswith("CREATE TABLE test") + assert event.query == r"CREATE TABLE test_latin1_\xd6\xc6\xdb (a INT)" + if __name__ == "__main__": import unittest