From c2e91f86315a7b59abcee09aacf0fd2f8c2b04fa Mon Sep 17 00:00:00 2001 From: michal Date: Sat, 18 Sep 2021 20:07:01 +0200 Subject: [PATCH 1/5] Fixes GH43540 --- pandas/io/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index ba1cc82bfea56..628828466e757 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes: if self.decode: # memory mapping is applied before compression. Encoding should # be applied to the de-compressed data. - return content.decode(self.encoding, errors=self.errors) + final: bool = len(content) == 0 + return self.decoder.decode(content, final=final) return content def __next__(self) -> str: From a900ef9e3a57b4d027775a26a43f63dfff870dad Mon Sep 17 00:00:00 2001 From: michal Date: Sat, 18 Sep 2021 20:07:01 +0200 Subject: [PATCH 2/5] Revert "Fixes GH43540" This reverts commit c2e91f86315a7b59abcee09aacf0fd2f8c2b04fa. --- pandas/io/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 628828466e757..ba1cc82bfea56 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -877,8 +877,7 @@ def read(self, size: int = -1) -> str | bytes: if self.decode: # memory mapping is applied before compression. Encoding should # be applied to the de-compressed data. - final: bool = len(content) == 0 - return self.decoder.decode(content, final=final) + return content.decode(self.encoding, errors=self.errors) return content def __next__(self) -> str: From 7eaa1263b9d3435c111ed8708cbebc03d43aeec3 Mon Sep 17 00:00:00 2001 From: michal Date: Tue, 21 Sep 2021 19:19:20 +0200 Subject: [PATCH 3/5] Fix GH43540 --- doc/source/whatsnew/v1.3.4.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 87b08fae52c15..e6475c75532e5 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`) - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`) - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`) +- Fixed regression in :meth:`pandas.read_csv` raising UnicodeDecodeError exception when memory_map=True (:issue:`43540`) - .. --------------------------------------------------------------------------- From 61f13b8bd84ba2e5b72d66377f529a8474b4d657 Mon Sep 17 00:00:00 2001 From: michal Date: Tue, 21 Sep 2021 19:48:33 +0200 Subject: [PATCH 4/5] Fix GH43540 --- doc/source/whatsnew/v1.3.4.rst | 2 +- pandas/io/common.py | 3 ++- pandas/tests/io/parser/test_encoding.py | 26 +++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index e6475c75532e5..6212f2c6f3399 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -19,7 +19,7 @@ Fixed regressions - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`) - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`) - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`) -- Fixed regression in :meth:`pandas.read_csv` raising UnicodeDecodeError exception when memory_map=True (:issue:`43540`) +- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index ba1cc82bfea56..6dfddd571b88f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes: if self.decode: # memory mapping is applied before compression. Encoding should # be applied to the de-compressed data. - return content.decode(self.encoding, errors=self.errors) + final = size == -1 or len(content) < size + return self.decoder.decode(content, final=final) return content def __next__(self) -> str: diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 680c437f7087e..b269679bddeca 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -252,6 +252,32 @@ def test_encoding_memory_map(all_parsers, encoding): tm.assert_frame_equal(df, expected) +@skip_pyarrow +def test_chunk_splits_multibyte_char(all_parsers): + """ + Chunk splits a multibyte character with memory_map=True + + GH 43540 + """ + parser = all_parsers + # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx + df = DataFrame(data=["a" * 127] * 2048) + + # Put two-bytes utf-8 encoded character "ą" at the end of chunk + # utf-8 encoding of "ą" is b'\xc4\x85' + df.iloc[2047] = "a" * 127 + "ą" + with tm.ensure_clean("bug-gh43540.csv") as fname: + df.to_csv( + fname, + index=False, + header=False, + encoding="utf-8", + engine="c", + ) + dfr = parser.read_csv(fname, header=None, memory_map=True) + tm.assert_frame_equal(dfr, df) + + def test_not_readable(all_parsers): # GH43439 parser = all_parsers From 3a262778b3c6759d23e61c47385e3b36c1d66cb8 Mon Sep 17 00:00:00 2001 From: michal Date: Tue, 21 Sep 2021 20:17:07 +0200 Subject: [PATCH 5/5] Fix GH43540 --- pandas/tests/io/parser/test_encoding.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index b269679bddeca..6ca3fdf9a6258 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -267,14 +267,8 @@ def test_chunk_splits_multibyte_char(all_parsers): # utf-8 encoding of "ą" is b'\xc4\x85' df.iloc[2047] = "a" * 127 + "ą" with tm.ensure_clean("bug-gh43540.csv") as fname: - df.to_csv( - fname, - index=False, - header=False, - encoding="utf-8", - engine="c", - ) - dfr = parser.read_csv(fname, header=None, memory_map=True) + df.to_csv(fname, index=False, header=False, encoding="utf-8") + dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c") tm.assert_frame_equal(dfr, df)