BUG: REGR: read_csv with memory_map=True raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 262143: unexpected end of data (#43647)

michal-gh · web-flow · commit 5a369d61cd9a · 2021-09-21T17:37:41.000-04:00
diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
 - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
 - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
+- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes:
         if self.decode:
             # memory mapping is applied before compression. Encoding should
             # be applied to the de-compressed data.
-            return content.decode(self.encoding, errors=self.errors)
+            final = size == -1 or len(content) < size
+            return self.decoder.decode(content, final=final)
         return content
 
     def __next__(self) -> str:
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -252,6 +252,26 @@ def test_encoding_memory_map(all_parsers, encoding):
     tm.assert_frame_equal(df, expected)
 
 
+@skip_pyarrow
+def test_chunk_splits_multibyte_char(all_parsers):
+    """
+    Chunk splits a multibyte character with memory_map=True
+
+    GH 43540
+    """
+    parser = all_parsers
+    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
+    df = DataFrame(data=["a" * 127] * 2048)
+
+    # Put the two-byte utf-8 character "ą" (b'\xc4\x85') across the chunk boundary:
+    # b'\xc4' is byte 262143 (last of chunk 1), assuming "\n" line endings - confirm
+    df.iloc[2047] = "a" * 127 + "ą"
+    with tm.ensure_clean("bug-gh43540.csv") as fname:
+        df.to_csv(fname, index=False, header=False, encoding="utf-8")
+        dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c")
+    tm.assert_frame_equal(dfr, df)
+
+
 def test_not_readable(all_parsers):
     # GH43439
     parser = all_parsers

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ Fixed regressions`
`19`	`19`	- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
`20`	`20`	- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
`21`	`21`	- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
	`22`	+- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
`22`	`23`	`-`
`23`	`24`
`24`	`25`	`.. ---------------------------------------------------------------------------`