Skip to content

Commit 3162b24

Browse files
meeseeksmachine, michal-gh, lithomas1
authored
Backport PR #43647 on branch 1.3.x (BUG: REGR: read_csv with memory_map=True raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 262143: unexpected end of data ) (#43691)
Co-authored-by: michal-gh <[email protected]>
Co-authored-by: Thomas Li <[email protected]>
1 parent 4782ec1 commit 3162b24

File tree

3 files changed

+22
-1
lines changed

3 files changed

+22
-1
lines changed

doc/source/whatsnew/v1.3.4.rst

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
2020
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2121
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
22+
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
2223
-
2324

2425
.. ---------------------------------------------------------------------------

pandas/io/common.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -859,7 +859,8 @@ def read(self, size: int = -1) -> str | bytes:
859859
if self.decode:
860860
# memory mapping is applied before compression. Encoding should
861861
# be applied to the de-compressed data.
862-
return content.decode(self.encoding, errors=self.errors)
862+
final = size == -1 or len(content) < size
863+
return self.decoder.decode(content, final=final)
863864
return content
864865

865866
def __next__(self) -> str:

pandas/tests/io/parser/test_encoding.py

+19
Original file line number | Diff line number | Diff line change
@@ -239,6 +239,25 @@ def test_encoding_memory_map(all_parsers, encoding):
239239
tm.assert_frame_equal(df, expected)
240240

241241

242+
def test_chunk_splits_multibyte_char(all_parsers):
243+
"""
244+
Chunk splits a multibyte character with memory_map=True
245+
246+
GH 43540
247+
"""
248+
parser = all_parsers
249+
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
250+
df = DataFrame(data=["a" * 127] * 2048)
251+
252+
# Put two-bytes utf-8 encoded character "ą" at the end of chunk
253+
# utf-8 encoding of "ą" is b'\xc4\x85'
254+
df.iloc[2047] = "a" * 127 + "ą"
255+
with tm.ensure_clean("bug-gh43540.csv") as fname:
256+
df.to_csv(fname, index=False, header=False, encoding="utf-8")
257+
dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c")
258+
tm.assert_frame_equal(dfr, df)
259+
260+
242261
def test_not_readable(all_parsers):
243262
# GH43439
244263
parser = all_parsers

0 commit comments

Comments
 (0)