Skip to content

Commit 5a369d6

Browse files
authored
BUG: REGR: read_csv with memory_map=True raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 262143: unexpected end of data (#43647)
1 parent ef97dd4 commit 5a369d6

File tree

3 files changed: 23 additions (+23) and 1 deletion (-1)

doc/source/whatsnew/v1.3.4.rst

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
2020
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2121
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
22+
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
2223
-
2324

2425
.. ---------------------------------------------------------------------------

pandas/io/common.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes:
877877
if self.decode:
878878
# memory mapping is applied before compression. Encoding should
879879
# be applied to the de-compressed data.
880-
return content.decode(self.encoding, errors=self.errors)
880+
final = size == -1 or len(content) < size
881+
return self.decoder.decode(content, final=final)
881882
return content
882883

883884
def __next__(self) -> str:

pandas/tests/io/parser/test_encoding.py

+20
Original file line number | Diff line number | Diff line change
@@ -252,6 +252,26 @@ def test_encoding_memory_map(all_parsers, encoding):
252252
tm.assert_frame_equal(df, expected)
253253

254254

255+
@skip_pyarrow
256+
def test_chunk_splits_multibyte_char(all_parsers):
257+
"""
258+
Chunk splits a multibyte character with memory_map=True
259+
260+
GH 43540
261+
"""
262+
parser = all_parsers
263+
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
264+
df = DataFrame(data=["a" * 127] * 2048)
265+
266+
# Put two-bytes utf-8 encoded character "ą" at the end of chunk
267+
# utf-8 encoding of "ą" is b'\xc4\x85'
268+
df.iloc[2047] = "a" * 127 + "ą"
269+
with tm.ensure_clean("bug-gh43540.csv") as fname:
270+
df.to_csv(fname, index=False, header=False, encoding="utf-8")
271+
dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c")
272+
tm.assert_frame_equal(dfr, df)
273+
274+
255275
def test_not_readable(all_parsers):
256276
# GH43439
257277
parser = all_parsers

0 commit comments

Comments
 (0)