Skip to content

Commit 8c201e2

Browse files
authored
PERF: read_csv with memory_map=True when file encoding is UTF-8 (#43787) (#43787)
1 parent 6599834 commit 8c201e2

File tree

4 files changed

+61
-1
lines changed

4 files changed

+61
-1
lines changed

asv_bench/benchmarks/io/csv.py

+29
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas import (
1111
Categorical,
1212
DataFrame,
13+
concat,
1314
date_range,
1415
read_csv,
1516
to_datetime,
@@ -459,6 +460,34 @@ def time_read_special_date(self, value, engine):
459460
)
460461

461462

463+
class ReadCSVMemMapUTF8:
    """
    Benchmark ``read_csv`` with ``memory_map=True`` on a UTF-8 encoded file.

    ``setup`` writes the CSV file that ``time_read_memmapped_utf8`` reads
    back, and ``teardown`` removes it again so no file is left behind.
    """

    fname = "__test__.csv"
    number = 5
    # How many times the generated rows are repeated in the CSV file
    n_copies = 100

    def setup(self):
        """Write a single-column UTF-8 CSV made of consecutive Unicode chars."""
        line_length = 128
        start_char = " "
        end_char = "\U00010080"
        lines = []
        # Each row holds ``line_length`` consecutive code points plus a newline
        for lnum in range(ord(start_char), ord(end_char), line_length):
            line = "".join(map(chr, range(lnum, lnum + line_length))) + "\n"
            try:
                line.encode("utf-8")
            except UnicodeEncodeError:
                # Some 16-bit words are not valid Unicode chars and must be skipped
                continue
            lines.append(line)
        df = DataFrame(lines)
        df = concat([df] * self.n_copies, ignore_index=True)
        df.to_csv(self.fname, index=False, header=False, encoding="utf-8")

    def time_read_memmapped_utf8(self):
        """Read the generated file through a memory map with the C engine."""
        read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c")

    def teardown(self):
        """Delete the CSV file written by ``setup``, if it exists."""
        # Local import: the file's top-level import block is not fully visible here
        import os

        try:
            os.remove(self.fname)
        except FileNotFoundError:
            pass
489+
490+
462491
class ParseDateComparison(StringIORewind):
463492
params = ([False, True],)
464493
param_names = ["cache_dates"]

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ Performance improvements
364364
- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
365365
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
366366
- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
367+
- Improved performance of :func:`pandas.read_csv` with ``memory_map=True`` when the file encoding is UTF-8 (:issue:`43787`)
367368
-
368369

369370
.. ---------------------------------------------------------------------------

pandas/io/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -874,7 +874,7 @@ def __iter__(self) -> _MMapWrapper:
874874
def read(self, size: int = -1) -> str | bytes:
875875
# CSV c-engine uses read instead of iterating
876876
content: bytes = self.mmap.read(size)
877-
if self.decode:
877+
if self.decode and self.encoding != "utf-8":
878878
# memory mapping is applied before compression. Encoding should
879879
# be applied to the de-compressed data.
880880
final = size == -1 or len(content) < size

pandas/tests/io/parser/test_encoding.py

+30
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,36 @@ def test_chunk_splits_multibyte_char(all_parsers):
272272
tm.assert_frame_equal(dfr, df)
273273

274274

275+
@skip_pyarrow
def test_readcsv_memmap_utf8(all_parsers):
    """
    GH 43787

    Round-trip a UTF-8 encoded CSV through ``read_csv`` with
    ``memory_map=True`` and check that the frame read back equals the
    frame that was written.
    """
    parser = all_parsers
    chars_per_row = 128
    first_code_point = ord(" ")
    stop_code_point = ord("\U00010080")

    # Build rows of ``chars_per_row`` consecutive Unicode chars each
    rows = []
    for base in range(first_code_point, stop_code_point, chars_per_row):
        row = "".join(map(chr, range(base, base + chars_per_row))) + "\n"
        try:
            row.encode("utf-8")
        except UnicodeEncodeError:
            # This row cannot be encoded as UTF-8, so leave it out
            continue
        rows.append(row)

    expected = DataFrame(rows)
    with tm.ensure_clean("utf8test.csv") as path:
        expected.to_csv(path, index=False, header=False, encoding="utf-8")
        result = parser.read_csv(
            path, header=None, memory_map=True, engine="c", encoding="utf-8"
        )
    tm.assert_frame_equal(expected, result)
303+
304+
275305
def test_not_readable(all_parsers):
276306
# GH43439
277307
parser = all_parsers

0 commit comments

Comments
 (0)