diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 35058ba03ade8..153cad403dcc3 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -10,6 +10,7 @@
 from pandas import (
     Categorical,
     DataFrame,
+    concat,
     date_range,
     read_csv,
     to_datetime,
@@ -459,6 +460,34 @@ def time_read_special_date(self, value, engine):
         )
 
 
+class ReadCSVMemMapUTF8:
+
+    fname = "__test__.csv"
+    number = 5
+
+    def setup(self):
+        lines = []
+        line_length = 128
+        start_char = " "
+        end_char = "\U00010080"
+        # This for loop creates a list of 128-char strings
+        # consisting of consecutive Unicode chars
+        for lnum in range(ord(start_char), ord(end_char), line_length):
+            line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
+            try:
+                line.encode("utf-8")
+            except UnicodeEncodeError:
+                # Some 16-bit words are not valid Unicode chars and must be skipped
+                continue
+            lines.append(line)
+        df = DataFrame(lines)
+        df = concat([df for n in range(100)], ignore_index=True)
+        df.to_csv(self.fname, index=False, header=False, encoding="utf-8")
+
+    def time_read_memmapped_utf8(self):
+        read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c")
+
+
 class ParseDateComparison(StringIORewind):
     params = ([False, True],)
     param_names = ["cache_dates"]
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index dcd31abaa8857..83820ac25491d 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -364,6 +364,7 @@ Performance improvements
 - Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
 - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
+- Improved performance of :meth:`pandas.read_csv` with ``memory_map=True`` when file encoding is UTF-8 (:issue:`43787`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6dfddd571b88f..be6577e646ac3 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -874,7 +874,7 @@ def __iter__(self) -> _MMapWrapper:
     def read(self, size: int = -1) -> str | bytes:
         # CSV c-engine uses read instead of iterating
         content: bytes = self.mmap.read(size)
-        if self.decode:
+        if self.decode and self.encoding != "utf-8":
             # memory mapping is applied before compression. Encoding should
             # be applied to the de-compressed data.
             final = size == -1 or len(content) < size
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 6ca3fdf9a6258..2573314f155cf 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -272,6 +272,36 @@ def test_chunk_splits_multibyte_char(all_parsers):
     tm.assert_frame_equal(dfr, df)
 
 
+@skip_pyarrow
+def test_readcsv_memmap_utf8(all_parsers):
+    """
+    GH 43787
+
+    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
+    """
+    lines = []
+    line_length = 128
+    start_char = " "
+    end_char = "\U00010080"
+    # This for loop creates a list of 128-char strings
+    # consisting of consecutive Unicode chars
+    for lnum in range(ord(start_char), ord(end_char), line_length):
+        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
+        try:
+            line.encode("utf-8")
+        except UnicodeEncodeError:
+            continue
+        lines.append(line)
+    parser = all_parsers
+    df = DataFrame(lines)
+    with tm.ensure_clean("utf8test.csv") as fname:
+        df.to_csv(fname, index=False, header=False, encoding="utf-8")
+        dfr = parser.read_csv(
+            fname, header=None, memory_map=True, engine="c", encoding="utf-8"
+        )
+        tm.assert_frame_equal(df, dfr)
+
+
 def test_not_readable(all_parsers):
     # GH43439
     parser = all_parsers
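
The one-line change in pandas/io/common.py above carries the whole speedup: _MMapWrapper.read used to decode the memory-mapped bytes to str unconditionally, only for the C engine to encode that str back to UTF-8 bytes before tokenizing. Since the C parser consumes UTF-8 natively, the UTF-8 case can hand back the mapped bytes as-is. Below is a minimal sketch of the patched read path; the class name MMapUTF8Sketch is illustrative, and it assumes no compression layer and strict error handling (the real wrapper also forwards attribute access to the underlying mmap object):

    from __future__ import annotations

    import codecs
    import mmap


    class MMapUTF8Sketch:
        # Simplified model of _MMapWrapper.read after this patch.
        def __init__(self, f, encoding: str = "utf-8", decode: bool = True) -> None:
            self.encoding = encoding
            self.decode = decode
            self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
            self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

        def read(self, size: int = -1) -> str | bytes:
            content: bytes = self.mmap.read(size)
            if self.decode and self.encoding != "utf-8":
                # Non-UTF-8 encodings still decode incrementally; `final`
                # marks EOF so the decoder can flush a buffered partial char.
                final = size == -1 or len(content) < size
                return self.decoder.decode(content, final=final)
            # UTF-8 fast path: return raw bytes and let the C tokenizer,
            # which parses UTF-8 natively, skip the bytes -> str -> bytes
            # round-trip entirely.
            return content

Under this fast path, read_csv(fname, memory_map=True, encoding="utf-8", engine="c") pays no per-chunk decode cost, which is what the new ReadCSVMemMapUTF8 benchmark measures; the new test builds lines of consecutive code points up to U+10080 to verify that multi-byte characters still round-trip correctly through the raw-bytes path.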