PERF: read_csv with memory_map=True when file encoding is UTF-8 (#43787) (#43787)

michal-gh · web-flow · commit 8c201e28661f · 2021-10-05T21:02:47.000-04:00
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -10,6 +10,7 @@
 from pandas import (
     Categorical,
     DataFrame,
+    concat,
     date_range,
     read_csv,
     to_datetime,
@@ -459,6 +460,34 @@ def time_read_special_date(self, value, engine):
         )
 
 
+class ReadCSVMemMapUTF8:
+
+    fname = "__test__.csv"
+    number = 5
+
+    def setup(self):
+        lines = []
+        line_length = 128
+        start_char = " "
+        end_char = "\U00010080"
+        # This for loop creates a list of 128-char strings
+        # consisting of consecutive Unicode chars
+        for lnum in range(ord(start_char), ord(end_char), line_length):
+            line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
+            try:
+                line.encode("utf-8")
+            except UnicodeEncodeError:
+                # Some 16-bit words are not valid Unicode chars and must be skipped
+                continue
+            lines.append(line)
+        df = DataFrame(lines)
+        df = concat([df for n in range(100)], ignore_index=True)
+        df.to_csv(self.fname, index=False, header=False, encoding="utf-8")
+
+    def time_read_memmapped_utf8(self):
+        read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c")
+
+
 class ParseDateComparison(StringIORewind):
     params = ([False, True],)
     param_names = ["cache_dates"]
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -364,6 +364,7 @@ Performance improvements
 - Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
 - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
+- Improved performance of :meth:`pandas.read_csv` with ``memory_map=True`` when file encoding is UTF-8 (:issue:`43787`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -874,7 +874,7 @@ def __iter__(self) -> _MMapWrapper:
     def read(self, size: int = -1) -> str | bytes:
         # CSV c-engine uses read instead of iterating
         content: bytes = self.mmap.read(size)
-        if self.decode:
+        if self.decode and self.encoding != "utf-8":
             # memory mapping is applied before compression. Encoding should
             # be applied to the de-compressed data.
             final = size == -1 or len(content) < size
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -272,6 +272,36 @@ def test_chunk_splits_multibyte_char(all_parsers):
     tm.assert_frame_equal(dfr, df)
 
 
+@skip_pyarrow
+def test_readcsv_memmap_utf8(all_parsers):
+    """
+    GH 43787
+
+    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
+    """
+    lines = []
+    line_length = 128
+    start_char = " "
+    end_char = "\U00010080"
+    # This for loop creates a list of 128-char strings
+    # consisting of consecutive Unicode chars
+    for lnum in range(ord(start_char), ord(end_char), line_length):
+        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
+        try:
+            line.encode("utf-8")
+        except UnicodeEncodeError:
+            continue
+        lines.append(line)
+    parser = all_parsers
+    df = DataFrame(lines)
+    with tm.ensure_clean("utf8test.csv") as fname:
+        df.to_csv(fname, index=False, header=False, encoding="utf-8")
+        dfr = parser.read_csv(
+            fname, header=None, memory_map=True, engine="c", encoding="utf-8"
+        )
+    tm.assert_frame_equal(df, dfr)
+
+
 def test_not_readable(all_parsers):
     # GH43439
     parser = all_parsers

Original file line number	Diff line number	Diff line change
`@@ -364,6 +364,7 @@ Performance improvements`
`364`	`364`	- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
`365`	`365`	- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
`366`	`366`	- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
	`367`	+- Improved performance of :meth:`pandas.read_csv` with ``memory_map=True`` when file encoding is UTF-8 (:issue:`43787`)
`367`	`368`	`-`
`368`	`369`
`369`	`370`	`.. ---------------------------------------------------------------------------`