PERF: read_csv with memory_map=True when file encoding is UTF-8 (pandas-dev#43787)

michal-gh · michal-gh · commit fdc7709467fc · 2021-10-02T21:23:50.000+02:00
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -272,6 +272,32 @@ def test_chunk_splits_multibyte_char(all_parsers):
     tm.assert_frame_equal(dfr, df)
 
 
+@skip_pyarrow
+def test_readcsv_memmap_utf8(all_parsers):
+    lines = []
+    line_length = 128
+    start_char = " "
+    end_char = "\U00010080"
+    # This for loop creates a list of 128-char strings
+    # consisting of consecutive Unicode chars
+    for lnum in range(ord(start_char), ord(end_char), line_length):
+        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
+        try:
+            line.encode("utf-8")
+        except UnicodeEncodeError:
+            # Some 16-bit words are not valid Unicode chars and must be skipped
+            continue
+        lines.append(line)
+    parser = all_parsers
+    df = DataFrame(lines)
+    with tm.ensure_clean("utf8test.csv") as fname:
+        df.to_csv(fname, index=False, header=False, encoding="utf-8")
+        dfr = parser.read_csv(
+            fname, header=None, memory_map=True, engine="c", encoding="utf-8"
+        )
+    tm.assert_frame_equal(df, dfr)
+
+
 def test_not_readable(all_parsers):
     # GH43439
     parser = all_parsers