PERF: read_csv with memory_map=True when file encoding is UTF-8 (#43787)

michal-gh · michal-gh · commit 000fcabb4aa4 · 2021-10-02T21:28:22.000+02:00
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -275,17 +275,11 @@ def test_chunk_splits_multibyte_char(all_parsers):
 @skip_pyarrow
 def test_readcsv_memmap_utf8(all_parsers):
     lines = []
-    line_length = 128
-    start_char = " "
-    end_char = "\U00010080"
-    # This for loop creates a list of 128-char strings
-    # consisting of consecutive Unicode chars
-    for lnum in range(ord(start_char), ord(end_char), line_length):
+    for lnum in range(0x20, 0x10080, 0x80):
         line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
         try:
             line.encode("utf-8")
         except UnicodeEncodeError:
-            # Some 16-bit words are not valid Unicode chars and must be skipped
             continue
         lines.append(line)
     parser = all_parsers