Skip to content

Commit fdc7709

Browse files
committed
PERF: read_csv with memory_map=True when file encoding is UTF-8 (pandas-dev#43787)
1 parent 1df9c2d commit fdc7709

File tree

1 file changed

+26
-0
lines changed

1 file changed

+26
-0
lines changed

pandas/tests/io/parser/test_encoding.py

+26
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,32 @@ def test_chunk_splits_multibyte_char(all_parsers):
272272
tm.assert_frame_equal(dfr, df)
273273

274274

275+
@skip_pyarrow
def test_readcsv_memmap_utf8(all_parsers):
    """
    Round-trip UTF-8 text through ``read_csv`` with ``memory_map=True``.

    Builds one CSV row per run of ``line_length`` consecutive code points,
    from U+0020 up to (but not including) U+10080, writes the rows as UTF-8
    and checks that reading the file back through a memory map yields the
    same frame. See pandas-dev#43787.
    """
    parser = all_parsers

    line_length = 128
    first_cp = ord(" ")
    stop_cp = ord("\U00010080")

    lines = []
    for lnum in range(first_cp, stop_cp, line_length):
        # The chunk width is tied to the loop step so the rows tile the code
        # point range without gaps or overlap if ``line_length`` is changed.
        line = "".join(map(chr, range(lnum, lnum + line_length))) + "\n"
        try:
            line.encode("utf-8")
        except UnicodeEncodeError:
            # Chunks containing surrogate code points (U+D800-U+DFFF) cannot
            # be encoded as UTF-8, so they can never appear in the file.
            continue
        lines.append(line)

    df = DataFrame(lines)
    with tm.ensure_clean("utf8test.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")
        # NOTE(review): engine="c" is passed explicitly although the test is
        # parametrized over ``all_parsers`` -- confirm whether the fixture
        # overrides it, in which case the argument is redundant.
        dfr = parser.read_csv(
            fname, header=None, memory_map=True, engine="c", encoding="utf-8"
        )
    tm.assert_frame_equal(df, dfr)
299+
300+
275301
def test_not_readable(all_parsers):
276302
# GH43439
277303
parser = all_parsers

0 commit comments

Comments
 (0)