Skip to content

Commit 9292530

Browse files
authored
CLN: Simplify mmap code (#47175)
1 parent 8e94afb commit 9292530

File tree

3 files changed

+47
-121
lines changed

3 files changed

+47
-121
lines changed

pandas/io/common.py

+22-100
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
)
88
import bz2
99
import codecs
10-
from collections import abc
1110
import dataclasses
1211
import functools
1312
import gzip
@@ -103,7 +102,6 @@ class IOHandles(Generic[AnyStr]):
103102
compression: CompressionDict
104103
created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list)
105104
is_wrapped: bool = False
106-
is_mmap: bool = False
107105

108106
def close(self) -> None:
109107
"""
@@ -687,14 +685,7 @@ def get_handle(
687685

688686
# memory mapping needs to be the first step
689687
# only used for read_csv
690-
handle, memory_map, handles = _maybe_memory_map(
691-
handle,
692-
memory_map,
693-
ioargs.encoding,
694-
ioargs.mode,
695-
errors,
696-
ioargs.compression["method"] not in _supported_compressions,
697-
)
688+
handle, memory_map, handles = _maybe_memory_map(handle, memory_map)
698689

699690
is_path = isinstance(handle, str)
700691
compression_args = dict(ioargs.compression)
@@ -841,12 +832,19 @@ def get_handle(
841832
handle,
842833
encoding=ioargs.encoding,
843834
)
844-
elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
835+
elif is_text and (
836+
compression or memory_map or _is_binary_mode(handle, ioargs.mode)
837+
):
838+
if (
839+
not hasattr(handle, "readable")
840+
or not hasattr(handle, "writable")
841+
or not hasattr(handle, "seekable")
842+
):
843+
handle = _IOWrapper(handle)
844+
# error: Argument 1 to "TextIOWrapper" has incompatible type
845+
# "_IOWrapper"; expected "IO[bytes]"
845846
handle = TextIOWrapper(
846-
# error: Argument 1 to "TextIOWrapper" has incompatible type
847-
# "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
848-
# expected "IO[bytes]"
849-
_IOWrapper(handle), # type: ignore[arg-type]
847+
handle, # type: ignore[arg-type]
850848
encoding=ioargs.encoding,
851849
errors=errors,
852850
newline="",
@@ -877,7 +875,6 @@ def get_handle(
877875
# "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
878876
created_handles=handles, # type: ignore[arg-type]
879877
is_wrapped=is_wrapped,
880-
is_mmap=memory_map,
881878
compression=ioargs.compression,
882879
)
883880

@@ -1001,75 +998,6 @@ def write_to_buffer(self) -> None:
1001998
self.buffer.writestr(archive_name, self.getvalue())
1002999

10031000

1004-
class _CSVMMapWrapper(abc.Iterator):
1005-
"""
1006-
Wrapper for the Python's mmap class so that it can be properly read in
1007-
by Python's csv.reader class.
1008-
1009-
Parameters
1010-
----------
1011-
f : file object
1012-
File object to be mapped onto memory. Must support the 'fileno'
1013-
method or have an equivalent attribute
1014-
1015-
"""
1016-
1017-
def __init__(
1018-
self,
1019-
f: ReadBuffer[bytes],
1020-
encoding: str = "utf-8",
1021-
errors: str = "strict",
1022-
decode: bool = True,
1023-
) -> None:
1024-
self.encoding = encoding
1025-
self.errors = errors
1026-
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
1027-
self.decode = decode
1028-
1029-
# needed for compression libraries and TextIOWrapper
1030-
self.attributes = {}
1031-
for attribute in ("seekable", "readable"):
1032-
if not hasattr(f, attribute):
1033-
continue
1034-
self.attributes[attribute] = getattr(f, attribute)()
1035-
1036-
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
1037-
1038-
def __getattr__(self, name: str):
1039-
if name in self.attributes:
1040-
return lambda: self.attributes[name]
1041-
return getattr(self.mmap, name)
1042-
1043-
def __iter__(self) -> _CSVMMapWrapper:
1044-
return self
1045-
1046-
def read(self, size: int = -1) -> str | bytes:
1047-
# CSV c-engine uses read instead of iterating
1048-
content: bytes = self.mmap.read(size)
1049-
if self.decode and self.encoding != "utf-8":
1050-
# memory mapping is applied before compression. Encoding should
1051-
# be applied to the de-compressed data.
1052-
final = size == -1 or len(content) < size
1053-
return self.decoder.decode(content, final=final)
1054-
return content
1055-
1056-
def __next__(self) -> str:
1057-
newbytes = self.mmap.readline()
1058-
1059-
# readline returns bytes, not str, but Python's CSV reader
1060-
# expects str, so convert the output to str before continuing
1061-
newline = self.decoder.decode(newbytes)
1062-
1063-
# mmap doesn't raise if reading past the allocated
1064-
# data but instead returns an empty string, so raise
1065-
# if that is returned
1066-
if newline == "":
1067-
raise StopIteration
1068-
1069-
# IncrementalDecoder seems to push newline to the next line
1070-
return newline.lstrip("\n")
1071-
1072-
10731001
class _IOWrapper:
10741002
# TextIOWrapper is overly strict: it requests that the buffer has seekable, readable,
10751003
# and writable. If we have a read-only buffer, we shouldn't need writable and vice
@@ -1131,12 +1059,7 @@ def read(self, n: int | None = -1) -> bytes:
11311059

11321060

11331061
def _maybe_memory_map(
1134-
handle: str | BaseBuffer,
1135-
memory_map: bool,
1136-
encoding: str,
1137-
mode: str,
1138-
errors: str,
1139-
decode: bool,
1062+
handle: str | BaseBuffer, memory_map: bool
11401063
) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
11411064
"""Try to memory map file/buffer."""
11421065
handles: list[BaseBuffer] = []
@@ -1149,22 +1072,21 @@ def _maybe_memory_map(
11491072
handle = open(handle, "rb")
11501073
handles.append(handle)
11511074

1152-
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
1153-
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
11541075
try:
1155-
# open mmap, adds *-able, and convert to string
1156-
wrapped = cast(
1157-
BaseBuffer,
1158-
_CSVMMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
1076+
# open mmap and add *-able
1077+
# error: Argument 1 to "_IOWrapper" has incompatible type "mmap";
1078+
# expected "BaseBuffer"
1079+
wrapped = _IOWrapper(
1080+
mmap.mmap(
1081+
handle.fileno(), 0, access=mmap.ACCESS_READ # type: ignore[arg-type]
1082+
)
11591083
)
11601084
finally:
11611085
for handle in reversed(handles):
11621086
# error: "BaseBuffer" has no attribute "close"
11631087
handle.close() # type: ignore[attr-defined]
1164-
handles = []
1165-
handles.append(wrapped)
11661088

1167-
return wrapped, memory_map, handles
1089+
return wrapped, memory_map, [wrapped]
11681090

11691091

11701092
def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool:

pandas/io/parsers/c_parser_wrapper.py

+12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from collections import defaultdict
4+
from io import TextIOWrapper
45
from typing import (
56
Hashable,
67
Mapping,
@@ -62,6 +63,17 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
6263
# Have to pass int, would break tests using TextReader directly otherwise :(
6364
kwds["on_bad_lines"] = self.on_bad_lines.value
6465

66+
# c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
67+
# policy is the same as the one given to read_csv
68+
if (
69+
isinstance(src, TextIOWrapper)
70+
and src.encoding == "utf-8"
71+
and (src.errors or "strict") == kwds["encoding_errors"]
72+
):
73+
# error: Incompatible types in assignment (expression has type "BinaryIO",
74+
# variable has type "ReadCsvBuffer[str]")
75+
src = src.buffer # type: ignore[assignment]
76+
6577
for key in (
6678
"storage_options",
6779
"encoding",

pandas/tests/io/test_common.py

+13-21
Original file line numberDiff line numberDiff line change
@@ -413,39 +413,31 @@ def test_constructor_bad_file(self, mmap_file):
413413
err = mmap.error
414414

415415
with pytest.raises(err, match=msg):
416-
icom._CSVMMapWrapper(non_file)
416+
icom._maybe_memory_map(non_file, True)
417417

418418
with open(mmap_file) as target:
419419
pass
420420

421421
msg = "I/O operation on closed file"
422422
with pytest.raises(ValueError, match=msg):
423-
icom._CSVMMapWrapper(target)
424-
425-
def test_get_attr(self, mmap_file):
426-
with open(mmap_file) as target:
427-
wrapper = icom._CSVMMapWrapper(target)
428-
429-
attrs = dir(wrapper.mmap)
430-
attrs = [attr for attr in attrs if not attr.startswith("__")]
431-
attrs.append("__next__")
432-
433-
for attr in attrs:
434-
assert hasattr(wrapper, attr)
435-
436-
assert not hasattr(wrapper, "foo")
423+
icom._maybe_memory_map(target, True)
437424

438425
def test_next(self, mmap_file):
439426
with open(mmap_file) as target:
440-
wrapper = icom._CSVMMapWrapper(target)
441427
lines = target.readlines()
442428

443-
for line in lines:
444-
next_line = next(wrapper)
445-
assert next_line.strip() == line.strip()
429+
with icom.get_handle(
430+
target, "r", is_text=True, memory_map=True
431+
) as wrappers:
432+
wrapper = wrappers.handle
433+
assert isinstance(wrapper.buffer.buffer, mmap.mmap)
434+
435+
for line in lines:
436+
next_line = next(wrapper)
437+
assert next_line.strip() == line.strip()
446438

447-
with pytest.raises(StopIteration, match=r"^$"):
448-
next(wrapper)
439+
with pytest.raises(StopIteration, match=r"^$"):
440+
next(wrapper)
449441

450442
def test_unknown_engine(self):
451443
with tm.ensure_clean() as path:

0 commit comments

Comments
 (0)