Skip to content

Commit ba2fdb1

Browse files
authored
CLN: mmap used by only read_csv (#46967)
1 parent 962495d commit ba2fdb1

File tree

2 files changed

+18
-16
lines changed

2 files changed

+18
-16
lines changed

pandas/io/common.py

+14-12
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,7 @@ def get_handle(
640640
.. versionchanged:: 1.4.0 Zstandard support.
641641
642642
memory_map : bool, default False
643-
See parsers._parser_params for more information.
643+
See parsers._parser_params for more information. Only used by read_csv.
644644
is_text : bool, default True
645645
Whether the type of the content passed to the file/buffer is string or
646646
bytes. This is not the same as `"b" not in mode`. If a string content is
@@ -659,6 +659,8 @@ def get_handle(
659659
# Windows does not default to utf-8. Set to utf-8 for a consistent behavior
660660
encoding = encoding or "utf-8"
661661

662+
errors = errors or "strict"
663+
662664
# read_csv does not know whether the buffer is opened in binary/text mode
663665
if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
664666
mode += "b"
@@ -681,6 +683,7 @@ def get_handle(
681683
handles: list[BaseBuffer]
682684

683685
# memory mapping needs to be the first step
686+
# only used for read_csv
684687
handle, memory_map, handles = _maybe_memory_map(
685688
handle,
686689
memory_map,
@@ -1064,7 +1067,7 @@ def closed(self):
10641067
return self.fp is None
10651068

10661069

1067-
class _MMapWrapper(abc.Iterator):
1070+
class _CSVMMapWrapper(abc.Iterator):
10681071
"""
10691072
Wrapper for Python's mmap class so that it can be properly read in
10701073
by Python's csv.reader class.
@@ -1079,7 +1082,7 @@ class _MMapWrapper(abc.Iterator):
10791082

10801083
def __init__(
10811084
self,
1082-
f: IO,
1085+
f: ReadBuffer[bytes],
10831086
encoding: str = "utf-8",
10841087
errors: str = "strict",
10851088
decode: bool = True,
@@ -1089,19 +1092,21 @@ def __init__(
10891092
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
10901093
self.decode = decode
10911094

1095+
# needed for compression libraries and TextIOWrapper
10921096
self.attributes = {}
10931097
for attribute in ("seekable", "readable"):
10941098
if not hasattr(f, attribute):
10951099
continue
10961100
self.attributes[attribute] = getattr(f, attribute)()
1101+
10971102
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
10981103

10991104
def __getattr__(self, name: str):
11001105
if name in self.attributes:
11011106
return lambda: self.attributes[name]
11021107
return getattr(self.mmap, name)
11031108

1104-
def __iter__(self) -> _MMapWrapper:
1109+
def __iter__(self) -> _CSVMMapWrapper:
11051110
return self
11061111

11071112
def read(self, size: int = -1) -> str | bytes:
@@ -1196,7 +1201,7 @@ def _maybe_memory_map(
11961201
memory_map: bool,
11971202
encoding: str,
11981203
mode: str,
1199-
errors: str | None,
1204+
errors: str,
12001205
decode: bool,
12011206
) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
12021207
"""Try to memory map file/buffer."""
@@ -1207,25 +1212,22 @@ def _maybe_memory_map(
12071212

12081213
# need to open the file first
12091214
if isinstance(handle, str):
1210-
if encoding and "b" not in mode:
1211-
# Encoding
1212-
handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
1213-
else:
1214-
# Binary mode
1215-
handle = open(handle, mode)
1215+
handle = open(handle, "rb")
12161216
handles.append(handle)
12171217

12181218
# error: Argument 1 to "_CSVMMapWrapper" has incompatible type "Union[IO[Any],
12191219
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
12201220
try:
1221+
# open the mmap, add the *-able attributes (seekable/readable), and convert to string
12211222
wrapped = cast(
12221223
BaseBuffer,
1223-
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
1224+
_CSVMMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
12241225
)
12251226
finally:
12261227
for handle in reversed(handles):
12271228
# error: "BaseBuffer" has no attribute "close"
12281229
handle.close() # type: ignore[attr-defined]
1230+
handles = []
12291231
handles.append(wrapped)
12301232

12311233
return wrapped, memory_map, handles

pandas/tests/io/test_common.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -413,18 +413,18 @@ def test_constructor_bad_file(self, mmap_file):
413413
err = mmap.error
414414

415415
with pytest.raises(err, match=msg):
416-
icom._MMapWrapper(non_file)
416+
icom._CSVMMapWrapper(non_file)
417417

418418
with open(mmap_file) as target:
419419
pass
420420

421421
msg = "I/O operation on closed file"
422422
with pytest.raises(ValueError, match=msg):
423-
icom._MMapWrapper(target)
423+
icom._CSVMMapWrapper(target)
424424

425425
def test_get_attr(self, mmap_file):
426426
with open(mmap_file) as target:
427-
wrapper = icom._MMapWrapper(target)
427+
wrapper = icom._CSVMMapWrapper(target)
428428

429429
attrs = dir(wrapper.mmap)
430430
attrs = [attr for attr in attrs if not attr.startswith("__")]
@@ -437,7 +437,7 @@ def test_get_attr(self, mmap_file):
437437

438438
def test_next(self, mmap_file):
439439
with open(mmap_file) as target:
440-
wrapper = icom._MMapWrapper(target)
440+
wrapper = icom._CSVMMapWrapper(target)
441441
lines = target.readlines()
442442

443443
for line in lines:

0 commit comments

Comments
 (0)