-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: mmap used by only read_csv #46967
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -640,7 +640,7 @@ def get_handle( | |
.. versionchanged:: 1.4.0 Zstandard support. | ||
|
||
memory_map : bool, default False | ||
See parsers._parser_params for more information. | ||
See parsers._parser_params for more information. Only used by read_csv. | ||
is_text : bool, default True | ||
Whether the type of the content passed to the file/buffer is string or | ||
bytes. This is not the same as `"b" not in mode`. If a string content is | ||
|
@@ -659,6 +659,8 @@ def get_handle( | |
# Windows does not default to utf-8. Set to utf-8 for a consistent behavior | ||
encoding = encoding or "utf-8" | ||
|
||
errors = errors or "strict" | ||
|
||
# read_csv does not know whether the buffer is opened in binary/text mode | ||
if _is_binary_mode(path_or_buf, mode) and "b" not in mode: | ||
mode += "b" | ||
|
@@ -681,6 +683,7 @@ def get_handle( | |
handles: list[BaseBuffer] | ||
|
||
# memory mapping needs to be the first step | ||
# only used for read_csv | ||
handle, memory_map, handles = _maybe_memory_map( | ||
handle, | ||
memory_map, | ||
|
@@ -1064,7 +1067,7 @@ def closed(self): | |
return self.fp is None | ||
|
||
|
||
class _MMapWrapper(abc.Iterator): | ||
class _CSVMMapWrapper(abc.Iterator): | ||
""" | ||
Wrapper for the Python's mmap class so that it can be properly read in | ||
by Python's csv.reader class. | ||
|
@@ -1079,7 +1082,7 @@ class _MMapWrapper(abc.Iterator): | |
|
||
def __init__( | ||
self, | ||
f: IO, | ||
f: ReadBuffer[bytes], | ||
encoding: str = "utf-8", | ||
errors: str = "strict", | ||
decode: bool = True, | ||
|
@@ -1089,19 +1092,21 @@ def __init__( | |
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) | ||
self.decode = decode | ||
|
||
# needed for compression libraries and TextIOWrapper | ||
self.attributes = {} | ||
for attribute in ("seekable", "readable"): | ||
if not hasattr(f, attribute): | ||
continue | ||
self.attributes[attribute] = getattr(f, attribute)() | ||
|
||
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) | ||
|
||
def __getattr__(self, name: str): | ||
if name in self.attributes: | ||
return lambda: self.attributes[name] | ||
return getattr(self.mmap, name) | ||
|
||
def __iter__(self) -> _MMapWrapper: | ||
def __iter__(self) -> _CSVMMapWrapper: | ||
return self | ||
|
||
def read(self, size: int = -1) -> str | bytes: | ||
|
@@ -1196,7 +1201,7 @@ def _maybe_memory_map( | |
memory_map: bool, | ||
encoding: str, | ||
mode: str, | ||
errors: str | None, | ||
errors: str, | ||
decode: bool, | ||
) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]: | ||
"""Try to memory map file/buffer.""" | ||
|
@@ -1207,25 +1212,22 @@ def _maybe_memory_map( | |
|
||
# need to open the file first | ||
if isinstance(handle, str): | ||
if encoding and "b" not in mode: | ||
# Encoding | ||
handle = open(handle, mode, encoding=encoding, errors=errors, newline="") | ||
else: | ||
# Binary mode | ||
handle = open(handle, mode) | ||
handle = open(handle, "rb") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mmap is used only for reading and there should be no point in opening the file in text mode as mmap should bypass that. |
||
handles.append(handle) | ||
|
||
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], | ||
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" | ||
try: | ||
# open mmap, adds *-able, and convert to string | ||
wrapped = cast( | ||
BaseBuffer, | ||
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] | ||
_CSVMMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] | ||
) | ||
finally: | ||
for handle in reversed(handles): | ||
# error: "BaseBuffer" has no attribute "close" | ||
handle.close() # type: ignore[attr-defined] | ||
handles = [] | ||
handles.append(wrapped) | ||
|
||
return wrapped, memory_map, handles | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any tests validate this? (nbd if you can't come up with an example, but ideally)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This encoding+memory_map test should test it:
pandas/pandas/tests/io/parser/test_encoding.py
Line 234 in 962495d