-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REGR: memory_map with non-UTF8 encoding #40994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -618,7 +618,12 @@ def get_handle( | |
|
||
# memory mapping needs to be the first step | ||
handle, memory_map, handles = _maybe_memory_map( | ||
handle, memory_map, ioargs.encoding, ioargs.mode, errors | ||
handle, | ||
memory_map, | ||
ioargs.encoding, | ||
ioargs.mode, | ||
errors, | ||
ioargs.compression["method"] not in _compression_to_extension, | ||
) | ||
|
||
is_path = isinstance(handle, str) | ||
|
@@ -820,7 +825,18 @@ class _MMapWrapper(abc.Iterator): | |
|
||
""" | ||
|
||
def __init__(self, f: IO): | ||
def __init__( | ||
self, | ||
f: IO, | ||
encoding: str = "utf-8", | ||
errors: str = "strict", | ||
decode: bool = True, | ||
): | ||
self.encoding = encoding | ||
self.errors = errors | ||
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) | ||
self.decode = decode | ||
|
||
self.attributes = {} | ||
for attribute in ("seekable", "readable", "writeable"): | ||
if not hasattr(f, attribute): | ||
|
@@ -836,19 +852,30 @@ def __getattr__(self, name: str): | |
def __iter__(self) -> _MMapWrapper: | ||
return self | ||
|
||
def read(self, size: int = -1) -> str | bytes: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function could be removed if the c-engine handled non-utf-8 better. The PR in its current form will in case of the c-engine: 1) use mmap to read the entire file 2) decode it appropriately (this function) 3) the c-code will encode the now utf-8 string into bytes again. It would be more efficient if the c-engine supported non-utf-8 in more places. I will look into that, but that might take some time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prob not worth the effort to handle non-utf-8 better, but if you want to look...ok |
||
# CSV c-engine uses read instead of iterating | ||
content: bytes = self.mmap.read(size) | ||
if self.decode: | ||
# memory mapping is applied before compression. Encoding should | ||
# be applied to the de-compressed data. | ||
return content.decode(self.encoding, errors=self.errors) | ||
return content | ||
|
||
def __next__(self) -> str: | ||
newbytes = self.mmap.readline() | ||
|
||
# readline returns bytes, not str, but Python's CSV reader | ||
# expects str, so convert the output to str before continuing | ||
newline = newbytes.decode("utf-8") | ||
newline = self.decoder.decode(newbytes) | ||
|
||
# mmap doesn't raise if reading past the allocated | ||
# data but instead returns an empty string, so raise | ||
# if that is returned | ||
if newline == "": | ||
raise StopIteration | ||
return newline | ||
|
||
# IncrementalDecoder seems to push newline to the next line | ||
return newline.lstrip("\n") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may leave Maybe try There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you! It might be worth adding tests to directly test the output of the mmap wrapper independent of I would assume that CRLF is covered by some of the Windows CI but it might be that the c-engine and python's csv are robust enough to ignore that :) |
||
|
||
|
||
def _maybe_memory_map( | ||
|
@@ -857,6 +884,7 @@ def _maybe_memory_map( | |
encoding: str, | ||
mode: str, | ||
errors: str | None, | ||
decode: bool, | ||
) -> tuple[FileOrBuffer, bool, list[Buffer]]: | ||
"""Try to memory map file/buffer.""" | ||
handles: list[Buffer] = [] | ||
|
@@ -877,7 +905,10 @@ def _maybe_memory_map( | |
try: | ||
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], | ||
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" | ||
wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] | ||
wrapped = cast( | ||
mmap.mmap, | ||
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] | ||
) | ||
handle.close() | ||
handles.remove(handle) | ||
handles.append(wrapped) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
wouldn't object to passing by kwargs for easier reading