diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 16f9284802407..60e146b2212eb 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) -- +- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 00966d39dd99d..06b00a9cbb4eb 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -618,7 +618,12 @@ def get_handle( # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors + handle, + memory_map, + ioargs.encoding, + ioargs.mode, + errors, + ioargs.compression["method"] not in _compression_to_extension, ) is_path = isinstance(handle, str) @@ -820,7 +825,18 @@ class _MMapWrapper(abc.Iterator): """ - def __init__(self, f: IO): + def __init__( + self, + f: IO, + encoding: str = "utf-8", + errors: str = "strict", + decode: bool = True, + ): + self.encoding = encoding + self.errors = errors + self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) + self.decode = decode + self.attributes = {} for attribute in ("seekable", "readable", "writeable"): if not hasattr(f, attribute): @@ -836,19 +852,30 @@ def __getattr__(self, name: str): def __iter__(self) -> _MMapWrapper: return self + def read(self, size: int = -1) -> str | bytes: + # CSV c-engine uses read instead of iterating + content: bytes = self.mmap.read(size) + if self.decode: + # memory mapping is applied before compression. Encoding should + # be applied to the de-compressed data. 
+ return content.decode(self.encoding, errors=self.errors) + return content + def __next__(self) -> str: newbytes = self.mmap.readline() # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newbytes.decode("utf-8") + newline = self.decoder.decode(newbytes) # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned if newline == "": raise StopIteration - return newline + + # IncrementalDecoder seems to push newline to the next line + return newline.lstrip("\n") def _maybe_memory_map( @@ -857,6 +884,7 @@ def _maybe_memory_map( encoding: str, mode: str, errors: str | None, + decode: bool, ) -> tuple[FileOrBuffer, bool, list[Buffer]]: """Try to memory map file/buffer.""" handles: list[Buffer] = [] @@ -877,7 +905,10 @@ def _maybe_memory_map( try: # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" - wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + wrapped = cast( + mmap.mmap, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) handle.close() handles.remove(handle) handles.append(wrapped) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index abf6128699a21..fb110706c3fb4 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,25 +30,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): assert self.handles is not None for key in ("storage_options", "encoding", "memory_map", "compression"): kwds.pop(key, None) - if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, 
TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] try: self._reader = parsers.TextReader(self.handles.handle, **kwds) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 89ece3b1a7300..006438df2a5e0 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -220,3 +220,20 @@ def test_parse_encoded_special_characters(encoding): expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) +def test_encoding_memory_map(all_parsers, encoding): + # GH40986 + parser = all_parsers + expected = DataFrame( + { + "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], + "mask": ["red", "purple", "orange", "blue"], + "weapon": ["sai", "bo staff", "nunchunk", "katana"], + } + ) + with tm.ensure_clean() as file: + expected.to_csv(file, index=False, encoding=encoding) + df = parser.read_csv(file, encoding=encoding, memory_map=True) + tm.assert_frame_equal(df, expected)