diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 16f9284802407..60e146b2212eb 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) -- +- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index be353fefdd1ef..e6b6471294ac7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -568,7 +568,12 @@ def get_handle( # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors + handle, + memory_map, + ioargs.encoding, + ioargs.mode, + errors, + ioargs.compression["method"] not in _compression_to_extension, ) is_path = isinstance(handle, str) @@ -759,7 +764,18 @@ class _MMapWrapper(abc.Iterator): """ - def __init__(self, f: IO): + def __init__( + self, + f: IO, + encoding: str = "utf-8", + errors: str = "strict", + decode: bool = True, + ): + self.encoding = encoding + self.errors = errors + self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) + self.decode = decode + self.attributes = {} for attribute in ("seekable", "readable", "writeable"): if not hasattr(f, attribute): @@ -775,19 +791,31 @@ def __getattr__(self, name: str): def __iter__(self) -> "_MMapWrapper": return self + def read(self, size: int = -1) -> Union[str, bytes]: + # CSV c-engine uses read instead of iterating + content: bytes = self.mmap.read(size) + if self.decode: + errors = self.errors if self.errors is not None else "strict" + # memory mapping is applied before 
compression. Encoding should + # be applied to the de-compressed data. + return content.decode(self.encoding, errors=errors) + return content + def __next__(self) -> str: newbytes = self.mmap.readline() # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newbytes.decode("utf-8") + newline = self.decoder.decode(newbytes) # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned if newline == "": raise StopIteration - return newline + + # IncrementalDecoder seems to push newline to the next line + return newline.lstrip("\n") def _maybe_memory_map( @@ -796,6 +824,7 @@ def _maybe_memory_map( encoding: str, mode: str, errors: Optional[str], + decode: bool, ) -> Tuple[FileOrBuffer, bool, List[Buffer]]: """Try to memory map file/buffer.""" handles: List[Buffer] = [] @@ -814,7 +843,10 @@ def _maybe_memory_map( handles.append(handle) try: - wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + wrapped = cast( + mmap.mmap, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) handle.close() handles.remove(handle) handles.append(wrapped) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8ad86fd0a0dce..bbff9dfe1ddd0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1868,31 +1868,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): assert self.handles is not None for key in ("storage_options", "encoding", "memory_map", "compression"): kwds.pop(key, None) - if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # pandas\io\parsers.py:1861: error: Item "IO[Any]" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "RawIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute 
"mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOBase" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]" has no attribute "mmap" [union-attr] - - # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has - # no attribute "mmap" [union-attr] - self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] try: self._reader = parsers.TextReader(self.handles.handle, **kwds) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index e74265da3e966..41e1964086dce 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -217,3 +217,20 @@ def test_parse_encoded_special_characters(encoding): expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) +def test_encoding_memory_map(all_parsers, encoding): + # GH40986 + parser = all_parsers + expected = DataFrame( + { + "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], + "mask": ["red", "purple", "orange", "blue"], + "weapon": ["sai", "bo staff", "nunchunk", "katana"], + } + ) + with tm.ensure_clean() as file: + expected.to_csv(file, index=False, encoding=encoding) + df = parser.read_csv(file, encoding=encoding, memory_map=True) + tm.assert_frame_equal(df, expected)