REGR: memory_map with non-UTF8 encoding (#40994)

twoertwein · web-flow · commit 0a0540c30bb4 · 2021-04-26T08:20:56.000-04:00
diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
--
+- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -618,7 +618,12 @@ def get_handle(
 
     # memory mapping needs to be the first step
     handle, memory_map, handles = _maybe_memory_map(
-        handle, memory_map, ioargs.encoding, ioargs.mode, errors
+        handle,
+        memory_map,
+        ioargs.encoding,
+        ioargs.mode,
+        errors,
+        ioargs.compression["method"] not in _compression_to_extension,
     )
 
     is_path = isinstance(handle, str)
@@ -820,7 +825,18 @@ class _MMapWrapper(abc.Iterator):
 
     """
 
-    def __init__(self, f: IO):
+    def __init__(
+        self,
+        f: IO,
+        encoding: str = "utf-8",
+        errors: str = "strict",
+        decode: bool = True,
+    ):
+        self.encoding = encoding
+        self.errors = errors
+        self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
+        self.decode = decode
+
         self.attributes = {}
         for attribute in ("seekable", "readable", "writeable"):
             if not hasattr(f, attribute):
@@ -836,19 +852,30 @@ def __getattr__(self, name: str):
     def __iter__(self) -> _MMapWrapper:
         return self
 
+    def read(self, size: int = -1) -> str | bytes:
+        # CSV c-engine uses read instead of iterating
+        content: bytes = self.mmap.read(size)
+        if self.decode:
+            # memory mapping is applied before compression. Encoding should
+            # be applied to the de-compressed data.
+            return content.decode(self.encoding, errors=self.errors)
+        return content
+
     def __next__(self) -> str:
         newbytes = self.mmap.readline()
 
         # readline returns bytes, not str, but Python's CSV reader
         # expects str, so convert the output to str before continuing
-        newline = newbytes.decode("utf-8")
+        newline = self.decoder.decode(newbytes)
 
         # mmap doesn't raise if reading past the allocated
         # data but instead returns an empty string, so raise
         # if that is returned
         if newline == "":
             raise StopIteration
-        return newline
+
+        # IncrementalDecoder seems to push newline to the next line
+        return newline.lstrip("\n")
 
 
 def _maybe_memory_map(
@@ -857,6 +884,7 @@ def _maybe_memory_map(
     encoding: str,
     mode: str,
     errors: str | None,
+    decode: bool,
 ) -> tuple[FileOrBuffer, bool, list[Buffer]]:
     """Try to memory map file/buffer."""
     handles: list[Buffer] = []
@@ -877,7 +905,10 @@ def _maybe_memory_map(
     try:
         # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
         # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
-        wrapped = cast(mmap.mmap, _MMapWrapper(handle))  # type: ignore[arg-type]
+        wrapped = cast(
+            mmap.mmap,
+            _MMapWrapper(handle, encoding, errors, decode),  # type: ignore[arg-type]
+        )
         handle.close()
         handles.remove(handle)
         handles.append(wrapped)
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -30,25 +30,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
         assert self.handles is not None
         for key in ("storage_options", "encoding", "memory_map", "compression"):
             kwds.pop(key, None)
-        if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
-            # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-
-            # error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-
-            # error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-
-            # error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-
-            # error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-
-            # error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase,
-            # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
-            self.handles.handle = self.handles.handle.mmap  # type: ignore[union-attr]
 
         try:
             self._reader = parsers.TextReader(self.handles.handle, **kwds)
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -220,3 +220,20 @@ def test_parse_encoded_special_characters(encoding):
 
     expected = DataFrame(data=[["：foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
+def test_encoding_memory_map(all_parsers, encoding):
+    # GH40986
+    parser = all_parsers
+    expected = DataFrame(
+        {
+            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
+            "mask": ["red", "purple", "orange", "blue"],
+            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
+        }
+    )
+    with tm.ensure_clean() as file:
+        expected.to_csv(file, index=False, encoding=encoding)
+        df = parser.read_csv(file, encoding=encoding, memory_map=True)
+    tm.assert_frame_equal(df, expected)

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ including other versions of pandas.`
`15`	`15`	`Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`	`17`	- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
`18`		`--`
	`18`	+- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
`19`	`19`	`-`
`20`	`20`
`21`	`21`	`.. ---------------------------------------------------------------------------`