Backport PR pandas-dev#40994: REGR: memory_map with non-UTF8 encoding (pandas-dev#41257)

twoertwein · phofl · web-flow · commit e64410f87111 · 2021-05-25T14:22:36.000+01:00
Co-authored-by: phofl <patrick_hoefler@gmx.net>
diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
--
+- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -568,7 +568,12 @@ def get_handle(
 
     # memory mapping needs to be the first step
     handle, memory_map, handles = _maybe_memory_map(
-        handle, memory_map, ioargs.encoding, ioargs.mode, errors
+        handle,
+        memory_map,
+        ioargs.encoding,
+        ioargs.mode,
+        errors,
+        ioargs.compression["method"] not in _compression_to_extension,
     )
 
     is_path = isinstance(handle, str)
@@ -759,7 +764,18 @@ class _MMapWrapper(abc.Iterator):
 
     """
 
-    def __init__(self, f: IO):
+    def __init__(
+        self,
+        f: IO,
+        encoding: str = "utf-8",
+        errors: str = "strict",
+        decode: bool = True,
+    ):
+        self.encoding = encoding
+        self.errors = errors
+        self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
+        self.decode = decode
+
         self.attributes = {}
         for attribute in ("seekable", "readable", "writeable"):
             if not hasattr(f, attribute):
@@ -775,19 +791,31 @@ def __getattr__(self, name: str):
     def __iter__(self) -> "_MMapWrapper":
         return self
 
+    def read(self, size: int = -1) -> Union[str, bytes]:
+        # CSV c-engine uses read instead of iterating
+        content: bytes = self.mmap.read(size)
+        if self.decode:
+            errors = self.errors if self.errors is not None else "strict"
+            # memory mapping is applied before compression. Encoding should
+            # be applied to the de-compressed data.
+            return content.decode(self.encoding, errors=errors)
+        return content
+
     def __next__(self) -> str:
         newbytes = self.mmap.readline()
 
         # readline returns bytes, not str, but Python's CSV reader
         # expects str, so convert the output to str before continuing
-        newline = newbytes.decode("utf-8")
+        newline = self.decoder.decode(newbytes)
 
         # mmap doesn't raise if reading past the allocated
         # data but instead returns an empty string, so raise
         # if that is returned
         if newline == "":
             raise StopIteration
-        return newline
+
+        # IncrementalDecoder seems to push newline to the next line
+        return newline.lstrip("\n")
 
 
 def _maybe_memory_map(
@@ -796,6 +824,7 @@ def _maybe_memory_map(
     encoding: str,
     mode: str,
     errors: Optional[str],
+    decode: bool,
 ) -> Tuple[FileOrBuffer, bool, List[Buffer]]:
     """Try to memory map file/buffer."""
     handles: List[Buffer] = []
@@ -814,7 +843,10 @@ def _maybe_memory_map(
         handles.append(handle)
 
     try:
-        wrapped = cast(mmap.mmap, _MMapWrapper(handle))  # type: ignore[arg-type]
+        wrapped = cast(
+            mmap.mmap,
+            _MMapWrapper(handle, encoding, errors, decode),  # type: ignore[arg-type]
+        )
         handle.close()
         handles.remove(handle)
         handles.append(wrapped)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1868,31 +1868,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
         assert self.handles is not None
         for key in ("storage_options", "encoding", "memory_map", "compression"):
             kwds.pop(key, None)
-        if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
-            # pandas\io\parsers.py:1861: error: Item "IO[Any]" of
-            # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
-            # TextIOWrapper, mmap]" has no attribute "mmap"  [union-attr]
-
-            # pandas\io\parsers.py:1861: error: Item "RawIOBase" of
-            # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
-            # TextIOWrapper, mmap]" has no attribute "mmap"  [union-attr]
-
-            # pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of
-            # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
-            # TextIOWrapper, mmap]" has no attribute "mmap"  [union-attr]
-
-            # pandas\io\parsers.py:1861: error: Item "TextIOBase" of
-            # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
-            # TextIOWrapper, mmap]" has no attribute "mmap"  [union-attr]
-
-            # pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of
-            # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
-            # TextIOWrapper, mmap]" has no attribute "mmap"  [union-attr]
-
-            # pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any],
-            # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has
-            # no attribute "mmap"  [union-attr]
-            self.handles.handle = self.handles.handle.mmap  # type: ignore[union-attr]
 
         try:
             self._reader = parsers.TextReader(self.handles.handle, **kwds)
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -217,3 +217,20 @@ def test_parse_encoded_special_characters(encoding):
 
     expected = DataFrame(data=[["：foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
+def test_encoding_memory_map(all_parsers, encoding):
+    # GH40986
+    parser = all_parsers
+    expected = DataFrame(
+        {
+            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
+            "mask": ["red", "purple", "orange", "blue"],
+            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
+        }
+    )
+    with tm.ensure_clean() as file:
+        expected.to_csv(file, index=False, encoding=encoding)
+        df = parser.read_csv(file, encoding=encoding, memory_map=True)
+    tm.assert_frame_equal(df, expected)

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ including other versions of pandas.`
`15`	`15`	`Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`	`17`	- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
`18`		`--`
	`18`	+- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
`19`	`19`	`-`
`20`	`20`
`21`	`21`	`.. ---------------------------------------------------------------------------`