Skip to content

Commit e64410f

Browse files
twoertwein and phofl authored
Backport PR pandas-dev#40994: REGR: memory_map with non-UTF8 encoding (pandas-dev#41257)
Co-authored-by: phofl <[email protected]>
1 parent 9c1676f commit e64410f

File tree

4 files changed

+55
-31
lines changed

4 files changed

+55
-31
lines changed

doc/source/whatsnew/v1.2.5.rst

+1 -1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
18-
-
18+
- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
1919
-
2020

2121
.. ---------------------------------------------------------------------------

pandas/io/common.py

+37 -5
Original file line number | Diff line number | Diff line change
@@ -568,7 +568,12 @@ def get_handle(
568568

569569
# memory mapping needs to be the first step
570570
handle, memory_map, handles = _maybe_memory_map(
571-
handle, memory_map, ioargs.encoding, ioargs.mode, errors
571+
handle,
572+
memory_map,
573+
ioargs.encoding,
574+
ioargs.mode,
575+
errors,
576+
ioargs.compression["method"] not in _compression_to_extension,
572577
)
573578

574579
is_path = isinstance(handle, str)
@@ -759,7 +764,18 @@ class _MMapWrapper(abc.Iterator):
759764
760765
"""
761766

762-
def __init__(self, f: IO):
767+
def __init__(
768+
self,
769+
f: IO,
770+
encoding: str = "utf-8",
771+
errors: str = "strict",
772+
decode: bool = True,
773+
):
774+
self.encoding = encoding
775+
self.errors = errors
776+
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
777+
self.decode = decode
778+
763779
self.attributes = {}
764780
for attribute in ("seekable", "readable", "writeable"):
765781
if not hasattr(f, attribute):
@@ -775,19 +791,31 @@ def __getattr__(self, name: str):
775791
def __iter__(self) -> "_MMapWrapper":
776792
return self
777793

794+
def read(self, size: int = -1) -> Union[str, bytes]:
795+
# CSV c-engine uses read instead of iterating
796+
content: bytes = self.mmap.read(size)
797+
if self.decode:
798+
errors = self.errors if self.errors is not None else "strict"
799+
# memory mapping is applied before compression. Encoding should
800+
# be applied to the de-compressed data.
801+
return content.decode(self.encoding, errors=errors)
802+
return content
803+
778804
def __next__(self) -> str:
779805
newbytes = self.mmap.readline()
780806

781807
# readline returns bytes, not str, but Python's CSV reader
782808
# expects str, so convert the output to str before continuing
783-
newline = newbytes.decode("utf-8")
809+
newline = self.decoder.decode(newbytes)
784810

785811
# mmap doesn't raise if reading past the allocated
786812
# data but instead returns an empty string, so raise
787813
# if that is returned
788814
if newline == "":
789815
raise StopIteration
790-
return newline
816+
817+
# IncrementalDecoder seems to push newline to the next line
818+
return newline.lstrip("\n")
791819

792820

793821
def _maybe_memory_map(
@@ -796,6 +824,7 @@ def _maybe_memory_map(
796824
encoding: str,
797825
mode: str,
798826
errors: Optional[str],
827+
decode: bool,
799828
) -> Tuple[FileOrBuffer, bool, List[Buffer]]:
800829
"""Try to memory map file/buffer."""
801830
handles: List[Buffer] = []
@@ -814,7 +843,10 @@ def _maybe_memory_map(
814843
handles.append(handle)
815844

816845
try:
817-
wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type]
846+
wrapped = cast(
847+
mmap.mmap,
848+
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
849+
)
818850
handle.close()
819851
handles.remove(handle)
820852
handles.append(wrapped)

pandas/io/parsers.py

-25
Original file line number | Diff line number | Diff line change
@@ -1868,31 +1868,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
18681868
assert self.handles is not None
18691869
for key in ("storage_options", "encoding", "memory_map", "compression"):
18701870
kwds.pop(key, None)
1871-
if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
1872-
# pandas\io\parsers.py:1861: error: Item "IO[Any]" of
1873-
# "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
1874-
# TextIOWrapper, mmap]" has no attribute "mmap" [union-attr]
1875-
1876-
# pandas\io\parsers.py:1861: error: Item "RawIOBase" of
1877-
# "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
1878-
# TextIOWrapper, mmap]" has no attribute "mmap" [union-attr]
1879-
1880-
# pandas\io\parsers.py:1861: error: Item "BufferedIOBase" of
1881-
# "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
1882-
# TextIOWrapper, mmap]" has no attribute "mmap" [union-attr]
1883-
1884-
# pandas\io\parsers.py:1861: error: Item "TextIOBase" of
1885-
# "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
1886-
# TextIOWrapper, mmap]" has no attribute "mmap" [union-attr]
1887-
1888-
# pandas\io\parsers.py:1861: error: Item "TextIOWrapper" of
1889-
# "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
1890-
# TextIOWrapper, mmap]" has no attribute "mmap" [union-attr]
1891-
1892-
# pandas\io\parsers.py:1861: error: Item "mmap" of "Union[IO[Any],
1893-
# RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]" has
1894-
# no attribute "mmap" [union-attr]
1895-
self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr]
18961871

18971872
try:
18981873
self._reader = parsers.TextReader(self.handles.handle, **kwds)

pandas/tests/io/parser/test_encoding.py

+17
Original file line number | Diff line number | Diff line change
@@ -217,3 +217,20 @@ def test_parse_encoded_special_characters(encoding):
217217

218218
expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"])
219219
tm.assert_frame_equal(result, expected)
220+
221+
222+
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
223+
def test_encoding_memory_map(all_parsers, encoding):
224+
# GH40986
225+
parser = all_parsers
226+
expected = DataFrame(
227+
{
228+
"name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
229+
"mask": ["red", "purple", "orange", "blue"],
230+
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
231+
}
232+
)
233+
with tm.ensure_clean() as file:
234+
expected.to_csv(file, index=False, encoding=encoding)
235+
df = parser.read_csv(file, encoding=encoding, memory_map=True)
236+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)