Skip to content

Commit 0a0540c

Browse files
authored
REGR: memory_map with non-UTF8 encoding (#40994)
1 parent 8e21df2 commit 0a0540c

File tree

4 files changed

+54
-25
lines changed

4 files changed

+54
-25
lines changed

doc/source/whatsnew/v1.2.5.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Regression in :func:`concat` between two :class:`DataFrame` objects where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
18-
-
18+
- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
1919
-
2020

2121
.. ---------------------------------------------------------------------------

pandas/io/common.py

+36-5
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,12 @@ def get_handle(
618618

619619
# memory mapping needs to be the first step
620620
handle, memory_map, handles = _maybe_memory_map(
621-
handle, memory_map, ioargs.encoding, ioargs.mode, errors
621+
handle,
622+
memory_map,
623+
ioargs.encoding,
624+
ioargs.mode,
625+
errors,
626+
ioargs.compression["method"] not in _compression_to_extension,
622627
)
623628

624629
is_path = isinstance(handle, str)
@@ -820,7 +825,18 @@ class _MMapWrapper(abc.Iterator):
820825
821826
"""
822827

823-
def __init__(self, f: IO):
828+
def __init__(
829+
self,
830+
f: IO,
831+
encoding: str = "utf-8",
832+
errors: str = "strict",
833+
decode: bool = True,
834+
):
835+
self.encoding = encoding
836+
self.errors = errors
837+
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
838+
self.decode = decode
839+
824840
self.attributes = {}
825841
for attribute in ("seekable", "readable", "writeable"):
826842
if not hasattr(f, attribute):
@@ -836,19 +852,30 @@ def __getattr__(self, name: str):
836852
def __iter__(self) -> _MMapWrapper:
837853
return self
838854

855+
def read(self, size: int = -1) -> str | bytes:
856+
# CSV c-engine uses read instead of iterating
857+
content: bytes = self.mmap.read(size)
858+
if self.decode:
859+
# memory mapping is applied before compression. Encoding should
860+
# be applied to the de-compressed data.
861+
return content.decode(self.encoding, errors=self.errors)
862+
return content
863+
839864
def __next__(self) -> str:
840865
newbytes = self.mmap.readline()
841866

842867
# readline returns bytes, not str, but Python's CSV reader
843868
# expects str, so convert the output to str before continuing
844-
newline = newbytes.decode("utf-8")
869+
newline = self.decoder.decode(newbytes)
845870

846871
# mmap doesn't raise if reading past the allocated
847872
# data but instead returns an empty string, so raise
848873
# if that is returned
849874
if newline == "":
850875
raise StopIteration
851-
return newline
876+
877+
# IncrementalDecoder seems to push the newline to the start of the next line, so strip any leading newline
878+
return newline.lstrip("\n")
852879

853880

854881
def _maybe_memory_map(
@@ -857,6 +884,7 @@ def _maybe_memory_map(
857884
encoding: str,
858885
mode: str,
859886
errors: str | None,
887+
decode: bool,
860888
) -> tuple[FileOrBuffer, bool, list[Buffer]]:
861889
"""Try to memory map file/buffer."""
862890
handles: list[Buffer] = []
@@ -877,7 +905,10 @@ def _maybe_memory_map(
877905
try:
878906
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
879907
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
880-
wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type]
908+
wrapped = cast(
909+
mmap.mmap,
910+
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
911+
)
881912
handle.close()
882913
handles.remove(handle)
883914
handles.append(wrapped)

pandas/io/parsers/c_parser_wrapper.py

-19
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
3030
assert self.handles is not None
3131
for key in ("storage_options", "encoding", "memory_map", "compression"):
3232
kwds.pop(key, None)
33-
if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
34-
# error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase,
35-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
36-
37-
# error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
38-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
39-
40-
# error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
41-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
42-
43-
# error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
44-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
45-
46-
# error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase,
47-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
48-
49-
# error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase,
50-
# TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
51-
self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr]
5233

5334
try:
5435
self._reader = parsers.TextReader(self.handles.handle, **kwds)

pandas/tests/io/parser/test_encoding.py

+17
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,20 @@ def test_parse_encoded_special_characters(encoding):
220220

221221
expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"])
222222
tm.assert_frame_equal(result, expected)
223+
224+
225+
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
226+
def test_encoding_memory_map(all_parsers, encoding):
227+
# GH40986
228+
parser = all_parsers
229+
expected = DataFrame(
230+
{
231+
"name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
232+
"mask": ["red", "purple", "orange", "blue"],
233+
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
234+
}
235+
)
236+
with tm.ensure_clean() as file:
237+
expected.to_csv(file, index=False, encoding=encoding)
238+
df = parser.read_csv(file, encoding=encoding, memory_map=True)
239+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)