diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 2249790b7ff1b..a61715c106c41 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -746,6 +746,9 @@ I/O
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
+- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
+- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
+-

 Period
 ^^^^^^
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 844304396a23f..3aeb4f5b3da8c 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -3,10 +3,8 @@

 import bz2
 import codecs
-from collections import abc
 import dataclasses
 import gzip
-import io
 from io import (
     BufferedIOBase,
     BytesIO,
@@ -18,7 +16,6 @@
 import mmap
 import os
 from pathlib import Path
-import tempfile
 from typing import (
     IO,
     Any,
@@ -104,7 +101,7 @@ def close(self) -> None:
         avoid closing the potentially user-created buffer.
         """
         if self.is_wrapped:
-            assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper))
+            assert isinstance(self.handle, TextIOWrapper)
             self.handle.flush()
             self.handle.detach()
             self.created_handles.remove(self.handle)
@@ -673,14 +670,7 @@ def get_handle(
     handles: list[BaseBuffer]

     # memory mapping needs to be the first step
-    handle, memory_map, handles = _maybe_memory_map(
-        handle,
-        memory_map,
-        ioargs.encoding,
-        ioargs.mode,
-        errors,
-        ioargs.compression["method"] not in _compression_to_extension,
-    )
+    handle, memory_map, handles = _maybe_memory_map(handle, memory_map)

     is_path = isinstance(handle, str)
     compression_args = dict(ioargs.compression)
@@ -779,20 +769,19 @@
     # Convert BytesIO or file objects passed with an encoding
     is_wrapped = False
     if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
-        handle = BytesIOWrapper(
+        # not added to handles as it does not open/buffer resources
+        handle = _BytesIOWrapper(
             handle,
             encoding=ioargs.encoding,
         )
-        handles.append(handle)
-        # the (text) handle is always provided by the caller
-        # since get_handle would have opened it in binary mode
-        is_wrapped = True
-    elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
+    elif is_text and (
+        compression or _is_binary_mode(handle, ioargs.mode) or memory_map
+    ):
         handle = TextIOWrapper(
             # error: Argument 1 to "TextIOWrapper" has incompatible type
             # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
             # expected "IO[bytes]"
-            handle,  # type: ignore[arg-type]
+            _IOWrapper(handle),  # type: ignore[arg-type]
             encoding=ioargs.encoding,
             errors=errors,
             newline="",
@@ -909,78 +898,26 @@ def closed(self):
         return self.fp is None


-class _MMapWrapper(abc.Iterator):
-    """
-    Wrapper for the Python's mmap class so that it can be properly read in
-    by Python's csv.reader class.
-
-    Parameters
-    ----------
-    f : file object
-        File object to be mapped onto memory. Must support the 'fileno'
-        method or have an equivalent attribute
-
-    """
+class _IOWrapper:
+    # Pretends that missing "seekable"/"readable"/"writable" attributes exist and
+    # are True, so that the wrapped buffer (for example an mmap object) can be
+    # handed to TextIOWrapper.

-    def __init__(
-        self,
-        f: IO,
-        encoding: str = "utf-8",
-        errors: str = "strict",
-        decode: bool = True,
-    ):
-        self.encoding = encoding
-        self.errors = errors
-        self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
-        self.decode = decode
-
-        self.attributes = {}
-        for attribute in ("seekable", "readable", "writeable"):
-            if not hasattr(f, attribute):
-                continue
-            self.attributes[attribute] = getattr(f, attribute)()
-        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+    def __init__(self, buffer: BaseBuffer):
+        self.buffer = buffer
+        self.attributes = tuple(
+            attr
+            for attr in ("seekable", "readable", "writable")
+            if not hasattr(self.buffer, attr)
+        )

     def __getattr__(self, name: str):
         if name in self.attributes:
-            return lambda: self.attributes[name]
-        return getattr(self.mmap, name)
+            return lambda: True
+        return getattr(self.buffer, name)

-    def __iter__(self) -> _MMapWrapper:
-        return self
-
-    def read(self, size: int = -1) -> str | bytes:
-        # CSV c-engine uses read instead of iterating
-        content: bytes = self.mmap.read(size)
-        if self.decode and self.encoding != "utf-8":
-            # memory mapping is applied before compression. Encoding should
-            # be applied to the de-compressed data.
-            final = size == -1 or len(content) < size
-            return self.decoder.decode(content, final=final)
-        return content
-
-    def __next__(self) -> str:
-        newbytes = self.mmap.readline()
-
-        # readline returns bytes, not str, but Python's CSV reader
-        # expects str, so convert the output to str before continuing
-        newline = self.decoder.decode(newbytes)
-
-        # mmap doesn't raise if reading past the allocated
-        # data but instead returns an empty string, so raise
-        # if that is returned
-        if newline == "":
-            raise StopIteration
-
-        # IncrementalDecoder seems to push newline to the next line
-        return newline.lstrip("\n")
-
-
-# Wrapper that wraps a StringIO buffer and reads bytes from it
-# Created for compat with pyarrow read_csv
-class BytesIOWrapper(io.BytesIO):
-    buffer: StringIO | TextIOBase | None
+
+class _BytesIOWrapper:
+    # Wrapper that wraps a StringIO buffer and reads bytes from it
+    # Created for compat with pyarrow read_csv

     def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
         self.buffer = buffer
         self.encoding = encoding
@@ -1006,23 +943,9 @@ def read(self, n: int | None = -1) -> bytes:
         self.overflow = combined_bytestring[n:]
         return to_return

-    def detach(self):
-        # Slightly modified from Python's TextIOWrapper detach method
-        if self.buffer is None:
-            raise ValueError("buffer is already detached")
-        self.flush()
-        buffer = self.buffer
-        self.buffer = None
-        return buffer
-

 def _maybe_memory_map(
-    handle: str | BaseBuffer,
-    memory_map: bool,
-    encoding: str,
-    mode: str,
-    errors: str | None,
-    decode: bool,
+    handle: str | BaseBuffer, memory_map: bool
 ) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
     """Try to memory map file/buffer."""
     handles: list[BaseBuffer] = []
@@ -1032,32 +955,17 @@ def _maybe_memory_map(

     # need to open the file first
     if isinstance(handle, str):
-        if encoding and "b" not in mode:
-            # Encoding
-            handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
-        else:
-            # Binary mode
-            handle = open(handle, mode)
+        # encoding will be handled later in TextIOWrapper
+        handle = open(handle, "rb")
         handles.append(handle)

-    try:
-        # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
-        # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
-        wrapped = cast(
-            BaseBuffer,
-            _MMapWrapper(handle, encoding, errors, decode),  # type: ignore[arg-type]
-        )
-        # error: "BaseBuffer" has no attribute "close"
-        handle.close()  # type: ignore[attr-defined]
-        handles.remove(handle)
-        handles.append(wrapped)
-        handle = wrapped
-    except Exception:
-        # we catch any errors that may have occurred
-        # because that is consistent with the lower-level
-        # functionality of the C engine (pd.read_csv), so
-        # leave the file handler as is then
-        memory_map = False
+    mmap_handle = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
+
+    # error: Argument 1 to "_IOWrapper" has incompatible type "mmap";
+    # expected "BaseBuffer"
+    wrapped = _IOWrapper(mmap_handle)  # type: ignore[arg-type]
+    handles.append(wrapped)
+    handle = wrapped

     return handle, memory_map, handles
@@ -1088,8 +996,6 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
         codecs.StreamWriter,
         codecs.StreamReader,
         codecs.StreamReaderWriter,
-        # cannot be wrapped in TextIOWrapper GH43439
-        tempfile.SpooledTemporaryFile,
     )
     if issubclass(type(handle), text_classes):
         return False
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 5d03529654b0d..c5e8ff4f484e1 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -231,6 +231,7 @@ def _open_handles(
         self,
         src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
         kwds: dict[str, Any],
+        is_text: bool = True,
     ) -> None:
         """
         Let the readers open IOHandles after they are done with their potential raises.
@@ -243,6 +244,7 @@ def _open_handles(
             memory_map=kwds.get("memory_map", False),
             storage_options=kwds.get("storage_options", None),
             errors=kwds.get("encoding_errors", "strict"),
+            is_text=is_text,
         )

     def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 05c963f2d2552..77eff2fb7cd21 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -51,7 +51,9 @@ def __init__(
         kwds["usecols"] = self.usecols

         # open handles
-        self._open_handles(src, kwds)
+        # The c-engine can cope with utf-8 bytes: no need to use TextIOWrapper when
+        # the encoding is utf-8: get_handle(..., is_text=False)
+        self._open_handles(src, kwds, is_text=kwds.get("encoding") != "utf-8")
         assert self.handles is not None

         # Have to pass int, would break tests using TextReader directly otherwise :(
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index 11ef9d7d69122..0855a469ae58d 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -347,25 +347,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding):
     assert not handle.closed


-def test_memory_map_file_handle_silent_fallback(all_parsers, compression):
-    """
-    Do not fail for buffers with memory_map=True (cannot memory map BytesIO).
-
-    GH 37621
-    """
-    parser = all_parsers
-    expected = DataFrame({"a": [1], "b": [2]})
-
-    handle = BytesIO()
-    expected.to_csv(handle, index=False, compression=compression, mode="wb")
-    handle.seek(0)
-
-    tm.assert_frame_equal(
-        parser.read_csv(handle, memory_map=True, compression=compression),
-        expected,
-    )
-
-
 def test_memory_map_compression(all_parsers, compression):
     """
     Support memory map for compressed files.
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 993f52b00334f..30e3452e8222a 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -236,9 +236,14 @@ def test_parse_encoded_special_characters(encoding):
 def test_encoding_memory_map(all_parsers, encoding):
     # GH40986
     parser = all_parsers
+
+    # add one entry with a special character
+    encoding_ = encoding or "utf-8"
+    leonardo = "Léonardo".encode(encoding_, errors="ignore").decode(encoding_)
+
     expected = DataFrame(
         {
-            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
+            "name": ["Raphael", "Donatello", "Miguel Angel", leonardo],
             "mask": ["red", "purple", "orange", "blue"],
             "weapon": ["sai", "bo staff", "nunchunk", "katana"],
         }
@@ -299,17 +304,21 @@ def test_readcsv_memmap_utf8(all_parsers):
     tm.assert_frame_equal(df, dfr)


-def test_not_readable(all_parsers, request):
+@pytest.mark.usefixtures("pyarrow_xfail")
+@pytest.mark.parametrize("mode", ["w+b", "w+t"])
+def test_not_readable(all_parsers, mode, request):
     # GH43439
     parser = all_parsers
-    if parser.engine in ("python", "pyarrow"):
+    if parser.engine == "pyarrow":
         mark = pytest.mark.xfail(
-            reason="SpooledTemporaryFile does only work with the c-engine"
+            reason="SpooledTemporaryFile does not work with the pyarrow engine"
         )
         request.node.add_marker(mark)
-
-    with tempfile.SpooledTemporaryFile() as handle:
-        handle.write(b"abcd")
+    content = b"abcd"
+    if "t" in mode:
+        content = "abcd"
+    with tempfile.SpooledTemporaryFile(mode=mode) as handle:
+        handle.write(content)
         handle.seek(0)
         df = parser.read_csv(handle)
     expected = DataFrame([], columns=["abcd"])
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index cdfc9f71f169c..584cf9e5331dc 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -699,15 +699,15 @@ def test_encoding_mmap(memory_map):

     GH 23254.
     """
     encoding = "iso8859_1"
-    data = BytesIO(" 1 A Ä 2\n".encode(encoding))
-    df = read_fwf(
-        data,
-        header=None,
-        widths=[2, 2, 2, 2],
-        encoding=encoding,
-        memory_map=memory_map,
-    )
-    data.seek(0)
+    with tm.ensure_clean() as path:
+        Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding))
+        df = read_fwf(
+            path,
+            header=None,
+            widths=[2, 2, 2, 2],
+            encoding=encoding,
+            memory_map=memory_map,
+        )
     df_reference = DataFrame([[1, "A", "Ä", 2]])
     tm.assert_frame_equal(df, df_reference)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index a782f8dbbc76d..44b0f4cb88435 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -425,39 +425,45 @@ def test_constructor_bad_file(self, mmap_file):
         err = mmap.error

         with pytest.raises(err, match=msg):
-            icom._MMapWrapper(non_file)
+            icom._maybe_memory_map(non_file, True)

         target = open(mmap_file)
         target.close()

         msg = "I/O operation on closed file"
         with pytest.raises(ValueError, match=msg):
-            icom._MMapWrapper(target)
+            icom._maybe_memory_map(target, True)

     def test_get_attr(self, mmap_file):
         with open(mmap_file) as target:
-            wrapper = icom._MMapWrapper(target)
+            handles = icom.get_handle(target, "r", memory_map=True)

-        attrs = dir(wrapper.mmap)
-        attrs = [attr for attr in attrs if not attr.startswith("__")]
-        attrs.append("__next__")
+        with handles:
+            next_wrapper = handles.handle
+            mmap_wrapper = handles.created_handles[-1]
+            mmap_handle = mmap_wrapper.buffer

-        for attr in attrs:
-            assert hasattr(wrapper, attr)
+            # check _IOWrapper
+            attrs = dir(mmap_handle)
+            attrs = [attr for attr in attrs if not attr.startswith("__")]

-        assert not hasattr(wrapper, "foo")
+            for attr in attrs:
+                assert hasattr(mmap_wrapper, attr)
+
+            assert hasattr(next_wrapper, "__next__")

     def test_next(self, mmap_file):
         with open(mmap_file) as target:
-            wrapper = icom._MMapWrapper(target)
+            handles = icom.get_handle(target, "r", memory_map=True)
             lines = target.readlines()

-        for line in lines:
-            next_line = next(wrapper)
-            assert next_line.strip() == line.strip()
+        with handles:
+            for line in lines:
+                next_line = next(handles.handle)
+                assert next_line.strip() == line.strip()

-        with pytest.raises(StopIteration, match=r"^$"):
-            next(wrapper)
+            with pytest.raises(StopIteration, match=r"^$"):
+                next(handles.handle)

     def test_unknown_engine(self):
         with tm.ensure_clean() as path:
@@ -466,36 +472,38 @@ def test_unknown_engine(self):
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")

-    def test_binary_mode(self):
-        """
-        'encoding' shouldn't be passed to 'open' in binary mode.
-
-        GH 35058
-        """
-        with tm.ensure_clean() as path:
-            df = tm.makeDataFrame()
-            df.to_csv(path, mode="w+b")
-            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
+def test_binary_mode():
+    """
+    'encoding' shouldn't be passed to 'open' in binary mode.
+
+    GH 35058
+    """
+    with tm.ensure_clean() as path:
+        df = tm.makeDataFrame()
+        df.to_csv(path, mode="w+b")
+        tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))

-    @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"])
-    @pytest.mark.parametrize("compression_", ["bz2", "xz"])
-    def test_warning_missing_utf_bom(self, encoding, compression_):
-        """
-        bz2 and xz do not write the byte order mark (BOM) for utf-16/32.
-        https://stackoverflow.com/questions/55171439
-
-        GH 35681
-        """
-        df = tm.makeDataFrame()
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(UnicodeWarning):
-                df.to_csv(path, compression=compression_, encoding=encoding)
-
-            # reading should fail (otherwise we wouldn't need the warning)
-            msg = r"UTF-\d+ stream does not start with BOM"
-            with pytest.raises(UnicodeError, match=msg):
-                pd.read_csv(path, compression=compression_, encoding=encoding)
+
+@pytest.mark.parametrize("encoding", ["utf-16", "utf-32"])
+@pytest.mark.parametrize("compression_", ["bz2", "xz"])
+def test_warning_missing_utf_bom(encoding, compression_):
+    """
+    bz2 and xz do not write the byte order mark (BOM) for utf-16/32.
+
+    https://stackoverflow.com/questions/55171439
+
+    GH 35681
+    """
+    df = tm.makeDataFrame()
+    with tm.ensure_clean() as path:
+        with tm.assert_produces_warning(UnicodeWarning):
+            df.to_csv(path, compression=compression_, encoding=encoding)
+
+        # reading should fail (otherwise we wouldn't need the warning)
+        msg = r"UTF-\d+ stream does not start with BOM"
+        with pytest.raises(UnicodeError, match=msg):
+            pd.read_csv(path, compression=compression_, encoding=encoding)


 def test_is_fsspec_url():
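
A minimal usage sketch of the two behaviours the new tests cover (illustrative only, not part of the patch; it assumes a pandas build that includes this change, and the file contents and variable names are made up):

    # Illustrative sketch, not from the patch itself.
    import tempfile
    from io import BytesIO

    import pandas as pd

    # GH 44748: a SpooledTemporaryFile opened in binary mode can be handed
    # straight to read_csv; before this change it failed.
    with tempfile.SpooledTemporaryFile(mode="w+b") as handle:
        handle.write(b"a,b\n1,2\n")
        handle.seek(0)
        df = pd.read_csv(handle)
    print(df)

    # GH 44766: memory_map=True no longer falls back silently when the buffer
    # cannot be memory-mapped (a BytesIO has no real file descriptor); the error
    # from the failing mmap setup now propagates instead of being swallowed.
    buffer = BytesIO(b"a,b\n1,2\n")
    try:
        pd.read_csv(buffer, memory_map=True)
    except Exception as err:  # exact exception type depends on the failing call
        print(f"memory mapping failed loudly: {err!r}")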