diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 78031fd0456f7..7cf8c07683514 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -752,6 +752,7 @@ I/O - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`) +- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - Period diff --git a/pandas/io/common.py b/pandas/io/common.py index 22284e1225062..e12a7348b0075 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,6 @@ from collections import abc import dataclasses import gzip -import io from io import ( BufferedIOBase, BytesIO, @@ -18,7 +17,6 @@ import mmap import os from pathlib import Path -import tempfile from typing import ( IO, Any, @@ -104,7 +102,7 @@ def close(self) -> None: avoid closing the potentially user-created buffer. """ if self.is_wrapped: - assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) + assert isinstance(self.handle, TextIOWrapper) self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) @@ -779,20 +777,17 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): - handle = BytesIOWrapper( + # not added to handles as it does not open/buffer resources + handle = _BytesIOWrapper( handle, encoding=ioargs.encoding, ) - handles.append(handle) - # the (text) handle is always provided by the caller - # since get_handle would have opened it in binary mode - is_wrapped = True elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper( # error: Argument 1 to "TextIOWrapper" has incompatible type # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; # expected "IO[bytes]" - handle, # type: ignore[arg-type] + _IOWrapper(handle), # type: ignore[arg-type] encoding=ioargs.encoding, errors=errors, newline="", @@ -935,7 +930,7 @@ def __init__( self.decode = decode self.attributes = {} - for attribute in ("seekable", "readable", "writeable"): + for attribute in ("seekable", "readable"): if not hasattr(f, attribute): continue self.attributes[attribute] = getattr(f, attribute)() @@ -976,11 +971,40 @@ def __next__(self) -> str: return newline.lstrip("\n") -# Wrapper that wraps a StringIO buffer and reads bytes from it -# Created for compat with pyarrow read_csv -class BytesIOWrapper(io.BytesIO): - buffer: StringIO | TextIOBase | None +class _IOWrapper: + # TextIOWrapper is overly strict: it request that the buffer has seekable, readable, + # and writable. If we have a read-only buffer, we shouldn't need writable and vice + # versa. Some buffers, are seek/read/writ-able but they do not have the "-able" + # methods, e.g., tempfile.SpooledTemporaryFile. + # If a buffer does not have the above "-able" methods, we simple assume they are + # seek/read/writ-able. + def __init__(self, buffer: BaseBuffer): + self.buffer = buffer + + def __getattr__(self, name: str): + return getattr(self.buffer, name) + + def readable(self) -> bool: + if hasattr(self.buffer, "readable"): + # error: "BaseBuffer" has no attribute "readable" + return self.buffer.readable() # type: ignore[attr-defined] + return True + + def seekable(self) -> bool: + if hasattr(self.buffer, "seekable"): + return self.buffer.seekable() + return True + + def writable(self) -> bool: + if hasattr(self.buffer, "writable"): + # error: "BaseBuffer" has no attribute "writable" + return self.buffer.writable() # type: ignore[attr-defined] + return True + +class _BytesIOWrapper: + # Wrapper that wraps a StringIO buffer and reads bytes from it + # Created for compat with pyarrow read_csv def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): self.buffer = buffer self.encoding = encoding @@ -1006,15 +1030,6 @@ def read(self, n: int | None = -1) -> bytes: self.overflow = combined_bytestring[n:] return to_return - def detach(self): - # Slightly modified from Python's TextIOWrapper detach method - if self.buffer is None: - raise ValueError("buffer is already detached") - self.flush() - buffer = self.buffer - self.buffer = None - return buffer - def _maybe_memory_map( handle: str | BaseBuffer, @@ -1042,10 +1057,15 @@ def _maybe_memory_map( # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" - wrapped = cast( - BaseBuffer, - _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] - ) + try: + wrapped = cast( + BaseBuffer, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) + finally: + for handle in reversed(handles): + # error: "BaseBuffer" has no attribute "close" + handle.close() # type: ignore[attr-defined] handles.append(wrapped) return wrapped, memory_map, handles @@ -1077,8 +1097,6 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool: codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter, - # cannot be wrapped in TextIOWrapper GH43439 - tempfile.SpooledTemporaryFile, ) if issubclass(type(handle), text_classes): return False diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 993f52b00334f..2b27332c7e85b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -299,17 +299,16 @@ def test_readcsv_memmap_utf8(all_parsers): tm.assert_frame_equal(df, dfr) -def test_not_readable(all_parsers, request): +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("mode", ["w+b", "w+t"]) +def test_not_readable(all_parsers, mode): # GH43439 parser = all_parsers - if parser.engine in ("python", "pyarrow"): - mark = pytest.mark.xfail( - reason="SpooledTemporaryFile does only work with the c-engine" - ) - request.node.add_marker(mark) - - with tempfile.SpooledTemporaryFile() as handle: - handle.write(b"abcd") + content = b"abcd" + if "t" in mode: + content = "abcd" + with tempfile.SpooledTemporaryFile(mode=mode) as handle: + handle.write(content) handle.seek(0) df = parser.read_csv(handle) expected = DataFrame([], columns=["abcd"]) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 94a1d614880e4..a00268d82a57d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -607,4 +607,5 @@ def test_errno_attribute(): def test_fail_mmap(): with pytest.raises(UnsupportedOperation, match="fileno"): - icom.get_handle(BytesIO(), "rb", memory_map=True) + with BytesIO() as buffer: + icom.get_handle(buffer, "rb", memory_map=True)