From 2f80e70136c47cf6331524dae4a31d02a2e4ede4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 4 Dec 2021 17:09:29 -0500 Subject: [PATCH 1/4] ENH: support SpooledTemporaryFile --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/common.py | 53 +++++++++++++------------ pandas/tests/io/parser/test_encoding.py | 17 ++++---- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 78031fd0456f7..7cf8c07683514 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -752,6 +752,7 @@ I/O - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`) +- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - Period diff --git a/pandas/io/common.py b/pandas/io/common.py index 22284e1225062..47c3d5c50386d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,6 @@ from collections import abc import dataclasses import gzip -import io from io import ( BufferedIOBase, BytesIO, @@ -18,7 +17,6 @@ import mmap import os from pathlib import Path -import tempfile from typing import ( IO, Any, @@ -104,7 +102,7 @@ def close(self) -> None: avoid closing the potentially user-created buffer. """ if self.is_wrapped: - assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) + assert isinstance(self.handle, TextIOWrapper) self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) @@ -779,20 +777,17 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): - handle = BytesIOWrapper( + # not added to handles as it does not open/buffer resources + handle = _BytesIOWrapper( handle, encoding=ioargs.encoding, ) - handles.append(handle) - # the (text) handle is always provided by the caller - # since get_handle would have opened it in binary mode - is_wrapped = True elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper( # error: Argument 1 to "TextIOWrapper" has incompatible type # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; # expected "IO[bytes]" - handle, # type: ignore[arg-type] + _IOWrapper(handle), # type: ignore[arg-type] encoding=ioargs.encoding, errors=errors, newline="", @@ -935,7 +930,7 @@ def __init__( self.decode = decode self.attributes = {} - for attribute in ("seekable", "readable", "writeable"): + for attribute in ("seekable", "readable"): if not hasattr(f, attribute): continue self.attributes[attribute] = getattr(f, attribute)() @@ -976,11 +971,30 @@ def __next__(self) -> str: return newline.lstrip("\n") -# Wrapper that wraps a StringIO buffer and reads bytes from it -# Created for compat with pyarrow read_csv -class BytesIOWrapper(io.BytesIO): - buffer: StringIO | TextIOBase | None +class _IOWrapper: + # TextIOWrapper is overly strict: it request that the buffer has seekable, readable, + # and writable. If we have a read-only buffer, we shouldn't need writable and vice + # versa. Some buffers, are seek/read/writ-able but they do not have the "-able" + # methods, e.g., tempfile.SpooledTemporaryFile. + # If a buffer does not have the above "-able" methods, we simple assume they are + # seek/read/writ-able. + def __init__(self, buffer: BaseBuffer): + self.buffer = buffer + self.attributes = tuple( + attr + for attr in ("seekable", "readable", "writable") + if not hasattr(self.buffer, attr) + ) + def __getattr__(self, name: str): + if name in self.attributes: + return lambda: True + return getattr(self.buffer, name) + + +class _BytesIOWrapper: + # Wrapper that wraps a StringIO buffer and reads bytes from it + # Created for compat with pyarrow read_csv def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): self.buffer = buffer self.encoding = encoding @@ -1006,15 +1020,6 @@ def read(self, n: int | None = -1) -> bytes: self.overflow = combined_bytestring[n:] return to_return - def detach(self): - # Slightly modified from Python's TextIOWrapper detach method - if self.buffer is None: - raise ValueError("buffer is already detached") - self.flush() - buffer = self.buffer - self.buffer = None - return buffer - def _maybe_memory_map( handle: str | BaseBuffer, @@ -1077,8 +1082,6 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool: codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter, - # cannot be wrapped in TextIOWrapper GH43439 - tempfile.SpooledTemporaryFile, ) if issubclass(type(handle), text_classes): return False diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 993f52b00334f..2b27332c7e85b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -299,17 +299,16 @@ def test_readcsv_memmap_utf8(all_parsers): tm.assert_frame_equal(df, dfr) -def test_not_readable(all_parsers, request): +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("mode", ["w+b", "w+t"]) +def test_not_readable(all_parsers, mode): # GH43439 parser = all_parsers - if parser.engine in ("python", "pyarrow"): - mark = pytest.mark.xfail( - reason="SpooledTemporaryFile does only work with the c-engine" - ) - request.node.add_marker(mark) - - with tempfile.SpooledTemporaryFile() as handle: - handle.write(b"abcd") + content = b"abcd" + if "t" in mode: + content = "abcd" + with tempfile.SpooledTemporaryFile(mode=mode) as handle: + handle.write(content) handle.seek(0) df = parser.read_csv(handle) expected = DataFrame([], columns=["abcd"]) From 5fb607032c10d49f95beb81deec7571f3e19eab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 7 Dec 2021 18:54:10 -0500 Subject: [PATCH 2/4] explicit methods --- pandas/io/common.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 47c3d5c50386d..b39265ea02cc4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -980,17 +980,25 @@ class _IOWrapper: # seek/read/writ-able. def __init__(self, buffer: BaseBuffer): self.buffer = buffer - self.attributes = tuple( - attr - for attr in ("seekable", "readable", "writable") - if not hasattr(self.buffer, attr) - ) def __getattr__(self, name: str): - if name in self.attributes: - return lambda: True return getattr(self.buffer, name) + def readable(self) -> bool: + if hasattr(self.buffer, "readable"): + return self.buffer.readable() + return True + + def seekable(self) -> bool: + if hasattr(self.buffer, "seekable"): + return self.buffer.seekable() + return True + + def writable(self) -> bool: + if hasattr(self.buffer, "writable"): + return self.buffer.writable() + return True + class _BytesIOWrapper: # Wrapper that wraps a StringIO buffer and reads bytes from it From 2ddcd12cfa7e8ca55121b70dbdd1ce4e0f2f44e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 7 Dec 2021 21:17:55 -0500 Subject: [PATCH 3/4] mypy --- pandas/io/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index b39265ea02cc4..9de913e5e3091 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -986,7 +986,8 @@ def __getattr__(self, name: str): def readable(self) -> bool: if hasattr(self.buffer, "readable"): - return self.buffer.readable() + # error: "BaseBuffer" has no attribute "readable" + return self.buffer.readable() # type: ignore[attr-defined] return True def seekable(self) -> bool: @@ -996,7 +997,8 @@ def seekable(self) -> bool: def writable(self) -> bool: if hasattr(self.buffer, "writable"): - return self.buffer.writable() + # error: "BaseBuffer" has no attribute "writable" + return self.buffer.writable() # type: ignore[attr-defined] return True From d66ce1d58f74fdc803a66a6a60fb8d2efb5b2baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 10 Dec 2021 18:15:26 -0500 Subject: [PATCH 4/4] maybe prevent resource warnings --- pandas/io/common.py | 13 +++++++++---- pandas/tests/io/test_common.py | 3 ++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 9de913e5e3091..e12a7348b0075 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1057,10 +1057,15 @@ def _maybe_memory_map( # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" - wrapped = cast( - BaseBuffer, - _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] - ) + try: + wrapped = cast( + BaseBuffer, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) + finally: + for handle in reversed(handles): + # error: "BaseBuffer" has no attribute "close" + handle.close() # type: ignore[attr-defined] handles.append(wrapped) return wrapped, memory_map, handles diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 94a1d614880e4..a00268d82a57d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -607,4 +607,5 @@ def test_errno_attribute(): def test_fail_mmap(): with pytest.raises(UnsupportedOperation, match="fileno"): - icom.get_handle(BytesIO(), "rb", memory_map=True) + with BytesIO() as buffer: + icom.get_handle(buffer, "rb", memory_map=True)