Skip to content

ENH: support SpooledTemporaryFile #44761

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,7 @@ I/O
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
- Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
- Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`)
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
-

Period
Expand Down
76 changes: 47 additions & 29 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from collections import abc
import dataclasses
import gzip
import io
from io import (
BufferedIOBase,
BytesIO,
Expand All @@ -18,7 +17,6 @@
import mmap
import os
from pathlib import Path
import tempfile
from typing import (
IO,
Any,
Expand Down Expand Up @@ -104,7 +102,7 @@ def close(self) -> None:
avoid closing the potentially user-created buffer.
"""
if self.is_wrapped:
assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper))
assert isinstance(self.handle, TextIOWrapper)
self.handle.flush()
self.handle.detach()
self.created_handles.remove(self.handle)
Expand Down Expand Up @@ -779,20 +777,17 @@ def get_handle(
# Convert BytesIO or file objects passed with an encoding
is_wrapped = False
if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
handle = BytesIOWrapper(
# not added to handles as it does not open/buffer resources
handle = _BytesIOWrapper(
handle,
encoding=ioargs.encoding,
)
handles.append(handle)
# the (text) handle is always provided by the caller
# since get_handle would have opened it in binary mode
is_wrapped = True
elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
handle = TextIOWrapper(
# error: Argument 1 to "TextIOWrapper" has incompatible type
# "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
# expected "IO[bytes]"
handle, # type: ignore[arg-type]
_IOWrapper(handle), # type: ignore[arg-type]
encoding=ioargs.encoding,
errors=errors,
newline="",
Expand Down Expand Up @@ -935,7 +930,7 @@ def __init__(
self.decode = decode

self.attributes = {}
for attribute in ("seekable", "readable", "writeable"):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should have been "writable" but it seems it was never used as we do not support writing to memory-mapped files.

for attribute in ("seekable", "readable"):
if not hasattr(f, attribute):
continue
self.attributes[attribute] = getattr(f, attribute)()
Expand Down Expand Up @@ -976,11 +971,40 @@ def __next__(self) -> str:
return newline.lstrip("\n")


# Wrapper that wraps a StringIO buffer and reads bytes from it
# Created for compat with pyarrow read_csv
class BytesIOWrapper(io.BytesIO):
buffer: StringIO | TextIOBase | None
class _IOWrapper:
# TextIOWrapper is overly strict: it request that the buffer has seekable, readable,
# and writable. If we have a read-only buffer, we shouldn't need writable and vice
# versa. Some buffers, are seek/read/writ-able but they do not have the "-able"
# methods, e.g., tempfile.SpooledTemporaryFile.
# If a buffer does not have the above "-able" methods, we simple assume they are
# seek/read/writ-able.
def __init__(self, buffer: BaseBuffer):
self.buffer = buffer

def __getattr__(self, name: str):
return getattr(self.buffer, name)

def readable(self) -> bool:
if hasattr(self.buffer, "readable"):
# error: "BaseBuffer" has no attribute "readable"
return self.buffer.readable() # type: ignore[attr-defined]
return True

def seekable(self) -> bool:
if hasattr(self.buffer, "seekable"):
return self.buffer.seekable()
return True

def writable(self) -> bool:
if hasattr(self.buffer, "writable"):
# error: "BaseBuffer" has no attribute "writable"
return self.buffer.writable() # type: ignore[attr-defined]
return True


class _BytesIOWrapper:
# Wrapper that wraps a StringIO buffer and reads bytes from it
# Created for compat with pyarrow read_csv
def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
self.buffer = buffer
self.encoding = encoding
Expand All @@ -1006,15 +1030,6 @@ def read(self, n: int | None = -1) -> bytes:
self.overflow = combined_bytestring[n:]
return to_return

def detach(self):
# Slightly modified from Python's TextIOWrapper detach method
if self.buffer is None:
raise ValueError("buffer is already detached")
self.flush()
buffer = self.buffer
self.buffer = None
return buffer


def _maybe_memory_map(
handle: str | BaseBuffer,
Expand Down Expand Up @@ -1042,10 +1057,15 @@ def _maybe_memory_map(

# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
wrapped = cast(
BaseBuffer,
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
)
try:
wrapped = cast(
BaseBuffer,
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
)
finally:
for handle in reversed(handles):
# error: "BaseBuffer" has no attribute "close"
handle.close() # type: ignore[attr-defined]
handles.append(wrapped)

return wrapped, memory_map, handles
Expand Down Expand Up @@ -1077,8 +1097,6 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
codecs.StreamWriter,
codecs.StreamReader,
codecs.StreamReaderWriter,
# cannot be wrapped in TextIOWrapper GH43439
tempfile.SpooledTemporaryFile,
)
if issubclass(type(handle), text_classes):
return False
Expand Down
17 changes: 8 additions & 9 deletions pandas/tests/io/parser/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,17 +299,16 @@ def test_readcsv_memmap_utf8(all_parsers):
tm.assert_frame_equal(df, dfr)


def test_not_readable(all_parsers, request):
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
# GH43439
parser = all_parsers
if parser.engine in ("python", "pyarrow"):
mark = pytest.mark.xfail(
reason="SpooledTemporaryFile does only work with the c-engine"
)
request.node.add_marker(mark)

with tempfile.SpooledTemporaryFile() as handle:
handle.write(b"abcd")
content = b"abcd"
if "t" in mode:
content = "abcd"
with tempfile.SpooledTemporaryFile(mode=mode) as handle:
handle.write(content)
handle.seek(0)
df = parser.read_csv(handle)
expected = DataFrame([], columns=["abcd"])
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,4 +607,5 @@ def test_errno_attribute():

def test_fail_mmap():
with pytest.raises(UnsupportedOperation, match="fileno"):
icom.get_handle(BytesIO(), "rb", memory_map=True)
with BytesIO() as buffer:
icom.get_handle(buffer, "rb", memory_map=True)