Skip to content

Commit 480d5bd

Browse files
lithomas1, feefladder
authored and committed
ENH: Add BytesIOWrapper (pandas-dev#42669)
1 parent 21ec89a commit 480d5bd

File tree

2 files changed

+95
-3
lines changed

2 files changed

+95
-3
lines changed

pandas/io/common.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
from collections import abc
77
import dataclasses
88
import gzip
9+
import io
910
from io import (
1011
BufferedIOBase,
1112
BytesIO,
1213
RawIOBase,
1314
StringIO,
15+
TextIOBase,
1416
TextIOWrapper,
1517
)
1618
import mmap
@@ -50,7 +52,6 @@
5052

5153
lzma = import_lzma()
5254

53-
5455
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
5556
_VALID_URLS.discard("")
5657

@@ -102,7 +103,7 @@ def close(self) -> None:
102103
avoid closing the potentially user-created buffer.
103104
"""
104105
if self.is_wrapped:
105-
assert isinstance(self.handle, TextIOWrapper)
106+
assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper))
106107
self.handle.flush()
107108
self.handle.detach()
108109
self.created_handles.remove(self.handle)
@@ -712,7 +713,16 @@ def get_handle(
712713

713714
# Convert BytesIO or file objects passed with an encoding
714715
is_wrapped = False
715-
if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
716+
if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
717+
handle = BytesIOWrapper(
718+
handle,
719+
encoding=ioargs.encoding,
720+
)
721+
handles.append(handle)
722+
# the (text) handle is always provided by the caller
723+
# since get_handle would have opened it in binary mode
724+
is_wrapped = True
725+
elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
716726
handle = TextIOWrapper(
717727
# error: Argument 1 to "TextIOWrapper" has incompatible type
718728
# "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
@@ -878,6 +888,46 @@ def __next__(self) -> str:
878888
return newline.lstrip("\n")
879889

880890

891+
# Wrapper that wraps a StringIO buffer and reads bytes from it
892+
# Created for compat with pyarrow read_csv
893+
class BytesIOWrapper(io.BytesIO):
    """
    Expose a text buffer as a readable stream of encoded bytes.

    Created for compatibility with consumers that only accept binary
    file objects (e.g. pyarrow's ``read_csv``).

    Parameters
    ----------
    buffer : StringIO or TextIOBase
        Text stream to read characters from.
    encoding : str, default "utf-8"
        Encoding used to turn the characters into bytes.

    Notes
    -----
    Only ``read`` and ``detach`` are bridged to the wrapped buffer. Attributes
    that ``io.BytesIO`` does not define are looked up on the wrapped buffer
    through ``__getattr__``; attributes that ``io.BytesIO`` does define
    resolve to the (empty) ``io.BytesIO`` base object.
    """

    buffer: StringIO | TextIOBase | None

    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
        # Imported here so the class does not rely on a module-level import.
        import codecs

        super().__init__()
        self.buffer = buffer
        self.encoding = encoding
        # One encoder for the lifetime of the stream: encoding every chunk
        # with ``str.encode`` would re-emit the BOM (utf-16, utf-32,
        # utf-8-sig) or reset the shift state of stateful codecs per chunk.
        self._encoder = codecs.getincrementalencoder(encoding)()
        # Because a character can be represented by more than 1 byte,
        # it is possible that reading will produce more bytes than n.
        # The extra bytes are kept here and served first by the next read.
        self.overflow = b""

    def __getattr__(self, attr: str):
        # Only reached when normal lookup fails.  Go through ``__dict__`` so
        # that a missing or detached ``buffer`` raises AttributeError instead
        # of recursing back into ``__getattr__``.
        buffer = self.__dict__.get("buffer")
        if buffer is None:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )
        return getattr(buffer, attr)

    def read(self, n: int | None = -1) -> bytes:
        """
        Read and return up to ``n`` encoded bytes.

        Parameters
        ----------
        n : int or None, default -1
            Maximum number of bytes to return. ``None`` or a negative value
            reads everything that is left.

        Returns
        -------
        bytes
            At most ``n`` bytes; shorter than ``n`` only once the wrapped
            buffer is exhausted, and empty at end of stream.

        Raises
        ------
        ValueError
            If the wrapped buffer has been detached.
        """
        if self.buffer is None:
            raise ValueError("underlying buffer has been detached")

        if n is None or n < 0:
            # Read entire file/rest of file.
            rest = self._encoder.encode(self.buffer.read(n), True)
            combined = self.overflow + rest
            self.overflow = b""
            return combined

        missing = n - len(self.overflow)
        if missing > 0:
            # Every character encodes to at least one byte, so ``missing``
            # characters are enough to satisfy the request unless the
            # wrapped buffer runs out first.
            text = self.buffer.read(missing)
            at_eof = len(text) < missing
            self.overflow += self._encoder.encode(text, at_eof)

        to_return = self.overflow[:n]
        self.overflow = self.overflow[n:]
        return to_return

    def detach(self) -> StringIO | TextIOBase:
        """
        Separate and return the wrapped text buffer.

        Slightly modified from Python's TextIOWrapper detach method.

        Raises
        ------
        ValueError
            If the buffer is already detached.
        """
        if self.buffer is None:
            raise ValueError("buffer is already detached")
        self.flush()
        buffer = self.buffer
        self.buffer = None
        return buffer
929+
930+
881931
def _maybe_memory_map(
882932
handle: FileOrBuffer,
883933
memory_map: bool,

pandas/tests/io/test_common.py

+42
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,48 @@ def test_get_handle_with_buffer(self):
135135
assert not input_buffer.closed
136136
input_buffer.close()
137137

138+
# Test that BytesIOWrapper(get_handle) returns correct amount of bytes every time
139+
def test_bytesiowrapper_returns_correct_bytes(self):
    """Chunked binary reads of a wrapped StringIO never exceed the chunk size."""
    # Sample mixes 1-byte, 2-byte and 4-byte UTF-8 characters
    # (latin1, ucs-2, and ucs-4 chars)
    data = """a,b,c
1,2,3
©,®,®
Look,a snake,🐍"""
    chunksize = 5
    with icom.get_handle(StringIO(data), "rb", is_text=False) as handles:
        reader = handles.handle
        pieces = []
        reached_eof = False
        while not reached_eof:
            chunk = reader.read(chunksize)
            # A chunk may never be larger than what was asked for
            assert len(chunk) <= chunksize
            if len(chunk) < chunksize:
                # A short chunk is only allowed at EOF, in which case
                # a follow-up read must come back empty
                assert len(reader.read()) == 0
                reached_eof = True
            pieces.append(chunk)
        result = b"".join(pieces)
        assert result == data.encode("utf-8")
160+
161+
# Test that pyarrow can handle a file opened with get_handle
162+
@td.skip_if_no("pyarrow", min_version="0.15.0")
def test_get_handle_pyarrow_compat(self):
    """pyarrow's CSV reader can consume a text buffer opened via get_handle."""
    from pyarrow import csv as pa_csv

    # Sample mixes latin1, ucs-2, and ucs-4 chars
    data = """a,b,c
1,2,3
©,®,®
Look,a snake,🐍"""
    expected_columns = {
        "a": ["1", "©", "Look"],
        "b": ["2", "®", "a snake"],
        "c": ["3", "®", "🐍"],
    }
    expected = pd.DataFrame(expected_columns)
    text_buffer = StringIO(data)
    with icom.get_handle(text_buffer, "rb", is_text=False) as handles:
        table = pa_csv.read_csv(handles.handle)
        df = table.to_pandas()
        tm.assert_frame_equal(df, expected)
        # The caller-provided buffer must stay open
        assert not text_buffer.closed
179+
138180
def test_iterator(self):
139181
with pd.read_csv(StringIO(self.data1), chunksize=1) as reader:
140182
result = pd.concat(reader, ignore_index=True)

0 commit comments

Comments
 (0)