From 2a16522fbb05ee28e36cc27d2ad4566190277ad1 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 22 Jul 2021 11:13:23 -0700 Subject: [PATCH 1/5] ENH: Add BytesIOWrapper --- pandas/io/common.py | 68 ++++++++++++++++++++++++++++------ pandas/tests/io/test_common.py | 41 ++++++++++++++++++++ 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 06b00a9cbb4eb..392c869196940 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,11 +6,13 @@ from collections import abc import dataclasses import gzip +import io from io import ( BufferedIOBase, BytesIO, RawIOBase, StringIO, + TextIOBase, TextIOWrapper, ) import mmap @@ -50,7 +52,6 @@ lzma = import_lzma() - _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") @@ -102,7 +103,7 @@ def close(self) -> None: avoid closing the potentially user-created buffer. """ if self.is_wrapped: - assert isinstance(self.handle, TextIOWrapper) + assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) @@ -713,15 +714,23 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): - handle = TextIOWrapper( - # error: Argument 1 to "TextIOWrapper" has incompatible type - # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; - # expected "IO[bytes]" - handle, # type: ignore[arg-type] - encoding=ioargs.encoding, - errors=errors, - newline="", - ) + # Use our custom BytesIOWrapper + # when reading text to bytes, otherwise use TextIOWrapper + if ioargs.mode == "rb" and isinstance(handle, TextIOBase): + handle = BytesIOWrapper( + handle, + encoding=ioargs.encoding, + ) + else: + handle = TextIOWrapper( + # error: Argument 1 to "TextIOWrapper" has incompatible type + # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; # noqa: E501 + # expected "IO[bytes]" + handle, # type: ignore[arg-type] + encoding=ioargs.encoding, + errors=errors, + newline="", + ) handles.append(handle) # only marked as wrapped when the caller provided a handle is_wrapped = not ( @@ -878,6 +887,43 @@ def __next__(self) -> str: return newline.lstrip("\n") +# Wrapper that wraps a StringIO buffer and reads bytes from it +# Created for compat with pyarrow read_csv +class BytesIOWrapper(io.BytesIO): + def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): + self.buffer = buffer + self.encoding = encoding + # Because a character can be represented by more than 1 byte, + # it is possible that reading will produce more bytes than n + # We store the extra bytes in this overflow variable, and append the + # overflow to the front of the bytestring the next time reading is performed + self.overflow = b"" + + def __getattr__(self, attr: str): + return getattr(self.buffer, attr) + + def read(self, n: int | None = -1) -> bytes: + bytestring = self.buffer.read(n).encode(self.encoding) + # When n=-1/n greater than remaining bytes: Read entire file/rest of file + combined_bytestring = self.overflow + bytestring + if n == -1 or n >= len(combined_bytestring): + self.overflow = b"" + return combined_bytestring + else: + to_return = combined_bytestring[:n] + self.overflow = combined_bytestring[n:] + return to_return + + def detach(self): + # Slightly modified from Python's TextIOWrapper detach method + if self.buffer is None: + raise ValueError("buffer is already detached") + self.flush() + buffer = self.buffer + self.buffer = None + return buffer + + def _maybe_memory_map( handle: FileOrBuffer, memory_map: bool, diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d52ea01ac35de..4749c43fb6368 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -135,6 +135,47 @@ def test_get_handle_with_buffer(self): assert not input_buffer.closed input_buffer.close() + # Test that BytesIOWrapper(get_handle) returns correct amount of bytes every time + def test_bytesiowrapper_returns_correct_bytes(self): + # Test latin1, ucs-2, and ucs-4 chars + data = """a,b,c +1,2,3 +©,®,® +Look,a snake,🐍""" + with icom.get_handle(StringIO(data), "rb") as handles: + result = b"" + while True: + chunk = handles.handle.read(5) + # Make sure each chunk is correct amount of bytes + assert len(chunk) <= 5 + if len(chunk) < 5: + # Can be less amount of bytes, but only at EOF + # which happens when read returns empty + assert len(handles.handle.read()) == 0 + result += chunk + break + result += chunk + assert result == data.encode("utf-8") + + # Test that pyarrow can handle a file opened with get_handle + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_get_handle_pyarrow_compat(self): + from pyarrow import csv + + # Test latin1, ucs-2, and ucs-4 chars + data = """a,b,c +1,2,3 +©,®,® +Look,a snake,🐍""" + expected = pd.DataFrame( + {"a": ["1", "©", "Look"], "b": ["2", "®", "a snake"], "c": ["3", "®", "🐍"]} + ) + s = StringIO(data) + with icom.get_handle(s, "rb") as handles: + df = csv.read_csv(handles.handle).to_pandas() + tm.assert_frame_equal(df, expected) + assert not s.closed + def test_iterator(self): with pd.read_csv(StringIO(self.data1), chunksize=1) as reader: result = pd.concat(reader, ignore_index=True) From 43252cca5d44f4b5aa17b648ee208bdb90f4b729 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 22 Jul 2021 14:41:37 -0700 Subject: [PATCH 2/5] mypy --- pandas/io/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 392c869196940..b03c481b623bd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -890,6 +890,8 @@ def __next__(self) -> str: # Wrapper that wraps a StringIO buffer and reads bytes from it # Created for compat with pyarrow read_csv class BytesIOWrapper(io.BytesIO): + buffer: StringIO | TextIOBase | None + def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): self.buffer = buffer self.encoding = encoding @@ -903,10 +905,11 @@ def __getattr__(self, attr: str): return getattr(self.buffer, attr) def read(self, n: int | None = -1) -> bytes: + assert self.buffer is not None bytestring = self.buffer.read(n).encode(self.encoding) # When n=-1/n greater than remaining bytes: Read entire file/rest of file combined_bytestring = self.overflow + bytestring - if n == -1 or n >= len(combined_bytestring): + if n is None or n < 0 or n >= len(combined_bytestring): self.overflow = b"" return combined_bytestring else: From 4e998a6df4609e8b75cb71cc76383a4552b6d417 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 23 Jul 2021 15:44:12 -0700 Subject: [PATCH 3/5] refactor --- pandas/io/common.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index b03c481b623bd..add303b91371a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -713,24 +713,26 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False - if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): - # Use our custom BytesIOWrapper - # when reading text to bytes, otherwise use TextIOWrapper - if ioargs.mode == "rb" and isinstance(handle, TextIOBase): - handle = BytesIOWrapper( - handle, - encoding=ioargs.encoding, - ) - else: - handle = TextIOWrapper( - # error: Argument 1 to "TextIOWrapper" has incompatible type - # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; # noqa: E501 - # expected "IO[bytes]" - handle, # type: ignore[arg-type] - encoding=ioargs.encoding, - errors=errors, - newline="", - ) + if ioargs.mode == "rb" and isinstance(handle, TextIOBase): + handle = BytesIOWrapper( + handle, + encoding=ioargs.encoding, + ) + handles.append(handle) + # Handle is always provided by caller since + # they are trying to read bytes from string + # buffer + is_wrapped = True + elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): + handle = TextIOWrapper( + # error: Argument 1 to "TextIOWrapper" has incompatible type + # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; + # expected "IO[bytes]" + handle, # type: ignore[arg-type] + encoding=ioargs.encoding, + errors=errors, + newline="", + ) handles.append(handle) # only marked as wrapped when the caller provided a handle is_wrapped = not ( From 392c48bc9f9786d8d94349dabd277a2607aa4e25 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 26 Jul 2021 08:41:30 -0700 Subject: [PATCH 4/5] fixes and clean test --- pandas/io/common.py | 2 +- pandas/tests/io/test_common.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index add303b91371a..3682abefa2761 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -713,7 +713,7 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False - if ioargs.mode == "rb" and isinstance(handle, TextIOBase): + if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): handle = BytesIOWrapper( handle, encoding=ioargs.encoding, diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4749c43fb6368..b48d676cd0f8a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -142,13 +142,14 @@ def test_bytesiowrapper_returns_correct_bytes(self): 1,2,3 ©,®,® Look,a snake,🐍""" - with icom.get_handle(StringIO(data), "rb") as handles: + with icom.get_handle(StringIO(data), "rb", is_text=False) as handles: result = b"" + chunksize = 5 while True: - chunk = handles.handle.read(5) + chunk = handles.handle.read(chunksize) # Make sure each chunk is correct amount of bytes - assert len(chunk) <= 5 - if len(chunk) < 5: + assert len(chunk) <= chunksize + if len(chunk) < chunksize: # Can be less amount of bytes, but only at EOF # which happens when read returns empty assert len(handles.handle.read()) == 0 @@ -171,7 +172,7 @@ def test_get_handle_pyarrow_compat(self): {"a": ["1", "©", "Look"], "b": ["2", "®", "a snake"], "c": ["3", "®", "🐍"]} ) s = StringIO(data) - with icom.get_handle(s, "rb") as handles: + with icom.get_handle(s, "rb", is_text=False) as handles: df = csv.read_csv(handles.handle).to_pandas() tm.assert_frame_equal(df, expected) assert not s.closed From 0a58afc5581fe6f76ef8004001ebbccbc69545a4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 28 Jul 2021 08:26:57 -0700 Subject: [PATCH 5/5] Update common.py --- pandas/io/common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 3682abefa2761..4e97eaf8b953c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -719,9 +719,8 @@ def get_handle( encoding=ioargs.encoding, ) handles.append(handle) - # Handle is always provided by caller since - # they are trying to read bytes from string - # buffer + # the (text) handle is always provided by the caller + # since get_handle would have opened it in binary mode is_wrapped = True elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper(