diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0be4ebc627b30..3ce558ca336c3 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -158,7 +158,7 @@ MultiIndex I/O ^^^ -- +- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Plotting diff --git a/pandas/io/common.py b/pandas/io/common.py index ac57cef372399..26b68dda7b464 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import csv import gzip from http.client import HTTPException # noqa -from io import BytesIO +from io import BufferedIOBase, BytesIO import mmap import os import pathlib @@ -344,9 +344,9 @@ def _get_handle( try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) + need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = (BytesIO,) + need_text_wrapping = BufferedIOBase handles = list() f = path_or_buf @@ -422,8 +422,10 @@ def _get_handle( if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding, newline="") - handles.append(f) + g = TextIOWrapper(f, encoding=encoding, newline="") + if not isinstance(f, BufferedIOBase): + handles.append(g) + f = g if memory_map and hasattr(f, "fileno"): try: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b94d5cd497ccf..e5366a8357adb 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2020,9 +2020,34 @@ def test_file_handles_with_open(all_parsers, csv1): # Don't close user provided file handles. parser = all_parsers - with open(csv1, "r") as f: - parser.read_csv(f) - assert not f.closed + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. + parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) def test_invalid_file_buffer_class(all_parsers):