From 5e3c3e33c8ddd394c1a8411ba5cda1f7e12e68fc Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Wed, 14 Aug 2019 20:40:23 -0400 Subject: [PATCH 1/9] BUG: Help python csv engine read binary buffers The file buffer given to read_csv could have been opened in binary mode, but the python csv reader errors on binary buffers. closes #23779 --- pandas/io/common.py | 6 +++--- pandas/tests/io/parser/test_python_parser_only.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e01e473047b88..3497bb7862a29 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import csv import gzip from http.client import HTTPException # noqa -from io import BytesIO +from io import BufferedIOBase, BytesIO import lzma import mmap import os @@ -342,9 +342,9 @@ def _get_handle( try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) + need_text_wrapping = (BufferedIOBase, BytesIO, S3File) except ImportError: - need_text_wrapping = (BytesIO,) + need_text_wrapping = (BufferedIOBase, BytesIO) handles = list() f = path_or_buf diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5b381e43e3e19..6557cbfdc9ca9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -296,3 +296,10 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +def test_binary_buffer(python_parser_only, csv1): + # see gh-23779 + parser = python_parser_only + with open(csv1, "rb") as f: + parser.read_csv(f) From 0ac1bcebc03d77718514be161513f1f1ba6d559e Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 00:13:49 -0400 Subject: [PATCH 2/9] whatsnew entry --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 21f1fa7ddec1f..a3de14a2a40f9 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -105,7 +105,7 @@ I/O ^^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) -- +- read_csv now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Plotting From 7e270d259395dcdd350d5ae7600ed14ec1897d4c Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 12:04:22 -0400 Subject: [PATCH 3/9] satisfy gh-14418 for binary mode files --- pandas/io/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 3497bb7862a29..862ff4e090bc6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -346,6 +346,8 @@ def _get_handle( except ImportError: need_text_wrapping = (BufferedIOBase, BytesIO) + no_close = (BufferedIOBase) + handles = list() f = path_or_buf @@ -420,8 +422,10 @@ def _get_handle( if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding, newline="") - handles.append(f) + g = TextIOWrapper(f, encoding=encoding, newline="") + if not isinstance(f, no_close): + handles.append(g) + f = g if memory_map and hasattr(f, "fileno"): try: From b3b9d2f518c190791cbf86e9ffa79c3661e28078 Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 12:05:02 -0400 Subject: [PATCH 4/9] update tests per requested changes test both csv engines, assert equality between ascii and binary modes, colocate with other buffer tests --- pandas/tests/io/parser/test_common.py | 28 +++++++++++++++++-- .../io/parser/test_python_parser_only.py | 7 ----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b94d5cd497ccf..587cca6192c16 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2020,9 +2020,31 @@ def test_file_handles_with_open(all_parsers, csv1): # Don't close user provided file handles. parser = all_parsers - with open(csv1, "r") as f: - parser.read_csv(f) - assert not f.closed + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +@pytest.mark.parametrize( + "fname,encoding", [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis") + ] +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. + parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + + with open(fpath, mode="r", encoding=encoding) as fa: + df_a = parser.read_csv(fa) + with open(fpath, mode="rb") as fb: + df_b = parser.read_csv(fb, encoding=encoding) + + tm.assert_frame_equal(df_b, df_a) def test_invalid_file_buffer_class(all_parsers): diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 6557cbfdc9ca9..5b381e43e3e19 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -296,10 +296,3 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) - - -def test_binary_buffer(python_parser_only, csv1): - # see gh-23779 - parser = python_parser_only - with open(csv1, "rb") as f: - parser.read_csv(f) From 0a60af245766b10d38baed5d51b4add788ab7998 Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 12:07:51 -0400 Subject: [PATCH 5/9] move whatsnew note to 1.0 --- doc/source/whatsnew/v0.25.1.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index a3de14a2a40f9..21f1fa7ddec1f 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -105,7 +105,7 @@ I/O ^^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) -- read_csv now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) +- - Plotting diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index aeed3668fe774..39e640dcc4935 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -157,7 +157,7 @@ MultiIndex I/O ^^^ -- +- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Plotting From 9d73aaa534d858fcdb0fa2860309fa99a80a325a Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 12:09:35 -0400 Subject: [PATCH 6/9] black formatting --- pandas/io/common.py | 2 +- pandas/tests/io/parser/test_common.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 862ff4e090bc6..37d7cb988e46b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -346,7 +346,7 @@ def _get_handle( except ImportError: need_text_wrapping = (BufferedIOBase, BytesIO) - no_close = (BufferedIOBase) + no_close = BufferedIOBase handles = list() f = path_or_buf diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 587cca6192c16..daae92a1e5c8d 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2027,11 +2027,12 @@ def test_file_handles_with_open(all_parsers, csv1): @pytest.mark.parametrize( - "fname,encoding", [ + "fname,encoding", + [ ("test1.csv", "utf-8"), ("unicode_series.csv", "latin-1"), - ("sauron.SHIFT_JIS.csv", "shiftjis") - ] + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], ) def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): # gh-23779: Python csv engine shouldn't error on files opened in binary. From 03bceb1336a00b600d7685fd03a3b9e0196f900c Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 13:30:38 -0400 Subject: [PATCH 7/9] BytesIO is a subclass of BufferedIOBase --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 37d7cb988e46b..5261148a334fc 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -342,9 +342,9 @@ def _get_handle( try: from s3fs import S3File - need_text_wrapping = (BufferedIOBase, BytesIO, S3File) + need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = (BufferedIOBase, BytesIO) + need_text_wrapping = (BufferedIOBase) no_close = BufferedIOBase From ab0878d4dc9356a4b1cd1ba1e18ce2d860d1623c Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Thu, 15 Aug 2019 13:34:04 -0400 Subject: [PATCH 8/9] black formatting --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 5261148a334fc..9fa61ca0cb6de 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -344,7 +344,7 @@ def _get_handle( need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = (BufferedIOBase) + need_text_wrapping = BufferedIOBase no_close = BufferedIOBase From d6625325c5930d5c4888056a6feb0a64c4037cc9 Mon Sep 17 00:00:00 2001 From: Avi Kelman Date: Fri, 16 Aug 2019 12:03:40 -0400 Subject: [PATCH 9/9] remove no_close and compare with non-open read --- pandas/io/common.py | 4 +--- pandas/tests/io/parser/test_common.py | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 9fa61ca0cb6de..25d62effb8d44 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -346,8 +346,6 @@ def _get_handle( except ImportError: need_text_wrapping = BufferedIOBase - no_close = BufferedIOBase - handles = list() f = path_or_buf @@ -423,7 +421,7 @@ def _get_handle( from io import TextIOWrapper g = TextIOWrapper(f, encoding=encoding, newline="") - if not isinstance(f, no_close): + if not isinstance(f, BufferedIOBase): handles.append(g) f = g diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index daae92a1e5c8d..e5366a8357adb 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2039,13 +2039,15 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): parser = all_parsers fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) with open(fpath, mode="r", encoding=encoding) as fa: - df_a = parser.read_csv(fa) - with open(fpath, mode="rb") as fb: - df_b = parser.read_csv(fb, encoding=encoding) + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) - tm.assert_frame_equal(df_b, df_a) + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) def test_invalid_file_buffer_class(all_parsers):