From bfc8e96d31cb2d0315564f888e262df8e671afdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 1 Dec 2020 21:42:21 -0500 Subject: [PATCH 1/2] ENH: context-manager for chunksize/iterator-reader --- doc/source/user_guide/io.rst | 41 +++---- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/html.py | 5 +- pandas/io/json/_json.py | 12 +- pandas/io/parsers.py | 20 +++- pandas/io/sas/sasreader.py | 20 +++- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_readlines.py | 37 +++--- pandas/tests/io/parser/test_c_parser_only.py | 12 +- pandas/tests/io/parser/test_common.py | 118 ++++++++++++------- pandas/tests/io/parser/test_dtypes.py | 16 +-- pandas/tests/io/parser/test_network.py | 46 ++++---- pandas/tests/io/parser/test_parse_dates.py | 6 +- pandas/tests/io/parser/test_textreader.py | 6 +- pandas/tests/io/sas/test_sas7bdat.py | 39 +++--- pandas/tests/io/sas/test_xport.py | 41 +++---- pandas/tests/io/test_common.py | 12 +- pandas/tests/io/test_stata.py | 10 +- pandas/tests/reshape/concat/test_invalid.py | 4 +- 19 files changed, 262 insertions(+), 190 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1bd35131622ab..cbedb3b6c2ece 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1577,19 +1577,20 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - reader = pd.read_csv("tmp.sv", sep="|", chunksize=4) - reader - - for chunk in reader: - print(chunk) + with pd.read_csv("tmp.sv", sep="|", chunksize=4) as reader: + reader + for chunk in reader: + print(chunk) +.. versionchanged:: 1.2 + ``read_csv/json/sas`` return a context-manager when iterating through a file. Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - reader = pd.read_csv("tmp.sv", sep="|", iterator=True) - reader.get_chunk(5) + with pd.read_csv("tmp.sv", sep="|", iterator=True) as reader: + reader.get_chunk(5) .. ipython:: python :suppress: @@ -2238,10 +2239,10 @@ For line-delimited json files, pandas can also return an iterator which reads in df.to_json(orient="records", lines=True) # reader is an iterator that returns ``chunksize`` lines each iteration - reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) - reader - for chunk in reader: - print(chunk) + with pd.read_json(StringIO(jsonl), lines=True, chunksize=1) as reader: + reader + for chunk in reader: + print(chunk) .. _io.table_schema: @@ -5471,9 +5472,9 @@ object can be used as an iterator. .. ipython:: python - reader = pd.read_stata("stata.dta", chunksize=3) - for df in reader: - print(df.shape) + with pd.read_stata("stata.dta", chunksize=3) as reader: + for df in reader: + print(df.shape) For more fine-grained control, use ``iterator=True`` and specify ``chunksize`` with each call to @@ -5481,9 +5482,9 @@ For more fine-grained control, use ``iterator=True`` and specify .. ipython:: python - reader = pd.read_stata("stata.dta", iterator=True) - chunk1 = reader.read(5) - chunk2 = reader.read(5) + with pd.read_stata("stata.dta", iterator=True) as reader: + chunk1 = reader.read(5) + chunk2 = reader.read(5) Currently the ``index`` is retrieved as a column. 
@@ -5595,9 +5596,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: pass - rdr = pd.read_sas("sas_xport.xpt", chunk=100000) - for chunk in rdr: - do_something(chunk) + with pd.read_sas("sas_xport.xpt", chunk=100000) as rdr: + for chunk in rdr: + do_something(chunk) The specification_ for the xport file format is available from the SAS web site. diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8182dfa4bce40..54f73d5b57c7c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -291,6 +291,7 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) +- :func:`read_csv/sas/json` return a context manager when called with ``chunksize``/``iterator`` (:issue:`38225`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/html.py b/pandas/io/html.py index 334a3dab6c13a..4a2d4af62f3e9 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -794,9 +794,8 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, **kwargs) - df = tp.read() - return df + with TextParser(body, header=header, **kwargs) as tp: + return tp.read() _valid_parsers = { diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b1d705439e300..3ec5238f1f1c7 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -437,6 +437,9 @@ def read_json( This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. + .. versionchanged:: 1.2 + ``JsonReader`` is a context manager. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -555,7 +558,8 @@ def read_json( if chunksize: return json_reader - return json_reader.read() + with json_reader: + return json_reader.read() class JsonReader(abc.Iterator): @@ -747,6 +751,12 @@ def __next__(self): self.close() raise StopIteration + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + class Parser: _split_keys: Tuple[str, ...] diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9cdc9871e1e07..8d5340f894f1e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -276,11 +276,17 @@ iterator : bool, default False Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. + + .. versionchanged:: 1.2 + ``TextFileReader`` is a context manager. chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs `_ for more information on ``iterator`` and ``chunksize``. + + .. versionchanged:: 1.2 + ``TextFileReader`` is a context manager. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data.
If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the @@ -451,12 +457,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if chunksize or iterator: return parser - try: - data = parser.read(nrows) - finally: - parser.close() - - return data + with parser: + return parser.read(nrows) _parser_defaults = { @@ -1074,6 +1076,12 @@ def get_chunk(self, size=None): size = min(size, self.nrows - self._currow) return self.read(nrows=size) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + def _is_index_col(col): return col is not None and col is not False diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index d9cb9902b930a..243218129fda6 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -6,7 +6,7 @@ from pandas._typing import FilePathOrBuffer, Label -from pandas.io.common import IOHandles, stringify_path +from pandas.io.common import stringify_path if TYPE_CHECKING: from pandas import DataFrame @@ -18,8 +18,6 @@ class ReaderBase(metaclass=ABCMeta): Protocol for XportReader and SAS7BDATReader classes. """ - handles: IOHandles - @abstractmethod def read(self, nrows=None): pass @@ -28,6 +26,12 @@ def read(self, nrows=None): def close(self): pass + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + @overload def read_sas( @@ -87,9 +91,17 @@ def read_sas( Encoding for text data. If None, text data are stored as raw bytes. chunksize : int Read file `chunksize` lines at a time, returns iterator. + + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. iterator : bool, defaults to False If True, returns an iterator for reading the file incrementally. + .. versionchanged:: 1.2 + + ``TextFileReader`` is a context manager. 
+ Returns ------- DataFrame if iterator=False and chunksize=None, else SAS7BDATReader @@ -136,5 +148,5 @@ def read_sas( if iterator or chunksize: return reader - with reader.handles: + with reader: return reader.read() diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index a41af9886c617..5faca6bd89dad 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -65,8 +65,10 @@ def test_chunksize_with_compression(compression): df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') df.to_json(path, orient="records", lines=True, compression=compression) - res = pd.read_json(path, lines=True, chunksize=1, compression=compression) - roundtripped_df = pd.concat(res) + with pd.read_json( + path, lines=True, chunksize=1, compression=compression + ) as res: + roundtripped_df = pd.concat(res) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 2e68d3306c7d1..4bbd81ada995b 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -77,8 +77,8 @@ def test_readjson_chunks(lines_json_df, chunksize): # GH17048: memory usage when lines=True unchunked = read_json(StringIO(lines_json_df), lines=True) - reader = read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) - chunked = pd.concat(reader) + with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader: + chunked = pd.concat(reader) tm.assert_frame_equal(chunked, unchunked) @@ -86,7 +86,8 @@ def test_readjson_chunks(lines_json_df, chunksize): def test_readjson_chunksize_requires_lines(lines_json_df): msg = "chunksize can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) + with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: + pass def test_readjson_chunks_series(): @@ -97,7 +98,8 @@ def test_readjson_chunks_series(): unchunked = pd.read_json(strio, lines=True, typ="Series") strio = StringIO(s.to_json(lines=True, orient="records")) - chunked = pd.concat(pd.read_json(strio, lines=True, typ="Series", chunksize=1)) + with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader: + chunked = pd.concat(reader) tm.assert_series_equal(chunked, unchunked) @@ -105,7 +107,8 @@ def test_readjson_chunks_series(): def test_readjson_each_chunk(lines_json_df): # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. 
- chunks = list(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)) + with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: + chunks = list(reader) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -114,7 +117,8 @@ def test_readjson_chunks_from_file(): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) + with pd.read_json(path, lines=True, chunksize=1) as reader: + chunked = pd.concat(reader) unchunked = pd.read_json(path, lines=True) tm.assert_frame_equal(unchunked, chunked) @@ -141,7 +145,8 @@ def test_readjson_chunks_closes(chunksize): compression=None, nrows=None, ) - reader.read() + with reader: + reader.read() assert ( reader.handles.handle.closed ), f"didn't close stream with chunksize = {chunksize}" @@ -152,7 +157,10 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) + with pd.read_json( + StringIO(lines_json_df), lines=True, chunksize=chunksize + ) as _: + pass @pytest.mark.parametrize("chunksize", [None, 1, 2]) @@ -176,7 +184,8 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: - test = pd.concat(test) + with test: + test = pd.concat(test) tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") @@ -212,8 +221,8 @@ def test_readjson_nrows_chunks(nrows, chunksize): {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) - chunked = pd.concat(reader) + with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader: + chunked = pd.concat(reader) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(chunked, expected) @@ -240,6 +249,6 @@ def test_readjson_lines_chunks_fileurl(datapath): ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() - url_reader = pd.read_json(file_url, lines=True, chunksize=1) - for index, chuck in enumerate(url_reader): - tm.assert_frame_equal(chuck, df_list_expected[index]) + with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: + for index, chuck in enumerate(url_reader): + tm.assert_frame_equal(chuck, df_list_expected[index]) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index eee111dd4579c..06ccfa7f62863 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -376,10 +376,10 @@ def test_parse_trim_buffers(c_parser_only): ) # Iterate over the CSV file in chunks of `chunksize` lines - chunks_ = parser.read_csv( + with parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize - ) - result = concat(chunks_, axis=0, ignore_index=True) + ) as chunks_: + result = concat(chunks_, axis=0, ignore_index=True) # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) @@ -387,14 +387,14 @@ def test_parse_trim_buffers(c_parser_only): # This extra test was added to replicate the fault in gh-5291. 
# Force 'utf-8' encoding, so that `_string_convert` would take # a different execution branch. - chunks_ = parser.read_csv( + with parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize, encoding="utf_8", - ) - result = concat(chunks_, axis=0, ignore_index=True) + ) as chunks_: + result = concat(chunks_, axis=0, ignore_index=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 8f63d06859f62..c8ed0d75b13a2 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -195,12 +195,11 @@ def test_malformed_chunks(all_parsers, nrows): """ parser = all_parsers msg = "Expected 3 fields in line 6, saw 5" - reader = parser.read_csv( + with parser.read_csv( StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] - ) - - with pytest.raises(ParserError, match=msg): - reader.read(nrows) + ) as reader: + with pytest.raises(ParserError, match=msg): + reader.read(nrows) def test_unnamed_columns(all_parsers): @@ -471,7 +470,6 @@ def test_read_chunksize_with_index(all_parsers, index_col): bar2,12,13,14,15 """ - reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) expected = DataFrame( [ ["foo", 2, 3, 4, 5], @@ -485,7 +483,8 @@ def test_read_chunksize_with_index(all_parsers, index_col): ) expected = expected.set_index("index") - chunks = list(reader) + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) tm.assert_frame_equal(chunks[2], expected[4:]) @@ -505,7 +504,8 @@ def test_read_chunksize_bad(all_parsers, chunksize): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), chunksize=chunksize) + with parser.read_csv(StringIO(data), chunksize=chunksize) as _: + pass @pytest.mark.parametrize("chunksize", [2, 8]) @@ -522,9 +522,9 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): parser = all_parsers kwargs = dict(index_col=0, nrows=5) - reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) expected = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(concat(reader), expected) + with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: + tm.assert_frame_equal(concat(reader), expected) def test_read_chunksize_and_nrows_changing_size(all_parsers): @@ -539,14 +539,13 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): parser = all_parsers kwargs = dict(index_col=0, nrows=5) - reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs) expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) - tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) - tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) - - with pytest.raises(StopIteration, match=""): - reader.get_chunk(size=3) + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) def test_get_chunk_passed_chunksize(all_parsers): @@ -557,8 +556,8 @@ def test_get_chunk_passed_chunksize(all_parsers): 7,8,9 1,2,3""" - reader = parser.read_csv(StringIO(data), chunksize=2) - result = reader.get_chunk() + with parser.read_csv(StringIO(data), 
chunksize=2) as reader: + result = reader.get_chunk() expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -576,10 +575,9 @@ def test_read_chunksize_compat(all_parsers, kwargs): bar2,12,13,14,15 """ parser = all_parsers - reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs) - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(concat(reader), result) + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + tm.assert_frame_equal(concat(reader), result) def test_read_chunksize_jagged_names(all_parsers): @@ -588,9 +586,8 @@ def test_read_chunksize_jagged_names(all_parsers): data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) - reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4) - - result = concat(reader) + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: + result = concat(reader) tm.assert_frame_equal(result, expected) @@ -602,8 +599,8 @@ def test_read_data_list(all_parsers): data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] expected = parser.read_csv(StringIO(data), **kwargs) - parser = TextParser(data_list, chunksize=2, **kwargs) - result = parser.read() + with TextParser(data_list, chunksize=2, **kwargs) as parser: + result = parser.read() tm.assert_frame_equal(result, expected) @@ -622,12 +619,12 @@ def test_iterator(all_parsers): kwargs = dict(index_col=0) expected = parser.read_csv(StringIO(data), **kwargs) - reader = parser.read_csv(StringIO(data), iterator=True, **kwargs) + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: - first_chunk = reader.read(3) - tm.assert_frame_equal(first_chunk, expected[:3]) + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) - last_chunk = reader.read(5) + last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, expected[3:]) @@ -639,8 +636,8 @@ def test_iterator2(all_parsers): baz,7,8,9 """ - reader = parser.read_csv(StringIO(data), iterator=True) - result = list(reader) + with parser.read_csv(StringIO(data), iterator=True) as reader: + result = list(reader) expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -663,10 +660,10 @@ def test_reader_list(all_parsers): kwargs = dict(index_col=0) lines = list(csv.reader(StringIO(data))) - reader = TextParser(lines, chunksize=2, **kwargs) + with TextParser(lines, chunksize=2, **kwargs) as reader: + chunks = list(reader) expected = parser.read_csv(StringIO(data), **kwargs) - chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) @@ -686,10 +683,10 @@ def test_reader_list_skiprows(all_parsers): kwargs = dict(index_col=0) lines = list(csv.reader(StringIO(data))) - reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs) + with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: + chunks = list(reader) expected = parser.read_csv(StringIO(data), **kwargs) - chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[1:3]) @@ -703,8 +700,8 @@ def test_iterator_stop_on_chunksize(all_parsers): baz,7,8,9 """ - reader = parser.read_csv(StringIO(data), chunksize=1) - result = list(reader) + with parser.read_csv(StringIO(data), chunksize=1) as reader: + result = list(reader) assert len(result) == 3 expected = DataFrame( @@ -724,7 +721,8 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): data = "a\n1\n2" with 
pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), skipfooter=1, **kwargs) + with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: + pass def test_nrows_skipfooter_errors(all_parsers): @@ -1362,7 +1360,8 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): data = StringIO("foo,bar\n") if iterator: - result = next(iter(parser.read_csv(data, chunksize=nrows))) + with parser.read_csv(data, chunksize=nrows) as reader: + result = next(iter(reader)) else: result = parser.read_csv(data, nrows=nrows) @@ -2056,10 +2055,9 @@ def test_read_csv_memory_growth_chunksize(all_parsers): for i in range(1000): f.write(str(i) + "\n") - result = parser.read_csv(path, chunksize=20) - - for _ in result: - pass + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass def test_read_csv_raises_on_header_prefix(all_parsers): @@ -2310,3 +2308,35 @@ def test_memory_map_compression(all_parsers, compression): parser.read_csv(path, memory_map=True, compression=compression), expected, ) + + +def test_context_manager(all_parsers, datapath): + # make sure that opened files are closed + parser = all_parsers + + path = datapath("io", "data", "csv", "iris.csv") + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert reader._engine.handles.handle.closed + + +def test_context_manageri_user_provided(all_parsers, datapath): + # make sure that user-provided handles are not closed + parser = all_parsers + + with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: + + reader = parser.read_csv(path, chunksize=1) + assert not reader._engine.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert not reader._engine.handles.handle.closed diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 930ce1a695e1f..1e68e54b413b0 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -213,10 +213,11 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), ] - actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) def test_categorical_dtype_chunksize_explicit_categories(all_parsers): @@ -235,10 +236,9 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): ), ] dtype = CategoricalDtype(cats) - actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) @pytest.mark.parametrize("ordered", [False, True]) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index b8b03cbd14a1d..97f82b9a01a9a 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -122,41 +122,45 @@ def 
test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df_reader = read_csv( + with read_csv( "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, storage_options=s3so, - ) - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] - tm.assert_frame_equal(true_df, df) + ) as df_reader: + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. + df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[ + chunksize * i_chunk : chunksize * (i_chunk + 1) + ] + tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df_reader = read_csv( + with read_csv( "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, engine="python", storage_options=s3so, - ) - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them properly. - df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] - tm.assert_frame_equal(true_df, df) + ) as df_reader: + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them properly. 
+ df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[ + chunksize * i_chunk : chunksize * (i_chunk + 1) + ] + tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 7a5203ca86520..a20ca508ebbfe 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1020,13 +1020,13 @@ def test_multiple_date_cols_chunked(all_parsers): ) expected = expected.set_index("nominal") - reader = parser.read_csv( + with parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal", chunksize=2, - ) - chunks = list(reader) + ) as reader: + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 413b78a52ad38..1af69785c7584 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -336,8 +336,10 @@ def test_empty_field_eof(self): def test_empty_csv_input(self): # GH14867 - df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) - assert isinstance(df, TextFileReader) + with read_csv( + StringIO(), chunksize=20, header=None, names=["a", "b", "c"] + ) as df: + assert isinstance(df, TextFileReader) def assert_array_dicts_equal(left, right): diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 9de6ca75fd4d9..cca62c5af59a1 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -52,24 +52,22 @@ def test_from_buffer(self): with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) - rdr = pd.read_sas( + with pd.read_sas( buf, format="sas7bdat", iterator=True, encoding="utf-8" - ) - df = rdr.read() + ) as rdr: + df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) - rdr.close() def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") - df = rdr.read(2) - tm.assert_frame_equal(df, df0.iloc[0:2, :]) - df = rdr.read(3) - tm.assert_frame_equal(df, df0.iloc[2:5, :]) - rdr.close() + with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: + df = rdr.read(2) + tm.assert_frame_equal(df, df0.iloc[0:2, :]) + df = rdr.read(3) + tm.assert_frame_equal(df, df0.iloc[2:5, :]) def test_path_pathlib(self): for j in 0, 1: @@ -96,25 +94,24 @@ def test_iterator_loop(self): for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") - y = 0 - for x in rdr: - y += x.shape[0] + with pd.read_sas(fname, chunksize=10, encoding="utf-8") as rdr: + y = 0 + for x in rdr: + y += x.shape[0] assert y == rdr.row_count - rdr.close() def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") - d1 = rdr.read(rdr.row_count + 20) - rdr.close() + with pd.read_sas( + fname, format="sas7bdat", iterator=True, encoding="utf-8" + ) as rdr: + d1 = rdr.read(rdr.row_count + 20) - rdr 
= pd.read_sas(fname, iterator=True, encoding="utf-8") - d2 = rdr.read(rdr.row_count + 20) + with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: + d2 = rdr.read(rdr.row_count + 20) tm.assert_frame_equal(d1, d2) - rdr.close() def test_encoding_options(datapath): diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 939edb3d8e0b4..a8713f5bf36c9 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -47,29 +47,25 @@ def test1_basic(self): num_rows = data.shape[0] # Test reading beyond end of file - reader = read_sas(self.file01, format="xport", iterator=True) - data = reader.read(num_rows + 100) + with read_sas(self.file01, format="xport", iterator=True) as reader: + data = reader.read(num_rows + 100) assert data.shape[0] == num_rows - reader.close() # Test incremental read with `read` method. - reader = read_sas(self.file01, format="xport", iterator=True) - data = reader.read(10) - reader.close() + with read_sas(self.file01, format="xport", iterator=True) as reader: + data = reader.read(10) tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. - reader = read_sas(self.file01, format="xport", chunksize=10) - data = reader.get_chunk() - reader.close() + with read_sas(self.file01, format="xport", chunksize=10) as reader: + data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test read in loop m = 0 - reader = read_sas(self.file01, format="xport", chunksize=100) - for x in reader: - m += x.shape[0] - reader.close() + with read_sas(self.file01, format="xport", chunksize=100) as reader: + for x in reader: + m += x.shape[0] assert m == num_rows # Read full file with `read_sas` method @@ -89,15 +85,17 @@ def test1_index(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) - data = reader.read(10) - reader.close() + with read_sas( + self.file01, index="SEQN", format="xport", iterator=True + ) as reader: + data = reader.read(10) tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. 
- reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) - data = reader.get_chunk() - reader.close() + with read_sas( + self.file01, index="SEQN", format="xport", chunksize=10 + ) as reader: + data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) def test1_incremental(self): @@ -107,9 +105,8 @@ def test1_incremental(self): data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) - reader = read_sas(self.file01, index="SEQN", chunksize=1000) - - all_data = list(reader) + with read_sas(self.file01, index="SEQN", chunksize=1000) as reader: + all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index c7a7101b5fe17..c3b21daa0ac04 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -121,16 +121,16 @@ def test_get_handle_with_buffer(self): input_buffer.close() def test_iterator(self): - reader = pd.read_csv(StringIO(self.data1), chunksize=1) - result = pd.concat(reader, ignore_index=True) + with pd.read_csv(StringIO(self.data1), chunksize=1) as reader: + result = pd.concat(reader, ignore_index=True) expected = pd.read_csv(StringIO(self.data1)) tm.assert_frame_equal(result, expected) # GH12153 - it = pd.read_csv(StringIO(self.data1), chunksize=1) - first = next(it) - tm.assert_frame_equal(first, expected.iloc[[0]]) - tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) + with pd.read_csv(StringIO(self.data1), chunksize=1) as it: + first = next(it) + tm.assert_frame_equal(first, expected.iloc[[0]]) + tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) @pytest.mark.parametrize( "reader, module, error_class, fn_ext", diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index b065aa187f5fb..24944281419c3 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1974,12 +1974,12 @@ def test_iterator_value_labels(): df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) with tm.ensure_clean() as path: df.to_stata(path, write_index=False) - reader = pd.read_stata(path, chunksize=100) expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") - for j, chunk in enumerate(reader): - for i in range(2): - tm.assert_index_equal(chunk.dtypes[i].categories, expected) - tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + with pd.read_stata(path, chunksize=100) as reader: + for j, chunk in enumerate(reader): + for i in range(2): + tm.assert_index_equal(chunk.dtypes[i].categories, expected) + tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) def test_precision_loss(): diff --git a/pandas/tests/reshape/concat/test_invalid.py b/pandas/tests/reshape/concat/test_invalid.py index ec8167906a60c..cc9f09c16fb43 100644 --- a/pandas/tests/reshape/concat/test_invalid.py +++ b/pandas/tests/reshape/concat/test_invalid.py @@ -45,7 +45,7 @@ def test_concat_invalid_first_argument(self): bar2,12,13,14,15 """ - reader = read_csv(StringIO(data), chunksize=1) - result = concat(reader, ignore_index=True) + with read_csv(StringIO(data), chunksize=1) as reader: + result = concat(reader, ignore_index=True) expected = read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) From 0c61786398812952bc47e5e9359aa4fcb7779a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 4 Dec 2020 12:03:31 -0500 Subject: [PATCH 2/2] new lines --- 
doc/source/user_guide/io.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/json/_json.py | 3 ++- pandas/io/parsers.py | 6 ++++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cbedb3b6c2ece..2b324a74fffaf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1583,6 +1583,7 @@ value will be an iterable object of type ``TextFileReader``: print(chunk) .. versionchanged:: 1.2 + ``read_csv/json/sas`` return a context-manager when iterating through a file. Specifying ``iterator=True`` will also return the ``TextFileReader`` object: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 54f73d5b57c7c..13ddeb2a89133 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -291,7 +291,7 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) -- :func:`read_csv/sas/json` return a context manager when called with ``chunksize``/``iterator`` (:issue:`38225`) +- When :func:`read_csv/sas/json` are called with ``chunksize``/``iterator``, they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3ec5238f1f1c7..da085d0d0eb2f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -438,7 +438,8 @@ def read_json( If this is None, the file will be read into memory all at once. .. versionchanged:: 1.2 - ``JsonReader`` is a context manager. + + ``JsonReader`` is a context manager. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8d5340f894f1e..5b623c360c3ef 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -278,7 +278,8 @@ ``get_chunk()``. .. versionchanged:: 1.2 - ``TextFileReader`` is a context manager. + + ``TextFileReader`` is a context manager. chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs @@ -286,7 +287,8 @@ for more information on ``iterator`` and ``chunksize``. .. versionchanged:: 1.2 - ``TextFileReader`` is a context manager. + + ``TextFileReader`` is a context manager. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the
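Taken together, the two patches let the chunked and iterating readers returned by ``read_csv``, ``read_json`` and ``read_sas`` be used in a ``with`` statement, so the underlying handle is closed even if iteration raises (the updated ``io.rst`` examples show the same pattern for ``read_stata``). A minimal usage sketch, assuming a pandas build that includes these changes (1.2 or later); the ``example.csv`` file is purely illustrative and not part of the patch::

    import pandas as pd

    # Write a small CSV so the example is self-contained (illustrative only).
    pd.DataFrame({"a": range(10), "b": range(10)}).to_csv("example.csv", index=False)

    # With chunksize, read_csv returns a TextFileReader that is now a context
    # manager: the underlying handle is closed when the block exits, even if
    # iterating over the chunks raises.
    with pd.read_csv("example.csv", chunksize=4) as reader:
        for chunk in reader:
            print(chunk.shape)

    # iterator=True behaves the same way; read()/get_chunk() are called inside
    # the block and the handle is released on exit.
    with pd.read_csv("example.csv", iterator=True) as reader:
        first = reader.read(3)  # first three rows
        rest = reader.read()    # remaining rows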