From 0b477a6ac8516c5546ee2e8863d414bfa6c62b7c Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Fri, 4 Dec 2020 10:36:04 -0800 Subject: [PATCH 1/5] Fix #36791 and #34548 correctly respecting read_json chunksize. --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b1d705439e300..459fe8bf2238c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -625,7 +625,7 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, "read") and (not self.chunksize or not self.nrows): + if hasattr(data, "read") and not (self.chunksize or self.nrows): data = data.read() self.close() if not hasattr(data, "read") and (self.chunksize or self.nrows): From 76d2ce66f4b59dd6c1b8ebd358fddba94a331262 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 22 Dec 2020 11:17:36 -0800 Subject: [PATCH 2/5] Test of read_json chunksize. 
--- pandas/tests/io/json/test_readlines.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 2e68d3306c7d1..89384f3fbb658 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -243,3 +243,27 @@ def test_readjson_lines_chunks_fileurl(datapath): url_reader = pd.read_json(file_url, lines=True, chunksize=1) for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) + + +def test_chunksize_is_incremental(): + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}\n""" * 1000 + + class MyReader(): + def __init__(self, contents): + self.read_count = 0 + self.stringio = StringIO(contents) + + def read(self, *args): + self.read_count += 1 + return self.stringio.read(*args) + + def __iter__(self): + self.read_count += 1 + return iter(self.stringio) + + reader = MyReader(jsonl) + assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1 + assert reader.read_count > 10 From b3ea53546f8358de05d575806c7c3a0b40879f3a Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 22 Dec 2020 11:20:48 -0800 Subject: [PATCH 3/5] whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8182dfa4bce40..2d29532be000a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -549,6 +549,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. 
:meth:`Index.value_counts`) - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) - Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) +- :meth:`read_json` now avoids reading the entire file into memory when ``chunksize`` is specified (:issue:`34548`) .. --------------------------------------------------------------------------- From fc21579ddf93cd618e6d4abd70c2a0cf74acaaa0 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 22 Dec 2020 11:31:48 -0800 Subject: [PATCH 4/5] Make precommit happy. --- pandas/tests/io/json/test_readlines.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 9c934ad782236..ce53c51cf41d3 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -255,12 +255,15 @@ def test_readjson_lines_chunks_fileurl(datapath): def test_chunksize_is_incremental(): - jsonl = """{"a": 1, "b": 2} + jsonl = ( + """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} - {"a": 7, "b": 8}\n""" * 1000 + {"a": 7, "b": 8}\n""" + * 1000 + ) - class MyReader(): + class MyReader: def __init__(self, contents): self.read_count = 0 self.stringio = StringIO(contents) From 4b856ee86040ffe172a5b743d06a7d57e1914a42 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 22 Dec 2020 11:56:12 -0800 Subject: [PATCH 5/5] Add reference to bug fixed in the test. 
--- pandas/tests/io/json/test_readlines.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index ce53c51cf41d3..099d99507e136 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -255,6 +255,7 @@ def test_readjson_lines_chunks_fileurl(datapath): def test_chunksize_is_incremental(): + # See https://github.com/pandas-dev/pandas/issues/34548 jsonl = ( """{"a": 1, "b": 2} {"a": 3, "b": 4}