From 0b477a6ac8516c5546ee2e8863d414bfa6c62b7c Mon Sep 17 00:00:00 2001
From: Robert Bradshaw <robertwb@gmail.com>
Date: Fri, 4 Dec 2020 10:36:04 -0800
Subject: [PATCH 1/5] Fix #36791 and #34548: correctly respect read_json
 chunksize.

---
 pandas/io/json/_json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index b1d705439e300..459fe8bf2238c 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -625,7 +625,7 @@ def _preprocess_data(self, data):
         If self.chunksize, we prepare the data for the `__next__` method.
         Otherwise, we read it into memory for the `read` method.
         """
-        if hasattr(data, "read") and (not self.chunksize or not self.nrows):
+        if hasattr(data, "read") and not (self.chunksize or self.nrows):
             data = data.read()
             self.close()
         if not hasattr(data, "read") and (self.chunksize or self.nrows):

From 76d2ce66f4b59dd6c1b8ebd358fddba94a331262 Mon Sep 17 00:00:00 2001
From: Robert Bradshaw <robertwb@gmail.com>
Date: Tue, 22 Dec 2020 11:17:36 -0800
Subject: [PATCH 2/5] Test of read_json chunksize.

---
 pandas/tests/io/json/test_readlines.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 2e68d3306c7d1..89384f3fbb658 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -243,3 +243,27 @@ def test_readjson_lines_chunks_fileurl(datapath):
     url_reader = pd.read_json(file_url, lines=True, chunksize=1)
     for index, chuck in enumerate(url_reader):
         tm.assert_frame_equal(chuck, df_list_expected[index])
+
+
+def test_chunksize_is_incremental():
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}\n""" * 1000
+
+    class MyReader():
+        def __init__(self, contents):
+            self.read_count = 0
+            self.stringio = StringIO(contents)
+
+        def read(self, *args):
+            self.read_count += 1
+            return self.stringio.read(*args)
+
+        def __iter__(self):
+            self.read_count += 1
+            return iter(self.stringio)
+
+    reader = MyReader(jsonl)
+    assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1
+    assert reader.read_count > 10

From b3ea53546f8358de05d575806c7c3a0b40879f3a Mon Sep 17 00:00:00 2001
From: Robert Bradshaw <robertwb@gmail.com>
Date: Tue, 22 Dec 2020 11:20:48 -0800
Subject: [PATCH 3/5] whatsnew entry

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 8182dfa4bce40..2d29532be000a 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -549,6 +549,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
 - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
 - Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
+- :func:`read_json` now avoids reading the entire file into memory when ``chunksize`` is specified (:issue:`34548`)
 
 .. ---------------------------------------------------------------------------
 

From fc21579ddf93cd618e6d4abd70c2a0cf74acaaa0 Mon Sep 17 00:00:00 2001
From: Robert Bradshaw <robertwb@gmail.com>
Date: Tue, 22 Dec 2020 11:31:48 -0800
Subject: [PATCH 4/5] Make precommit happy.

---
 pandas/tests/io/json/test_readlines.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 9c934ad782236..ce53c51cf41d3 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -255,12 +255,15 @@ def test_readjson_lines_chunks_fileurl(datapath):
 
 
 def test_chunksize_is_incremental():
-    jsonl = """{"a": 1, "b": 2}
+    jsonl = (
+        """{"a": 1, "b": 2}
         {"a": 3, "b": 4}
         {"a": 5, "b": 6}
-        {"a": 7, "b": 8}\n""" * 1000
+        {"a": 7, "b": 8}\n"""
+        * 1000
+    )
 
-    class MyReader():
+    class MyReader:
         def __init__(self, contents):
             self.read_count = 0
             self.stringio = StringIO(contents)

From 4b856ee86040ffe172a5b743d06a7d57e1914a42 Mon Sep 17 00:00:00 2001
From: Robert Bradshaw <robertwb@gmail.com>
Date: Tue, 22 Dec 2020 11:56:12 -0800
Subject: [PATCH 5/5] Add reference to bug fixed in the test.

---
 pandas/tests/io/json/test_readlines.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index ce53c51cf41d3..099d99507e136 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -255,6 +255,7 @@ def test_readjson_lines_chunks_fileurl(datapath):
 
 
 def test_chunksize_is_incremental():
+    # See https://github.com/pandas-dev/pandas/issues/34548
     jsonl = (
         """{"a": 1, "b": 2}
         {"a": 3, "b": 4}