BUG: read_json does not respect chunksize (#38293)

robertwb · pull[bot] · commit 7229f56ff513 · 2021-09-14T03:14:49.000Z
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -567,6 +567,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
 - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
 - Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
+- :meth:`read_json` now avoids reading the entire file into memory when ``chunksize`` is specified (:issue:`34548`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -630,7 +630,7 @@ def _preprocess_data(self, data):
         If self.chunksize, we prepare the data for the `__next__` method.
         Otherwise, we read it into memory for the `read` method.
         """
-        if hasattr(data, "read") and (not self.chunksize or not self.nrows):
+        if hasattr(data, "read") and not (self.chunksize or self.nrows):
             data = data.read()
             self.close()
         if not hasattr(data, "read") and (self.chunksize or self.nrows):
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
@@ -252,3 +252,31 @@ def test_readjson_lines_chunks_fileurl(datapath):
     with pd.read_json(file_url, lines=True, chunksize=1) as url_reader:
         for index, chuck in enumerate(url_reader):
             tm.assert_frame_equal(chuck, df_list_expected[index])
+
+
+def test_chunksize_is_incremental():
+    # Regression test; see https://github.com/pandas-dev/pandas/issues/34548
+    jsonl = (
+        """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}\n"""
+        * 1000  # four JSON lines repeated 1000 times -> 4000 lines of input
+    )
+
+    class MyReader:  # minimal file-like object that counts how often it is used
+        def __init__(self, contents):
+            self.read_count = 0  # bumped by every read() and __iter__() call
+            self.stringio = StringIO(contents)
+
+        def read(self, *args):
+            self.read_count += 1
+            return self.stringio.read(*args)
+
+        def __iter__(self):
+            self.read_count += 1
+            return iter(self.stringio)
+
+    reader = MyReader(jsonl)
+    assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1
+    assert reader.read_count > 10  # a single bulk read() would leave this at 1