Skip to content

Commit 7229f56

Browse files
robertwbpull[bot]
authored and committed
BUG: read_json does not respect chunksize (#38293)
1 parent fbbf8d2 commit 7229f56

File tree

3 files changed

+30
-1
lines changed

3 files changed

+30
-1
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -567,6 +567,7 @@ Performance improvements
567567
- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`); changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. :meth:`Index.value_counts`)
568568
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
569569
- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
570+
- :meth:`read_json` now avoids reading the entire file into memory when ``chunksize`` is specified (:issue:`34548`)
570571

571572
.. ---------------------------------------------------------------------------
572573

pandas/io/json/_json.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -630,7 +630,7 @@ def _preprocess_data(self, data):
630630
If self.chunksize, we prepare the data for the `__next__` method.
631631
Otherwise, we read it into memory for the `read` method.
632632
"""
633-
if hasattr(data, "read") and (not self.chunksize or not self.nrows):
633+
if hasattr(data, "read") and not (self.chunksize or self.nrows):
634634
data = data.read()
635635
self.close()
636636
if not hasattr(data, "read") and (self.chunksize or self.nrows):

pandas/tests/io/json/test_readlines.py

+28
Original file line number | Diff line number | Diff line change
@@ -252,3 +252,31 @@ def test_readjson_lines_chunks_fileurl(datapath):
252252
with pd.read_json(file_url, lines=True, chunksize=1) as url_reader:
253253
for index, chuck in enumerate(url_reader):
254254
tm.assert_frame_equal(chuck, df_list_expected[index])
255+
256+
257+
def test_chunksize_is_incremental():
258+
# See https://github.com/pandas-dev/pandas/issues/34548
259+
jsonl = (
260+
"""{"a": 1, "b": 2}
261+
{"a": 3, "b": 4}
262+
{"a": 5, "b": 6}
263+
{"a": 7, "b": 8}\n"""
264+
* 1000
265+
)
266+
267+
class MyReader:
268+
def __init__(self, contents):
269+
self.read_count = 0
270+
self.stringio = StringIO(contents)
271+
272+
def read(self, *args):
273+
self.read_count += 1
274+
return self.stringio.read(*args)
275+
276+
def __iter__(self):
277+
self.read_count += 1
278+
return iter(self.stringio)
279+
280+
reader = MyReader(jsonl)
281+
assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1
282+
assert reader.read_count > 10

0 commit comments

Comments
 (0)