Skip to content

Commit 15d1d1e

Browse files
committed
ENH Add nrow parameter for line delimited json for read_json #33916
1 parent dd84044 commit 15d1d1e

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

pandas/io/json/_json.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ def read_json(
363363
lines=False,
364364
chunksize=None,
365365
compression="infer",
366+
nrows=None,
366367
):
367368
"""
368369
Convert a JSON string to pandas object.
@@ -493,6 +494,12 @@ def read_json(
493494
for more information on ``chunksize``.
494495
This can only be passed if `lines=True`.
495496
If this is None, the file will be read into memory all at once.
497+
498+
nrows : int, optional
499+
The number of lines to read from the line-delimited json file.
500+
This can only be passed if `lines=True`.
501+
If this is None, all the rows will be returned.
502+
496503
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
497504
For on-the-fly decompression of on-disk data. If 'infer', then use
498505
gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -600,6 +607,7 @@ def read_json(
600607
lines=lines,
601608
chunksize=chunksize,
602609
compression=compression,
610+
nrows=nrows,
603611
)
604612

605613
if chunksize:
@@ -637,6 +645,7 @@ def __init__(
637645
lines,
638646
chunksize,
639647
compression,
648+
nrows,
640649
):
641650

642651
self.path_or_buf = filepath_or_buffer
@@ -655,11 +664,16 @@ def __init__(
655664
self.chunksize = chunksize
656665
self.nrows_seen = 0
657666
self.should_close = False
667+
self.nrows = nrows
658668

659669
if self.chunksize is not None:
660670
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
661671
if not self.lines:
662672
raise ValueError("chunksize can only be passed if lines=True")
673+
if self.nrows is not None:
674+
self.nrows = _validate_integer("nrows", self.nrows, 0)
675+
if not self.lines:
676+
raise ValueError("nrows can only be passed if lines=True")
663677

664678
data = self._get_data_from_filepath(filepath_or_buffer)
665679
self.data = self._preprocess_data(data)
@@ -726,7 +740,10 @@ def read(self):
726740
obj = concat(self)
727741
elif self.lines:
728742
data = ensure_str(self.data)
729-
obj = self._get_object_parser(self._combine_lines(data.split("\n")))
743+
data = data.split("\n")
744+
if self.nrows:
745+
data = data[:self.nrows]
746+
obj = self._get_object_parser(self._combine_lines(data))
730747
else:
731748
obj = self._get_object_parser(self.data)
732749
self.close()
@@ -773,6 +790,11 @@ def close(self):
773790
pass
774791

775792
def __next__(self):
793+
if self.nrows:
794+
if self.nrows_seen >= self.nrows:
795+
self.close()
796+
raise StopIteration
797+
776798
lines = list(islice(self.data, self.chunksize))
777799
if lines:
778800
lines_json = self._combine_lines(lines)

pandas/tests/io/json/test_readlines.py

+26
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
130130
lines=True,
131131
chunksize=chunksize,
132132
compression=None,
133+
nrows=None
133134
)
134135
reader.read()
135136
assert (
@@ -179,3 +180,28 @@ def test_readjson_unicode(monkeypatch):
179180
result = read_json(path)
180181
expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
181182
tm.assert_frame_equal(result, expected)
183+
184+
185+
@pytest.mark.parametrize("nrows", [1, 2])
186+
def test_readjson_nrows(nrows):
187+
# Test reading line-format JSON to Series with nrows param
188+
jsonl = '''{"a": 1, "b": 2}
189+
{"a": 3, "b": 4}
190+
{"a": 5, "b": 6}
191+
{"a": 7, "b": 8}'''
192+
result = pd.read_json(jsonl, lines=True, nrows=nrows)
193+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
194+
tm.assert_frame_equal(result, expected)
195+
196+
197+
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
198+
def test_readjson_nrows_chunks(nrows, chunksize):
199+
# Test reading line-format JSON to Series with nrows and chunksize param
200+
jsonl = '''{"a": 1, "b": 2}
201+
{"a": 3, "b": 4}
202+
{"a": 5, "b": 6}
203+
{"a": 7, "b": 8}'''
204+
reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
205+
chunked = pd.concat(reader)
206+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
207+
tm.assert_frame_equal(chunked, expected)

0 commit comments

Comments
 (0)