Skip to content

Commit 89c5a59

Browse files
authored
Add nrows to read json. (#33962)
1 parent 157cb20 commit 89c5a59

File tree

4 files changed

+91
-17
lines changed

4 files changed

+91
-17
lines changed

asv_bench/benchmarks/io/json.py

+6
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,18 @@ def time_read_json_lines(self, index):
5353
def time_read_json_lines_concat(self, index):
5454
concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
5555

56+
def time_read_json_lines_nrows(self, index):
57+
read_json(self.fname, orient="records", lines=True, nrows=25000)
58+
5659
def peakmem_read_json_lines(self, index):
5760
read_json(self.fname, orient="records", lines=True)
5861

5962
def peakmem_read_json_lines_concat(self, index):
6063
concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
6164

65+
def peakmem_read_json_lines_nrows(self, index):
66+
read_json(self.fname, orient="records", lines=True, nrows=15000)
67+
6268

6369
class ToJSON(BaseIO):
6470

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ Other enhancements
289289
- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
290290
- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
291291
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
292+
- :meth:`~pandas.io.json.read_json` now accepts an ``nrows`` parameter. (:issue:`33916`).
292293
- :meth:`~pandas.io.gbq.read_gbq` now allows disabling the progress bar (:issue:`33360`).
293294

294295
.. ---------------------------------------------------------------------------

pandas/io/json/_json.py

+44-17
Original file line numberDiff line numberDiff line change
@@ -355,14 +355,15 @@ def read_json(
355355
dtype=None,
356356
convert_axes=None,
357357
convert_dates=True,
358-
keep_default_dates=True,
359-
numpy=False,
360-
precise_float=False,
358+
keep_default_dates: bool = True,
359+
numpy: bool = False,
360+
precise_float: bool = False,
361361
date_unit=None,
362362
encoding=None,
363-
lines=False,
364-
chunksize=None,
363+
lines: bool = False,
364+
chunksize: Optional[int] = None,
365365
compression="infer",
366+
nrows: Optional[int] = None,
366367
):
367368
"""
368369
Convert a JSON string to pandas object.
@@ -493,13 +494,21 @@ def read_json(
493494
for more information on ``chunksize``.
494495
This can only be passed if `lines=True`.
495496
If this is None, the file will be read into memory all at once.
497+
496498
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
497499
For on-the-fly decompression of on-disk data. If 'infer', then use
498500
gzip, bz2, zip or xz if path_or_buf is a string ending in
499501
'.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
500502
otherwise. If using 'zip', the ZIP file must contain only one data
501503
file to be read in. Set to None for no decompression.
502504
505+
nrows : int, optional
506+
The number of lines from the line-delimited JSON file that has to be read.
507+
This can only be passed if `lines=True`.
508+
If this is None, all the rows will be returned.
509+
510+
.. versionadded:: 1.1
511+
503512
Returns
504513
-------
505514
Series or DataFrame
@@ -600,6 +609,7 @@ def read_json(
600609
lines=lines,
601610
chunksize=chunksize,
602611
compression=compression,
612+
nrows=nrows,
603613
)
604614

605615
if chunksize:
@@ -629,14 +639,15 @@ def __init__(
629639
dtype,
630640
convert_axes,
631641
convert_dates,
632-
keep_default_dates,
633-
numpy,
634-
precise_float,
642+
keep_default_dates: bool,
643+
numpy: bool,
644+
precise_float: bool,
635645
date_unit,
636646
encoding,
637-
lines,
638-
chunksize,
647+
lines: bool,
648+
chunksize: Optional[int],
639649
compression,
650+
nrows: Optional[int],
640651
):
641652

642653
self.orient = orient
@@ -654,11 +665,16 @@ def __init__(
654665
self.chunksize = chunksize
655666
self.nrows_seen = 0
656667
self.should_close = False
668+
self.nrows = nrows
657669

658670
if self.chunksize is not None:
659671
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
660672
if not self.lines:
661673
raise ValueError("chunksize can only be passed if lines=True")
674+
if self.nrows is not None:
675+
self.nrows = _validate_integer("nrows", self.nrows, 0)
676+
if not self.lines:
677+
raise ValueError("nrows can only be passed if lines=True")
662678

663679
data = self._get_data_from_filepath(filepath_or_buffer)
664680
self.data = self._preprocess_data(data)
@@ -671,9 +687,9 @@ def _preprocess_data(self, data):
671687
If self.chunksize, we prepare the data for the `__next__` method.
672688
Otherwise, we read it into memory for the `read` method.
673689
"""
674-
if hasattr(data, "read") and not self.chunksize:
690+
if hasattr(data, "read") and (not self.chunksize or not self.nrows):
675691
data = data.read()
676-
if not hasattr(data, "read") and self.chunksize:
692+
if not hasattr(data, "read") and (self.chunksize or self.nrows):
677693
data = StringIO(data)
678694

679695
return data
@@ -721,11 +737,17 @@ def read(self):
721737
"""
722738
Read the whole JSON input into a pandas object.
723739
"""
724-
if self.lines and self.chunksize:
725-
obj = concat(self)
726-
elif self.lines:
727-
data = ensure_str(self.data)
728-
obj = self._get_object_parser(self._combine_lines(data.split("\n")))
740+
if self.lines:
741+
if self.chunksize:
742+
obj = concat(self)
743+
elif self.nrows:
744+
lines = list(islice(self.data, self.nrows))
745+
lines_json = self._combine_lines(lines)
746+
obj = self._get_object_parser(lines_json)
747+
else:
748+
data = ensure_str(self.data)
749+
data = data.split("\n")
750+
obj = self._get_object_parser(self._combine_lines(data))
729751
else:
730752
obj = self._get_object_parser(self.data)
731753
self.close()
@@ -772,6 +794,11 @@ def close(self):
772794
pass
773795

774796
def __next__(self):
797+
if self.nrows:
798+
if self.nrows_seen >= self.nrows:
799+
self.close()
800+
raise StopIteration
801+
775802
lines = list(islice(self.data, self.chunksize))
776803
if lines:
777804
lines_json = self._combine_lines(lines)

pandas/tests/io/json/test_readlines.py

+40
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
130130
lines=True,
131131
chunksize=chunksize,
132132
compression=None,
133+
nrows=None,
133134
)
134135
reader.read()
135136
assert (
@@ -179,3 +180,42 @@ def test_readjson_unicode(monkeypatch):
179180
result = read_json(path)
180181
expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
181182
tm.assert_frame_equal(result, expected)
183+
184+
185+
@pytest.mark.parametrize("nrows", [1, 2])
186+
def test_readjson_nrows(nrows):
187+
# GH 33916
188+
# Test reading line-format JSON to DataFrame with nrows param
189+
jsonl = """{"a": 1, "b": 2}
190+
{"a": 3, "b": 4}
191+
{"a": 5, "b": 6}
192+
{"a": 7, "b": 8}"""
193+
result = pd.read_json(jsonl, lines=True, nrows=nrows)
194+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
195+
tm.assert_frame_equal(result, expected)
196+
197+
198+
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
199+
def test_readjson_nrows_chunks(nrows, chunksize):
200+
# GH 33916
201+
# Test reading line-format JSON to DataFrame with nrows and chunksize param
202+
jsonl = """{"a": 1, "b": 2}
203+
{"a": 3, "b": 4}
204+
{"a": 5, "b": 6}
205+
{"a": 7, "b": 8}"""
206+
reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
207+
chunked = pd.concat(reader)
208+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
209+
tm.assert_frame_equal(chunked, expected)
210+
211+
212+
def test_readjson_nrows_requires_lines():
213+
# GH 33916
214+
# Test ValueError raised if nrows is set without setting lines in read_json
215+
jsonl = """{"a": 1, "b": 2}
216+
{"a": 3, "b": 4}
217+
{"a": 5, "b": 6}
218+
{"a": 7, "b": 8}"""
219+
msg = "nrows can only be passed if lines=True"
220+
with pytest.raises(ValueError, match=msg):
221+
pd.read_json(jsonl, lines=False, nrows=2)

0 commit comments

Comments
 (0)