diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index f478bf2aee0ba..a490e250943f5 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,12 +53,18 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
 
+    def time_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=25000)
+
     def peakmem_read_json_lines(self, index):
         read_json(self.fname, orient="records", lines=True)
 
     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
 
+    def peakmem_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=15000)
+
 
 class ToJSON(BaseIO):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 17623b943bf87..2243790a663df 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -289,6 +289,7 @@ Other enhancements
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
+- :meth:`~pandas.io.json.read_json` now accepts an ``nrows`` parameter (:issue:`33916`).
 - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 72aa8fdd16e6d..b973553a767ba 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -355,14 +355,15 @@ def read_json(
     dtype=None,
     convert_axes=None,
     convert_dates=True,
-    keep_default_dates=True,
-    numpy=False,
-    precise_float=False,
+    keep_default_dates: bool = True,
+    numpy: bool = False,
+    precise_float: bool = False,
     date_unit=None,
     encoding=None,
-    lines=False,
-    chunksize=None,
+    lines: bool = False,
+    chunksize: Optional[int] = None,
     compression="infer",
+    nrows: Optional[int] = None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -493,6 +494,7 @@ def read_json(
         for more information on ``chunksize``.
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
         gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -500,6 +502,13 @@ def read_json(
         otherwise. If using 'zip', the ZIP file must contain only one data
         file to be read in. Set to None for no decompression.
 
+    nrows : int, optional
+        The number of lines from the line-delimited JSON file that have to be read.
+        This can only be passed if `lines=True`.
+        If this is None, all the rows will be returned.
+
+        .. versionadded:: 1.1
+
     Returns
     -------
     Series or DataFrame
@@ -600,6 +609,7 @@ def read_json(
         lines=lines,
         chunksize=chunksize,
         compression=compression,
+        nrows=nrows,
     )
 
     if chunksize:
@@ -629,14 +639,15 @@ def __init__(
         dtype,
         convert_axes,
         convert_dates,
-        keep_default_dates,
-        numpy,
-        precise_float,
+        keep_default_dates: bool,
+        numpy: bool,
+        precise_float: bool,
         date_unit,
         encoding,
-        lines,
-        chunksize,
+        lines: bool,
+        chunksize: Optional[int],
         compression,
+        nrows: Optional[int],
     ):
 
         self.orient = orient
@@ -654,11 +665,16 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.should_close = False
+        self.nrows = nrows
 
         if self.chunksize is not None:
             self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+        if self.nrows is not None:
+            self.nrows = _validate_integer("nrows", self.nrows, 0)
+            if not self.lines:
+                raise ValueError("nrows can only be passed if lines=True")
 
         data = self._get_data_from_filepath(filepath_or_buffer)
         self.data = self._preprocess_data(data)
@@ -671,9 +687,9 @@ def _preprocess_data(self, data):
         If self.chunksize, we prepare the data for the `__next__` method.
         Otherwise, we read it into memory for the `read` method.
         """
-        if hasattr(data, "read") and not self.chunksize:
+        if hasattr(data, "read") and not (self.chunksize or self.nrows):
             data = data.read()
-        if not hasattr(data, "read") and self.chunksize:
+        if not hasattr(data, "read") and (self.chunksize or self.nrows):
             data = StringIO(data)
 
         return data
@@ -721,11 +737,17 @@ def read(self):
         """
         Read the whole JSON input into a pandas object.
         """
-        if self.lines and self.chunksize:
-            obj = concat(self)
-        elif self.lines:
-            data = ensure_str(self.data)
-            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
+        if self.lines:
+            if self.chunksize:
+                obj = concat(self)
+            elif self.nrows:
+                lines = list(islice(self.data, self.nrows))
+                lines_json = self._combine_lines(lines)
+                obj = self._get_object_parser(lines_json)
+            else:
+                data = ensure_str(self.data)
+                data = data.split("\n")
+                obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
         self.close()
@@ -772,6 +794,11 @@ def close(self):
             pass
 
     def __next__(self):
+        if self.nrows:
+            if self.nrows_seen >= self.nrows:
+                self.close()
+                raise StopIteration
+
         lines = list(islice(self.data, self.chunksize))
         if lines:
             lines_json = self._combine_lines(lines)
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index e531457627342..53462eaaada8d 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
         lines=True,
         chunksize=chunksize,
         compression=None,
+        nrows=None,
     )
     reader.read()
     assert (
@@ -179,3 +180,42 @@ def test_readjson_unicode(monkeypatch):
     result = read_json(path)
     expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows", [1, 2])
+def test_readjson_nrows(nrows):
+    # GH 33916
+    # Test reading line-format JSON into a DataFrame with the nrows param
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    result = pd.read_json(jsonl, lines=True, nrows=nrows)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
+def test_readjson_nrows_chunks(nrows, chunksize):
+    # GH 33916
+    # Test reading line-format JSON into a DataFrame with the nrows and chunksize params
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
+    chunked = pd.concat(reader)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(chunked, expected)
+
+
+def test_readjson_nrows_requires_lines():
+    # GH 33916
+    # Test that a ValueError is raised if nrows is set without lines=True in read_json
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    msg = "nrows can only be passed if lines=True"
+    with pytest.raises(ValueError, match=msg):
+        pd.read_json(jsonl, lines=False, nrows=2)
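
For reviewers, a quick usage sketch of the new parameter. This snippet is not part of the patch; the data and variable names are illustrative only, and the behaviour shown follows directly from the diff above. Note that with both parameters set, the iterator stops on chunk boundaries (the nrows_seen check runs before each chunk), which is why the tests keep nrows a multiple of chunksize.

import pandas as pd

# Four records of line-delimited JSON (illustrative data only).
jsonl = (
    '{"a": 1, "b": 2}\n'
    '{"a": 3, "b": 4}\n'
    '{"a": 5, "b": 6}\n'
    '{"a": 7, "b": 8}\n'
)

# Read only the first two lines; nrows requires lines=True.
df = pd.read_json(jsonl, lines=True, nrows=2)
assert list(df["a"]) == [1, 3]

# nrows also caps a chunked reader; keep nrows a multiple of
# chunksize for an exact cut, since __next__ stops per chunk.
reader = pd.read_json(jsonl, lines=True, nrows=4, chunksize=2)
assert len(pd.concat(reader)) == 4

# Passing nrows without lines=True raises, per the new validation.
try:
    pd.read_json(jsonl, nrows=2)
except ValueError as err:
    assert "nrows can only be passed if lines=True" in str(err)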
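The memory win in JsonReader.read() comes from plain itertools.islice over a lazy line iterator, so only the first nrows lines ever leave the buffer. A minimal standalone sketch of the same technique (head_lines is a hypothetical helper, not part of the patch):

from io import StringIO
from itertools import islice

def head_lines(buf, nrows):
    # Consume at most nrows lines from a file-like object without
    # materializing the rest of the stream.
    return list(islice(buf, nrows))

buf = StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n')
first_two = head_lines(buf, 2)
assert first_two == ['{"a": 1}\n', '{"a": 2}\n']
# The remainder is still unread and available on the same buffer.
assert buf.readline() == '{"a": 3}\n'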