Skip to content

Commit 89c5a59

Browse files
authored
Add nrows to read json. (#33962)
1 parent 157cb20 commit 89c5a59

File tree

4 files changed

+91
-17
lines changed

4 files changed

+91
-17
lines changed

asv_bench/benchmarks/io/json.py

+6
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,18 @@ def time_read_json_lines(self, index):
5353
def time_read_json_lines_concat(self, index):
5454
concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
5555

56+
def time_read_json_lines_nrows(self, index):
57+
read_json(self.fname, orient="records", lines=True, nrows=25000)
58+
5659
def peakmem_read_json_lines(self, index):
5760
read_json(self.fname, orient="records", lines=True)
5861

5962
def peakmem_read_json_lines_concat(self, index):
6063
concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
6164

65+
def peakmem_read_json_lines_nrows(self, index):
66+
read_json(self.fname, orient="records", lines=True, nrows=15000)
67+
6268

6369
class ToJSON(BaseIO):
6470

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ Other enhancements
289289
- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
290290
- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
291291
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
292+
- :meth:`~pandas.io.json.read_json` now accepts an ``nrows`` parameter. (:issue:`33916`).
292293
- :meth:`~pandas.io.gbq.read_gbq` now allows disabling the progress bar (:issue:`33360`).
293294

294295
.. ---------------------------------------------------------------------------

pandas/io/json/_json.py

+44-17
Original file line numberDiff line numberDiff line change
@@ -355,14 +355,15 @@ def read_json(
355355
dtype=None,
356356
convert_axes=None,
357357
convert_dates=True,
358-
keep_default_dates=True,
359-
numpy=False,
360-
precise_float=False,
358+
keep_default_dates: bool = True,
359+
numpy: bool = False,
360+
precise_float: bool = False,
361361
date_unit=None,
362362
encoding=None,
363-
lines=False,
364-
chunksize=None,
363+
lines: bool = False,
364+
chunksize: Optional[int] = None,
365365
compression="infer",
366+
nrows: Optional[int] = None,
366367
):
367368
"""
368369
Convert a JSON string to pandas object.
@@ -493,13 +494,21 @@ def read_json(
493494
for more information on ``chunksize``.
494495
This can only be passed if `lines=True`.
495496
If this is None, the file will be read into memory all at once.
497+
496498
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
497499
For on-the-fly decompression of on-disk data. If 'infer', then use
498500
gzip, bz2, zip or xz if path_or_buf is a string ending in
499501
'.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
500502
otherwise. If using 'zip', the ZIP file must contain only one data
501503
file to be read in. Set to None for no decompression.
502504
505+
nrows : int, optional
506+
The number of lines from the line-delimited JSON file that has to be read.
507+
This can only be passed if `lines=True`.
508+
If this is None, all the rows will be returned.
509+
510+
.. versionadded:: 1.1
511+
503512
Returns
504513
-------
505514
Series or DataFrame
@@ -600,6 +609,7 @@ def read_json(
600609
lines=lines,
601610
chunksize=chunksize,
602611
compression=compression,
612+
nrows=nrows,
603613
)
604614

605615
if chunksize:
@@ -629,14 +639,15 @@ def __init__(
629639
dtype,
630640
convert_axes,
631641
convert_dates,
632-
keep_default_dates,
633-
numpy,
634-
precise_float,
642+
keep_default_dates: bool,
643+
numpy: bool,
644+
precise_float: bool,
635645
date_unit,
636646
encoding,
637-
lines,
638-
chunksize,
647+
lines: bool,
648+
chunksize: Optional[int],
639649
compression,
650+
nrows: Optional[int],
640651
):
641652

642653
self.orient = orient
@@ -654,11 +665,16 @@ def __init__(
654665
self.chunksize = chunksize
655666
self.nrows_seen = 0
656667
self.should_close = False
668+
self.nrows = nrows
657669

658670
if self.chunksize is not None:
659671
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
660672
if not self.lines:
661673
raise ValueError("chunksize can only be passed if lines=True")
674+
if self.nrows is not None:
675+
self.nrows = _validate_integer("nrows", self.nrows, 0)
676+
if not self.lines:
677+
raise ValueError("nrows can only be passed if lines=True")
662678

663679
data = self._get_data_from_filepath(filepath_or_buffer)
664680
self.data = self._preprocess_data(data)
@@ -671,9 +687,9 @@ def _preprocess_data(self, data):
671687
If self.chunksize, we prepare the data for the `__next__` method.
672688
Otherwise, we read it into memory for the `read` method.
673689
"""
674-
if hasattr(data, "read") and not self.chunksize:
690+
if hasattr(data, "read") and (not self.chunksize or not self.nrows):
675691
data = data.read()
676-
if not hasattr(data, "read") and self.chunksize:
692+
if not hasattr(data, "read") and (self.chunksize or self.nrows):
677693
data = StringIO(data)
678694

679695
return data
@@ -721,11 +737,17 @@ def read(self):
721737
"""
722738
Read the whole JSON input into a pandas object.
723739
"""
724-
if self.lines and self.chunksize:
725-
obj = concat(self)
726-
elif self.lines:
727-
data = ensure_str(self.data)
728-
obj = self._get_object_parser(self._combine_lines(data.split("\n")))
740+
if self.lines:
741+
if self.chunksize:
742+
obj = concat(self)
743+
elif self.nrows:
744+
lines = list(islice(self.data, self.nrows))
745+
lines_json = self._combine_lines(lines)
746+
obj = self._get_object_parser(lines_json)
747+
else:
748+
data = ensure_str(self.data)
749+
data = data.split("\n")
750+
obj = self._get_object_parser(self._combine_lines(data))
729751
else:
730752
obj = self._get_object_parser(self.data)
731753
self.close()
@@ -772,6 +794,11 @@ def close(self):
772794
pass
773795

774796
def __next__(self):
797+
if self.nrows:
798+
if self.nrows_seen >= self.nrows:
799+
self.close()
800+
raise StopIteration
801+
775802
lines = list(islice(self.data, self.chunksize))
776803
if lines:
777804
lines_json = self._combine_lines(lines)

pandas/tests/io/json/test_readlines.py

+40
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
130130
lines=True,
131131
chunksize=chunksize,
132132
compression=None,
133+
nrows=None,
133134
)
134135
reader.read()
135136
assert (
@@ -179,3 +180,42 @@ def test_readjson_unicode(monkeypatch):
179180
result = read_json(path)
180181
expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
181182
tm.assert_frame_equal(result, expected)
183+
184+
185+
@pytest.mark.parametrize("nrows", [1, 2])
186+
def test_readjson_nrows(nrows):
187+
# GH 33916
188+
# Test reading line-format JSON to DataFrame with nrows param
189+
jsonl = """{"a": 1, "b": 2}
190+
{"a": 3, "b": 4}
191+
{"a": 5, "b": 6}
192+
{"a": 7, "b": 8}"""
193+
result = pd.read_json(jsonl, lines=True, nrows=nrows)
194+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
195+
tm.assert_frame_equal(result, expected)
196+
197+
198+
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
199+
def test_readjson_nrows_chunks(nrows, chunksize):
200+
# GH 33916
201+
# Test reading line-format JSON to DataFrame with nrows and chunksize param
202+
jsonl = """{"a": 1, "b": 2}
203+
{"a": 3, "b": 4}
204+
{"a": 5, "b": 6}
205+
{"a": 7, "b": 8}"""
206+
reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
207+
chunked = pd.concat(reader)
208+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
209+
tm.assert_frame_equal(chunked, expected)
210+
211+
212+
def test_readjson_nrows_requires_lines():
213+
# GH 33916
214+
# Test ValueError raised if nrows is set without setting lines in read_json
215+
jsonl = """{"a": 1, "b": 2}
216+
{"a": 3, "b": 4}
217+
{"a": 5, "b": 6}
218+
{"a": 7, "b": 8}"""
219+
msg = "nrows can only be passed if lines=True"
220+
with pytest.raises(ValueError, match=msg):
221+
pd.read_json(jsonl, lines=False, nrows=2)

0 commit comments

Comments
 (0)