Skip to content

Commit 15d1d1e

Browse files
committed
ENH Add nrow parameter for line delimited json for read_json #33916
1 parent dd84044 commit 15d1d1e

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

pandas/io/json/_json.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ def read_json(
363363
lines=False,
364364
chunksize=None,
365365
compression="infer",
366+
nrows=None,
366367
):
367368
"""
368369
Convert a JSON string to pandas object.
@@ -493,6 +494,12 @@ def read_json(
493494
for more information on ``chunksize``.
494495
This can only be passed if `lines=True`.
495496
If this is None, the file will be read into memory all at once.
497+
498+
nrows : int, optional
499+
The number of lines to read from the line-delimited json file.
500+
This can only be passed if `lines=True`.
501+
If this is None, all the rows will be returned.
502+
496503
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
497504
For on-the-fly decompression of on-disk data. If 'infer', then use
498505
gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -600,6 +607,7 @@ def read_json(
600607
lines=lines,
601608
chunksize=chunksize,
602609
compression=compression,
610+
nrows=nrows,
603611
)
604612

605613
if chunksize:
@@ -637,6 +645,7 @@ def __init__(
637645
lines,
638646
chunksize,
639647
compression,
648+
nrows,
640649
):
641650

642651
self.path_or_buf = filepath_or_buffer
@@ -655,11 +664,16 @@ def __init__(
655664
self.chunksize = chunksize
656665
self.nrows_seen = 0
657666
self.should_close = False
667+
self.nrows = nrows
658668

659669
if self.chunksize is not None:
660670
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
661671
if not self.lines:
662672
raise ValueError("chunksize can only be passed if lines=True")
673+
if self.nrows is not None:
674+
self.nrows = _validate_integer("nrows", self.nrows, 0)
675+
if not self.lines:
676+
raise ValueError("nrows can only be passed if lines=True")
663677

664678
data = self._get_data_from_filepath(filepath_or_buffer)
665679
self.data = self._preprocess_data(data)
@@ -726,7 +740,10 @@ def read(self):
726740
obj = concat(self)
727741
elif self.lines:
728742
data = ensure_str(self.data)
729-
obj = self._get_object_parser(self._combine_lines(data.split("\n")))
743+
data = data.split("\n")
744+
if self.nrows:
745+
data = data[:self.nrows]
746+
obj = self._get_object_parser(self._combine_lines(data))
730747
else:
731748
obj = self._get_object_parser(self.data)
732749
self.close()
@@ -773,6 +790,11 @@ def close(self):
773790
pass
774791

775792
def __next__(self):
793+
if self.nrows:
794+
if self.nrows_seen >= self.nrows:
795+
self.close()
796+
raise StopIteration
797+
776798
lines = list(islice(self.data, self.chunksize))
777799
if lines:
778800
lines_json = self._combine_lines(lines)

pandas/tests/io/json/test_readlines.py

+26
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
130130
lines=True,
131131
chunksize=chunksize,
132132
compression=None,
133+
nrows=None
133134
)
134135
reader.read()
135136
assert (
@@ -179,3 +180,28 @@ def test_readjson_unicode(monkeypatch):
179180
result = read_json(path)
180181
expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
181182
tm.assert_frame_equal(result, expected)
183+
184+
185+
@pytest.mark.parametrize("nrows", [1, 2])
186+
def test_readjson_nrows(nrows):
187+
# Test reading line-format JSON to Series with nrows param
188+
jsonl = '''{"a": 1, "b": 2}
189+
{"a": 3, "b": 4}
190+
{"a": 5, "b": 6}
191+
{"a": 7, "b": 8}'''
192+
result = pd.read_json(jsonl, lines=True, nrows=nrows)
193+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
194+
tm.assert_frame_equal(result, expected)
195+
196+
197+
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
198+
def test_readjson_nrows_chunks(nrows, chunksize):
199+
# Test reading line-format JSON to Series with nrows and chunksize param
200+
jsonl = '''{"a": 1, "b": 2}
201+
{"a": 3, "b": 4}
202+
{"a": 5, "b": 6}
203+
{"a": 7, "b": 8}'''
204+
reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
205+
chunked = pd.concat(reader)
206+
expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
207+
tm.assert_frame_equal(chunked, expected)

0 commit comments

Comments
 (0)