Add nrows to read json. #33962


Merged: 31 commits, Jun 4, 2020

Changes from 7 commits

Commits (31)
15d1d1e
ENH Add nrow parameter for line delimited json for read_json #33916
hasnain2808 May 4, 2020
fc4993f
ENH solve linting via black8 for Add nrow parameter for line delimite…
hasnain2808 May 4, 2020
028d398
optimized list indexing and type hints added
hasnain2808 May 7, 2020
8765192
solved errors related to typing of args and linting issues
hasnain2808 May 7, 2020
ca9c3e0
use an iterator to slice strings
hasnain2808 May 9, 2020
b355f9c
Update pandas/io/json/_json.py fixed typo
hasnain2808 May 19, 2020
74e9c2b
fixed errors with nrows iterators
hasnain2808 May 19, 2020
237010e
remove print statements
hasnain2808 May 20, 2020
b0b0d69
refactor nrows for json files to use a simpler regular expression
hasnain2808 May 22, 2020
0a6717a
remove debug lines
hasnain2808 May 22, 2020
db50e92
add test check if ValueError is raised if nrows is set and lines in n…
hasnain2808 May 24, 2020
3b139b3
refactor to use generators better
hasnain2808 May 25, 2020
00a474d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 May 25, 2020
730d6d8
Add related issue number and comments for tests
hasnain2808 May 25, 2020
b6a9499
use StringIO iterator for nrows as used in chunks
hasnain2808 May 28, 2020
8c830b4
add asv benchmarks for nrows in read_json
hasnain2808 May 29, 2020
5c55339
add benchmarks to read a single chunk
hasnain2808 May 30, 2020
d19309a
chunksize 1,100,10000 benchmarks
hasnain2808 May 30, 2020
dec797e
remove wrong benchmarks
hasnain2808 May 30, 2020
91e0b94
remove wrong benchmarks
hasnain2808 May 30, 2020
c010399
add whatsnew and remove unwanted benchmarks
hasnain2808 Jun 3, 2020
2355fc5
remove conflict
hasnain2808 Jun 3, 2020
2648c3d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 3, 2020
7fcf3db
add whatsnew for nrows
hasnain2808 Jun 3, 2020
9e667a1
solve doc error
hasnain2808 Jun 3, 2020
cb3de4d
remove merge conflict lines
hasnain2808 Jun 4, 2020
d14ff45
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
2ce74db
added the conflicting line back
hasnain2808 Jun 4, 2020
133aef9
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
b3ee647
Merge remote-tracking branch 'upstream/master' into add-nrows-to-read…
hasnain2808 Jun 4, 2020
b9a3ebd
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
60 changes: 49 additions & 11 deletions pandas/io/json/_json.py
@@ -3,6 +3,7 @@
 from io import StringIO
 from itertools import islice
 import os
+import re
 from typing import Any, Callable, Optional, Type
 
 import numpy as np
@@ -355,14 +356,15 @@ def read_json(
     dtype=None,
     convert_axes=None,
     convert_dates=True,
-    keep_default_dates=True,
-    numpy=False,
-    precise_float=False,
+    keep_default_dates: bool = True,
+    numpy: bool = False,
+    precise_float: bool = False,
     date_unit=None,
     encoding=None,
-    lines=False,
-    chunksize=None,
+    lines: bool = False,
+    chunksize: Optional[int] = None,
     compression="infer",
+    nrows: Optional[int] = None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -493,13 +495,21 @@ def read_json(
         for more information on ``chunksize``.
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
         gzip, bz2, zip or xz if path_or_buf is a string ending in
         '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
         otherwise. If using 'zip', the ZIP file must contain only one data
         file to be read in. Set to None for no decompression.
+
+    nrows : int, optional
+        The number of lines to read from the line-delimited JSON file.
+        This can only be passed if `lines=True`.
+        If this is None, all the rows will be returned.
+
+        .. versionadded:: 1.1
 
     Returns
     -------
     Series or DataFrame
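
For orientation, a minimal usage sketch of the new keyword as documented above (the sample data is illustrative, not taken from the PR):

import pandas as pd

jsonl = '{"a": 1}\n{"a": 2}\n{"a": 3}\n'

# With lines=True, nrows limits how many records are parsed;
# nrows=None (the default) reads all three.
df = pd.read_json(jsonl, lines=True, nrows=2)
assert list(df["a"]) == [1, 2]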
@@ -600,6 +609,7 @@ def read_json(
         lines=lines,
         chunksize=chunksize,
         compression=compression,
+        nrows=nrows,
     )
 
     if chunksize:
@@ -629,14 +639,15 @@ def __init__(
         dtype,
         convert_axes,
         convert_dates,
-        keep_default_dates,
-        numpy,
-        precise_float,
+        keep_default_dates: bool,
+        numpy: bool,
+        precise_float: bool,
         date_unit,
         encoding,
-        lines,
-        chunksize,
+        lines: bool,
+        chunksize: Optional[int],
         compression,
+        nrows: Optional[int],
     ):
 
         self.path_or_buf = filepath_or_buffer
@@ -655,11 +666,16 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.should_close = False
+        self.nrows = nrows
 
         if self.chunksize is not None:
             self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+        if self.nrows is not None:
+            self.nrows = _validate_integer("nrows", self.nrows, 0)
+            if not self.lines:
+                raise ValueError("nrows can only be passed if lines=True")
 
         data = self._get_data_from_filepath(filepath_or_buffer)
         self.data = self._preprocess_data(data)
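
The nrows validation mirrors the existing chunksize check: both keywords require lines=True, and _validate_integer enforces an integer of at least 0. A small sketch of the resulting behaviour, assuming the error message from the diff above:

import pandas as pd

try:
    pd.read_json('{"a": 1}', nrows=1)  # lines defaults to False
except ValueError as err:
    print(err)  # "nrows can only be passed if lines=True"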
@@ -726,7 +742,23 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
-            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
+            if self.nrows:
+                compiled_pattern = re.compile("\n")
+                data_iterator = compiled_pattern.finditer(data)
+                data_surrogate = []
+                start = 0
+                nrows_seen = 0
+                for vals in data_iterator:
+                    if nrows_seen >= self.nrows:
+                        break
+                    begin, end = vals.span()
+                    data_surrogate.append(data[start:begin].strip())
+                    start = end
+                    nrows_seen += 1
+                data = data_surrogate
+            else:
+                data = data.split("\n")
+            obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
         self.close()
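
The read() path above extracts the first nrows lines from the decoded string with re.finditer rather than splitting the entire file into a list up front. A self-contained sketch of the same technique; the function name is illustrative, not a pandas internal:

import re

def first_n_lines(data: str, nrows: int) -> list:
    # Walk the newline matches lazily and stop once nrows lines are
    # collected, so the tail of a large string is never split.
    lines = []
    start = 0
    for match in re.finditer("\n", data):
        if len(lines) >= nrows:
            break
        begin, end = match.span()
        lines.append(data[start:begin].strip())
        start = end
    return lines

print(first_n_lines('{"a": 1}\n{"a": 2}\n{"a": 3}\n', 2))  # ['{"a": 1}', '{"a": 2}']

Like the loop in the diff, this only collects newline-terminated lines, so a final line without a trailing newline is dropped when nrows exceeds the number of complete lines.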
@@ -773,6 +807,11 @@ def close(self):
                 pass
 
     def __next__(self):
+        if self.nrows:
+            if self.nrows_seen >= self.nrows:
+                self.close()
+                raise StopIteration
+
         lines = list(islice(self.data, self.chunksize))
         if lines:
             lines_json = self._combine_lines(lines)
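When chunksize and nrows are combined, the __next__ guard above stops iteration once nrows_seen (which the existing chunking code advances by the size of each emitted chunk) reaches nrows, so whole chunks are produced until the limit is hit. A hedged sketch of the caller-side behaviour:

import pandas as pd

jsonl = '{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n'
reader = pd.read_json(jsonl, lines=True, chunksize=2, nrows=2)
chunks = list(reader)  # one chunk of two rows; the remaining lines are never parsed
assert len(chunks) == 1 and len(chunks[0]) == 2
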
26 changes: 26 additions & 0 deletions pandas/tests/io/json/test_readlines.py
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
         lines=True,
         chunksize=chunksize,
         compression=None,
+        nrows=None,
     )
     reader.read()
     assert (
@@ -179,3 +180,28 @@ def test_readjson_unicode(monkeypatch):
         result = read_json(path)
         expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows", [1, 2])
+def test_readjson_nrows(nrows):
+    # Test reading line-format JSON to DataFrame with nrows param
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    result = pd.read_json(jsonl, lines=True, nrows=nrows)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
+def test_readjson_nrows_chunks(nrows, chunksize):
+    # Test reading line-format JSON to DataFrame with nrows and chunksize param
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
+    chunked = pd.concat(reader)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(chunked, expected)
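
A later commit in this PR (db50e92) adds a test that ValueError is raised when nrows is set without lines=True; that commit is outside the seven shown here, but based on the validation in _json.py a sketch of such a test could look like this (reusing the module's existing pytest and pd imports):

@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows_requires_lines(nrows):
    # nrows is only valid together with lines=True
    jsonl = '{"a": 1, "b": 2}'
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(jsonl, lines=False, nrows=nrows)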