Skip to content

Add nrows to read json. #33962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
15d1d1e
ENH Add nrow parameter for line delimited json for read_json #33916
hasnain2808 May 4, 2020
fc4993f
ENH solve linting via black8 for Add nrow parameter for line delimite…
hasnain2808 May 4, 2020
028d398
optimized list indexing and type hints added
hasnain2808 May 7, 2020
8765192
solved errors related to typing of args and linting issues
hasnain2808 May 7, 2020
ca9c3e0
use an iterator to slice strings
hasnain2808 May 9, 2020
b355f9c
Update pandas/io/json/_json.py fixed typo
hasnain2808 May 19, 2020
74e9c2b
fixed errors with nrows iterators
hasnain2808 May 19, 2020
237010e
remove print statements
hasnain2808 May 20, 2020
b0b0d69
refactor nrows for json files to use a simpler regular expression
hasnain2808 May 22, 2020
0a6717a
remove debug lines
hasnain2808 May 22, 2020
db50e92
add test check if ValueError is raised if nrows is set and lines in n…
hasnain2808 May 24, 2020
3b139b3
refactor to use generators better
hasnain2808 May 25, 2020
00a474d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 May 25, 2020
730d6d8
Add related issue number and comments for tests
hasnain2808 May 25, 2020
b6a9499
use StringIO iterator for nrows as used in chunks
hasnain2808 May 28, 2020
8c830b4
add asv benchmarks for nrows in read_json
hasnain2808 May 29, 2020
5c55339
add benchmarks to read a single chunk
hasnain2808 May 30, 2020
d19309a
chunksize 1,100,10000 benchmarks
hasnain2808 May 30, 2020
dec797e
remove wrong benchmarks
hasnain2808 May 30, 2020
91e0b94
remove wrong benchmarks
hasnain2808 May 30, 2020
c010399
add whatsnew and remove unwanted benchmarks
hasnain2808 Jun 3, 2020
2355fc5
remove conflict
hasnain2808 Jun 3, 2020
2648c3d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 3, 2020
7fcf3db
add whatsnew for nrows
hasnain2808 Jun 3, 2020
9e667a1
solve doc error
hasnain2808 Jun 3, 2020
cb3de4d
remove merge conflict lines
hasnain2808 Jun 4, 2020
d14ff45
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
2ce74db
added the conflicting line back
hasnain2808 Jun 4, 2020
133aef9
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
b3ee647
Merge remote-tracking branch 'upstream/master' into add-nrows-to-read…
hasnain2808 Jun 4, 2020
b9a3ebd
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,18 @@ def time_read_json_lines(self, index):
def time_read_json_lines_concat(self, index):
    # Time a chunked line-delimited read stitched back into a single frame.
    chunk_reader = read_json(self.fname, orient="records", lines=True, chunksize=25000)
    concat(chunk_reader)

def time_read_json_lines_nrows(self, index):
    # Time reading only the first 25000 lines of a line-delimited JSON file.
    row_limit = 25000
    read_json(self.fname, orient="records", lines=True, nrows=row_limit)

def peakmem_read_json_lines(self, index):
    # Peak-memory benchmark for a full, unchunked line-delimited read.
    read_kwargs = dict(orient="records", lines=True)
    read_json(self.fname, **read_kwargs)

def peakmem_read_json_lines_concat(self, index):
    # Peak-memory benchmark: chunked read followed by concatenation.
    chunk_reader = read_json(self.fname, orient="records", lines=True, chunksize=25000)
    concat(chunk_reader)

def peakmem_read_json_lines_nrows(self, index):
    # Peak-memory benchmark for a row-limited line-delimited read.
    row_limit = 15000
    read_json(self.fname, orient="records", lines=True, nrows=row_limit)


class ToJSON(BaseIO):

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ Other enhancements
- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
- :meth:`~pandas.io.json.read_json` now accepts an ``nrows`` parameter. (:issue:`33916`).
- :meth:`~pandas.io.gbq.read_gbq` now allows disabling the progress bar (:issue:`33360`).

.. ---------------------------------------------------------------------------
Expand Down
61 changes: 44 additions & 17 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,15 @@ def read_json(
dtype=None,
convert_axes=None,
convert_dates=True,
keep_default_dates=True,
numpy=False,
precise_float=False,
keep_default_dates: bool = True,
numpy: bool = False,
precise_float: bool = False,
date_unit=None,
encoding=None,
lines=False,
chunksize=None,
lines: bool = False,
chunksize: Optional[int] = None,
compression="infer",
nrows: Optional[int] = None,
):
"""
Convert a JSON string to pandas object.
Expand Down Expand Up @@ -493,13 +494,21 @@ def read_json(
for more information on ``chunksize``.
This can only be passed if `lines=True`.
If this is None, the file will be read into memory all at once.

compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.

nrows : int, optional
The number of lines from the line-delimited JSON file that have to be read.
This can only be passed if `lines=True`.
If this is None, all the rows will be returned.

.. versionadded:: 1.1

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -600,6 +609,7 @@ def read_json(
lines=lines,
chunksize=chunksize,
compression=compression,
nrows=nrows,
)

if chunksize:
Expand Down Expand Up @@ -629,14 +639,15 @@ def __init__(
dtype,
convert_axes,
convert_dates,
keep_default_dates,
numpy,
precise_float,
keep_default_dates: bool,
numpy: bool,
precise_float: bool,
date_unit,
encoding,
lines,
chunksize,
lines: bool,
chunksize: Optional[int],
compression,
nrows: Optional[int],
):

self.orient = orient
Expand All @@ -654,11 +665,16 @@ def __init__(
self.chunksize = chunksize
self.nrows_seen = 0
self.should_close = False
self.nrows = nrows

if self.chunksize is not None:
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
if not self.lines:
raise ValueError("chunksize can only be passed if lines=True")
if self.nrows is not None:
self.nrows = _validate_integer("nrows", self.nrows, 0)
if not self.lines:
raise ValueError("nrows can only be passed if lines=True")

data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)
Expand All @@ -671,9 +687,9 @@ def _preprocess_data(self, data):
If self.chunksize, we prepare the data for the `__next__` method.
Otherwise, we read it into memory for the `read` method.
"""
if hasattr(data, "read") and not self.chunksize:
if hasattr(data, "read") and (not self.chunksize or not self.nrows):
data = data.read()
if not hasattr(data, "read") and self.chunksize:
if not hasattr(data, "read") and (self.chunksize or self.nrows):
data = StringIO(data)

return data
Expand Down Expand Up @@ -721,11 +737,17 @@ def read(self):
"""
Read the whole JSON input into a pandas object.
"""
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:
data = ensure_str(self.data)
obj = self._get_object_parser(self._combine_lines(data.split("\n")))
if self.lines:
if self.chunksize:
obj = concat(self)
elif self.nrows:
lines = list(islice(self.data, self.nrows))
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
else:
data = ensure_str(self.data)
data = data.split("\n")
obj = self._get_object_parser(self._combine_lines(data))
else:
obj = self._get_object_parser(self.data)
self.close()
Expand Down Expand Up @@ -772,6 +794,11 @@ def close(self):
pass

def __next__(self):
if self.nrows:
if self.nrows_seen >= self.nrows:
self.close()
raise StopIteration

lines = list(islice(self.data, self.chunksize))
if lines:
lines_json = self._combine_lines(lines)
Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/io/json/test_readlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
lines=True,
chunksize=chunksize,
compression=None,
nrows=None,
)
reader.read()
assert (
Expand Down Expand Up @@ -179,3 +180,42 @@ def test_readjson_unicode(monkeypatch):
result = read_json(path)
expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows):
    # GH 33916
    # Test reading line-delimited JSON to a DataFrame with the nrows param.
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    # Wrap in StringIO: passing literal JSON strings to read_json is
    # deprecated (and later removed) in pandas.
    result = pd.read_json(StringIO(jsonl), lines=True, nrows=nrows)
    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(nrows, chunksize):
    # GH 33916
    # Test reading line-delimited JSON to a DataFrame with both the nrows
    # and chunksize params: the concatenated chunks must stop at nrows rows.
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    # Wrap in StringIO: passing literal JSON strings to read_json is
    # deprecated (and later removed) in pandas.
    reader = read_json(StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize)
    chunked = pd.concat(reader)
    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(chunked, expected)


def test_readjson_nrows_requires_lines():
    # GH 33916
    # Test ValueError raised if nrows is set without lines=True in read_json.
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        # Wrap in StringIO: passing literal JSON strings to read_json is
        # deprecated (and later removed) in pandas.
        pd.read_json(StringIO(jsonl), lines=False, nrows=2)