Add nrows to read json. #33962


Merged: 31 commits, Jun 4, 2020

Changes from 7 commits

Commits (31)
15d1d1e
ENH Add nrow parameter for line delimited json for read_json #33916
hasnain2808 May 4, 2020
fc4993f
ENH solve linting via black8 for Add nrow parameter for line delimite…
hasnain2808 May 4, 2020
028d398
optimized list indexing and type hints added
hasnain2808 May 7, 2020
8765192
solved errors related to typing of args and linting issues
hasnain2808 May 7, 2020
ca9c3e0
use an iterator to slice strings
hasnain2808 May 9, 2020
b355f9c
Update pandas/io/json/_json.py fixed typo
hasnain2808 May 19, 2020
74e9c2b
fixed errors with nrows iterators
hasnain2808 May 19, 2020
237010e
remove print statements
hasnain2808 May 20, 2020
b0b0d69
refactor nrows for json files to use a simpler regular expression
hasnain2808 May 22, 2020
0a6717a
remove debug lines
hasnain2808 May 22, 2020
db50e92
add test check if ValueError is raised if nrows is set and lines in n…
hasnain2808 May 24, 2020
3b139b3
refactor to use generators better
hasnain2808 May 25, 2020
00a474d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 May 25, 2020
730d6d8
Add related issue number and comments for tests
hasnain2808 May 25, 2020
b6a9499
use StringIO iterator for nrows as used in chunks
hasnain2808 May 28, 2020
8c830b4
add asv benchmarks for nrows in read_json
hasnain2808 May 29, 2020
5c55339
add benchmarks to read a single chunk
hasnain2808 May 30, 2020
d19309a
chunksize 1,100,10000 benchmarks
hasnain2808 May 30, 2020
dec797e
remove wrong benchmarks
hasnain2808 May 30, 2020
91e0b94
remove wrong benchmarks
hasnain2808 May 30, 2020
c010399
add whatsnew and remove unwanted benchmarks
hasnain2808 Jun 3, 2020
2355fc5
remove conflict
hasnain2808 Jun 3, 2020
2648c3d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 3, 2020
7fcf3db
add whatsnew for nrows
hasnain2808 Jun 3, 2020
9e667a1
solve doc error
hasnain2808 Jun 3, 2020
cb3de4d
remove merge conflict lines
hasnain2808 Jun 4, 2020
d14ff45
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
2ce74db
added the conflicting line back
hasnain2808 Jun 4, 2020
133aef9
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
b3ee647
Merge remote-tracking branch 'upstream/master' into add-nrows-to-read…
hasnain2808 Jun 4, 2020
b9a3ebd
Merge branch 'master' of https://github.com/pandas-dev/pandas into ad…
hasnain2808 Jun 4, 2020
60 changes: 49 additions & 11 deletions pandas/io/json/_json.py
@@ -3,6 +3,7 @@
 from io import StringIO
 from itertools import islice
 import os
+import re
 from typing import Any, Callable, Optional, Type
 
 import numpy as np
@@ -355,14 +356,15 @@ def read_json(
     dtype=None,
     convert_axes=None,
     convert_dates=True,
-    keep_default_dates=True,
-    numpy=False,
-    precise_float=False,
+    keep_default_dates: bool = True,
+    numpy: bool = False,
+    precise_float: bool = False,
     date_unit=None,
     encoding=None,
-    lines=False,
-    chunksize=None,
+    lines: bool = False,
+    chunksize: Optional[int] = None,
     compression="infer",
+    nrows: Optional[int] = None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -493,13 +495,21 @@ def read_json(
         for more information on ``chunksize``.
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
         gzip, bz2, zip or xz if path_or_buf is a string ending in
         '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
         otherwise. If using 'zip', the ZIP file must contain only one data
         file to be read in. Set to None for no decompression.
+
+    nrows : int, optional
+        The number of lines to read from the line-delimited JSON file.
+        This can only be passed if `lines=True`.
+        If this is None, all the rows will be returned.
+
+        .. versionadded:: 1.1
 
     Returns
     -------
     Series or DataFrame
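
For orientation, a minimal usage sketch of the new keyword as documented above (the sample data is illustrative, not taken from the PR):

import pandas as pd

jsonl = '{"a": 1}\n{"a": 2}\n{"a": 3}\n'

# With lines=True, nrows limits how many records are parsed;
# nrows=None (the default) reads all three.
df = pd.read_json(jsonl, lines=True, nrows=2)
assert list(df["a"]) == [1, 2]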
@@ -600,6 +609,7 @@ def read_json(
         lines=lines,
         chunksize=chunksize,
         compression=compression,
+        nrows=nrows,
     )
 
     if chunksize:
@@ -629,14 +639,15 @@ def __init__(
         dtype,
         convert_axes,
         convert_dates,
-        keep_default_dates,
-        numpy,
-        precise_float,
+        keep_default_dates: bool,
+        numpy: bool,
+        precise_float: bool,
         date_unit,
         encoding,
-        lines,
-        chunksize,
+        lines: bool,
+        chunksize: Optional[int],
         compression,
+        nrows: Optional[int],
     ):
 
         self.path_or_buf = filepath_or_buffer
@@ -655,11 +666,16 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.should_close = False
+        self.nrows = nrows
 
         if self.chunksize is not None:
             self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+        if self.nrows is not None:
+            self.nrows = _validate_integer("nrows", self.nrows, 0)
+            if not self.lines:
+                raise ValueError("nrows can only be passed if lines=True")
 
         data = self._get_data_from_filepath(filepath_or_buffer)
         self.data = self._preprocess_data(data)
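
The nrows validation mirrors the existing chunksize check: both keywords require lines=True, and _validate_integer enforces an integer of at least 0. A small sketch of the resulting behaviour, assuming the error message from the diff above:

import pandas as pd

try:
    pd.read_json('{"a": 1}', nrows=1)  # lines defaults to False
except ValueError as err:
    print(err)  # "nrows can only be passed if lines=True"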
@@ -726,7 +742,23 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
-            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
+            if self.nrows:
+                compiled_pattern = re.compile("\n")
+                data_iterator = compiled_pattern.finditer(data)
+                data_surrogate = []
+                start = 0
+                nrows_seen = 0
+                for vals in data_iterator:
+                    if nrows_seen >= self.nrows:
+                        break
+                    begin, end = vals.span()
+                    data_surrogate.append(data[start:begin].strip())
+                    start = end
+                    nrows_seen += 1
+                data = data_surrogate
+            else:
+                data = data.split("\n")
+            obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
         self.close()
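
The read() path above extracts the first nrows lines from the decoded string with re.finditer rather than splitting the entire file into a list up front. A self-contained sketch of the same technique; the function name is illustrative, not a pandas internal:

import re

def first_n_lines(data: str, nrows: int) -> list:
    # Walk the newline matches lazily and stop once nrows lines are
    # collected, so the tail of a large string is never split.
    lines = []
    start = 0
    for match in re.finditer("\n", data):
        if len(lines) >= nrows:
            break
        begin, end = match.span()
        lines.append(data[start:begin].strip())
        start = end
    return lines

print(first_n_lines('{"a": 1}\n{"a": 2}\n{"a": 3}\n', 2))  # ['{"a": 1}', '{"a": 2}']

Like the loop in the diff, this only collects newline-terminated lines, so a final line without a trailing newline is dropped when nrows exceeds the number of complete lines.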
@@ -773,6 +807,11 @@ def close(self):
                 pass
 
     def __next__(self):
+        if self.nrows:
+            if self.nrows_seen >= self.nrows:
+                self.close()
+                raise StopIteration
+
         lines = list(islice(self.data, self.chunksize))
         if lines:
             lines_json = self._combine_lines(lines)
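When chunksize and nrows are combined, the __next__ guard above stops iteration once nrows_seen (which the existing chunking code advances by the size of each emitted chunk) reaches nrows, so whole chunks are produced until the limit is hit. A hedged sketch of the caller-side behaviour:

import pandas as pd

jsonl = '{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n'
reader = pd.read_json(jsonl, lines=True, chunksize=2, nrows=2)
chunks = list(reader)  # one chunk of two rows; the remaining lines are never parsed
assert len(chunks) == 1 and len(chunks[0]) == 2
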
26 changes: 26 additions & 0 deletions pandas/tests/io/json/test_readlines.py
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
         lines=True,
         chunksize=chunksize,
         compression=None,
+        nrows=None,
     )
     reader.read()
     assert (
@@ -179,3 +180,28 @@ def test_readjson_unicode(monkeypatch):
         result = read_json(path)
         expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows", [1, 2])
+def test_readjson_nrows(nrows):
+    # Test reading line-format JSON to DataFrame with nrows param
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    result = pd.read_json(jsonl, lines=True, nrows=nrows)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
+def test_readjson_nrows_chunks(nrows, chunksize):
+    # Test reading line-format JSON to DataFrame with nrows and chunksize param
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
+    chunked = pd.concat(reader)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(chunked, expected)
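
A later commit in this PR (db50e92) adds a test that ValueError is raised when nrows is set without lines=True; that commit is outside the seven shown here, but based on the validation in _json.py a sketch of such a test could look like this (reusing the module's existing pytest and pd imports):

@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows_requires_lines(nrows):
    # nrows is only valid together with lines=True
    jsonl = '{"a": 1, "b": 2}'
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(jsonl, lines=False, nrows=nrows)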