Skip to content

Commit 6e51a5c

Browse files
louispotok authored and alanbato committed
Add chunksize param to read_json when lines=True (pandas-dev#17168)
closes pandas-dev#17048
1 parent 50ab764 commit 6e51a5c

File tree

6 files changed

+383
-88
lines changed

6 files changed

+383
-88
lines changed

asv_bench/benchmarks/io_bench.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from .pandas_vb_common import *
23
from pandas import concat, Timestamp, compat
34
try:
@@ -192,3 +193,32 @@ def time_read_nrows(self, compression, engine):
192193
ext = ".bz2"
193194
pd.read_csv(self.big_fname + ext, nrows=10,
194195
compression=compression, engine=engine)
196+
197+
198+
class read_json_lines(object):
199+
goal_time = 0.2
200+
fname = "__test__.json"
201+
202+
def setup(self):
203+
self.N = 100000
204+
self.C = 5
205+
self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]))
206+
self.df.to_json(self.fname,orient="records",lines=True)
207+
208+
def teardown(self):
209+
try:
210+
os.remove(self.fname)
211+
except:
212+
pass
213+
214+
def time_read_json_lines(self):
215+
pd.read_json(self.fname, lines=True)
216+
217+
def time_read_json_lines_chunk(self):
218+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))
219+
220+
def peakmem_read_json_lines(self):
221+
pd.read_json(self.fname, lines=True)
222+
223+
def peakmem_read_json_lines_chunk(self):
224+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))

doc/source/io.rst

+10
Original file line numberDiff line numberDiff line change
@@ -1845,6 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
18451845
seconds, milliseconds, microseconds or nanoseconds respectively.
18461846
- ``lines`` : reads file as one json object per line.
18471847
- ``encoding`` : The encoding to use to decode py3 bytes.
1848+
- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
18481849

18491850
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
18501851

@@ -2049,6 +2050,10 @@ Line delimited json
20492050
pandas is able to read and write line-delimited json files that are common in data processing pipelines
20502051
using Hadoop or Spark.
20512052

2053+
.. versionadded:: 0.21.0
2054+
2055+
For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream.
2056+
20522057
.. ipython:: python
20532058
20542059
jsonl = '''
@@ -2059,6 +2064,11 @@ using Hadoop or Spark.
20592064
df
20602065
df.to_json(orient='records', lines=True)
20612066
2067+
# reader is an iterator that returns `chunksize` lines each iteration
2068+
reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
2069+
reader
2070+
for chunk in reader:
2071+
print(chunk)
20622072
20632073
.. _io.table_schema:
20642074

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ Other Enhancements
162162
- :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`)
163163
- :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
164164
- :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
165+
- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
165166
- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
166167

167168

pandas/io/json/json.py

+174-41
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# pylint: disable-msg=E1101,W0613,W0603
2+
from itertools import islice
23
import os
34
import numpy as np
45

@@ -8,8 +9,10 @@
89
from pandas import compat, isna
910
from pandas import Series, DataFrame, to_datetime, MultiIndex
1011
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
11-
_stringify_path)
12+
_stringify_path, BaseIterator)
13+
from pandas.io.parsers import _validate_integer
1214
from pandas.core.common import AbstractMethodError
15+
from pandas.core.reshape.concat import concat
1316
from pandas.io.formats.printing import pprint_thing
1417
from .normalize import _convert_to_line_delimits
1518
from .table_schema import build_table_schema
@@ -175,7 +178,7 @@ def write(self):
175178
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
176179
convert_axes=True, convert_dates=True, keep_default_dates=True,
177180
numpy=False, precise_float=False, date_unit=None, encoding=None,
178-
lines=False):
181+
lines=False, chunksize=None):
179182
"""
180183
Convert a JSON string to pandas object
181184
@@ -264,6 +267,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
264267
265268
.. versionadded:: 0.19.0
266269
270+
chunksize: integer, default None
271+
Return JsonReader object for iteration.
272+
See the `line-delimited json docs
273+
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
274+
for more information on ``chunksize``.
275+
This can only be passed if `lines=True`.
276+
If this is None, the file will be read into memory all at once.
277+
278+
.. versionadded:: 0.21.0
279+
267280
Returns
268281
-------
269282
result : Series or DataFrame, depending on the value of `typ`.
@@ -323,47 +336,167 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
323336

324337
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
325338
encoding=encoding)
326-
if isinstance(filepath_or_buffer, compat.string_types):
327-
try:
328-
exists = os.path.exists(filepath_or_buffer)
329-
330-
# if the filepath is too long will raise here
331-
# 5874
332-
except (TypeError, ValueError):
333-
exists = False
334-
335-
if exists:
336-
fh, handles = _get_handle(filepath_or_buffer, 'r',
337-
encoding=encoding)
338-
json = fh.read()
339-
fh.close()
339+
340+
json_reader = JsonReader(
341+
filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
342+
convert_axes=convert_axes, convert_dates=convert_dates,
343+
keep_default_dates=keep_default_dates, numpy=numpy,
344+
precise_float=precise_float, date_unit=date_unit, encoding=encoding,
345+
lines=lines, chunksize=chunksize
346+
)
347+
348+
if chunksize:
349+
return json_reader
350+
351+
return json_reader.read()
352+
353+
354+
class JsonReader(BaseIterator):
355+
"""
356+
JsonReader provides an interface for reading in a JSON file.
357+
358+
If initialized with ``lines=True`` and ``chunksize``, can be iterated over
359+
``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
360+
whole document.
361+
"""
362+
def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
363+
convert_dates, keep_default_dates, numpy, precise_float,
364+
date_unit, encoding, lines, chunksize):
365+
366+
self.path_or_buf = filepath_or_buffer
367+
self.orient = orient
368+
self.typ = typ
369+
self.dtype = dtype
370+
self.convert_axes = convert_axes
371+
self.convert_dates = convert_dates
372+
self.keep_default_dates = keep_default_dates
373+
self.numpy = numpy
374+
self.precise_float = precise_float
375+
self.date_unit = date_unit
376+
self.encoding = encoding
377+
self.lines = lines
378+
self.chunksize = chunksize
379+
self.nrows_seen = 0
380+
self.should_close = False
381+
382+
if self.chunksize is not None:
383+
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
384+
if not self.lines:
385+
raise ValueError("chunksize can only be passed if lines=True")
386+
387+
data = self._get_data_from_filepath(filepath_or_buffer)
388+
self.data = self._preprocess_data(data)
389+
390+
def _preprocess_data(self, data):
391+
"""
392+
At this point, the data either has a `read` attribute (e.g. a file
393+
object or a StringIO) or is a string that is a JSON document.
394+
395+
If self.chunksize, we prepare the data for the `__next__` method.
396+
Otherwise, we read it into memory for the `read` method.
397+
"""
398+
if hasattr(data, 'read') and not self.chunksize:
399+
data = data.read()
400+
if not hasattr(data, 'read') and self.chunksize:
401+
data = StringIO(data)
402+
403+
return data
404+
405+
def _get_data_from_filepath(self, filepath_or_buffer):
406+
"""
407+
read_json accepts three input types:
408+
1. filepath (string-like)
409+
2. file-like object (e.g. open file object, StringIO)
410+
3. JSON string
411+
412+
This method turns (1) into (2) to simplify the rest of the processing.
413+
It returns input types (2) and (3) unchanged.
414+
"""
415+
416+
data = filepath_or_buffer
417+
418+
if isinstance(data, compat.string_types):
419+
try:
420+
exists = os.path.exists(filepath_or_buffer)
421+
422+
# gh-5874: if the filepath is too long will raise here
423+
except (TypeError, ValueError):
424+
pass
425+
426+
else:
427+
if exists:
428+
data, _ = _get_handle(filepath_or_buffer, 'r',
429+
encoding=self.encoding)
430+
self.should_close = True
431+
self.open_stream = data
432+
433+
return data
434+
435+
def _combine_lines(self, lines):
436+
"""Combines a list of JSON objects into one JSON object"""
437+
lines = filter(None, map(lambda x: x.strip(), lines))
438+
return '[' + ','.join(lines) + ']'
439+
440+
def read(self):
441+
"""Read the whole JSON input into a pandas object"""
442+
if self.lines and self.chunksize:
443+
obj = concat(self)
444+
elif self.lines:
445+
obj = self._get_object_parser(
446+
self._combine_lines(self.data.split('\n'))
447+
)
340448
else:
341-
json = filepath_or_buffer
342-
elif hasattr(filepath_or_buffer, 'read'):
343-
json = filepath_or_buffer.read()
344-
else:
345-
json = filepath_or_buffer
449+
obj = self._get_object_parser(self.data)
450+
self.close()
451+
return obj
452+
453+
def _get_object_parser(self, json):
454+
"""parses a json document into a pandas object"""
455+
typ = self.typ
456+
dtype = self.dtype
457+
kwargs = {
458+
"orient": self.orient, "dtype": self.dtype,
459+
"convert_axes": self.convert_axes,
460+
"convert_dates": self.convert_dates,
461+
"keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
462+
"precise_float": self.precise_float, "date_unit": self.date_unit
463+
}
464+
obj = None
465+
if typ == 'frame':
466+
obj = FrameParser(json, **kwargs).parse()
467+
468+
if typ == 'series' or obj is None:
469+
if not isinstance(dtype, bool):
470+
dtype = dict(data=dtype)
471+
obj = SeriesParser(json, **kwargs).parse()
472+
473+
return obj
474+
475+
def close(self):
476+
"""
477+
If we opened a stream earlier, in _get_data_from_filepath, we should
478+
close it. If an open stream or file was passed, we leave it open.
479+
"""
480+
if self.should_close:
481+
try:
482+
self.open_stream.close()
483+
except (IOError, AttributeError):
484+
pass
346485

347-
if lines:
348-
# If given a json lines file, we break the string into lines, add
349-
# commas and put it in a json list to make a valid json object.
350-
lines = list(StringIO(json.strip()))
351-
json = '[' + ','.join(lines) + ']'
352-
353-
obj = None
354-
if typ == 'frame':
355-
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
356-
keep_default_dates, numpy, precise_float,
357-
date_unit).parse()
358-
359-
if typ == 'series' or obj is None:
360-
if not isinstance(dtype, bool):
361-
dtype = dict(data=dtype)
362-
obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
363-
keep_default_dates, numpy, precise_float,
364-
date_unit).parse()
365-
366-
return obj
486+
def __next__(self):
487+
lines = list(islice(self.data, self.chunksize))
488+
if lines:
489+
lines_json = self._combine_lines(lines)
490+
obj = self._get_object_parser(lines_json)
491+
492+
# Make sure that the returned objects have the right index.
493+
obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
494+
self.nrows_seen += len(obj)
495+
496+
return obj
497+
498+
self.close()
499+
raise StopIteration
367500

368501

369502
class Parser(object):

pandas/tests/io/json/test_pandas.py

-47
Original file line numberDiff line numberDiff line change
@@ -985,53 +985,6 @@ def test_tz_range_is_utc(self):
985985
df = DataFrame({'DT': dti})
986986
assert dumps(df, iso_dates=True) == dfexp
987987

988-
def test_read_jsonl(self):
989-
# GH9180
990-
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
991-
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
992-
assert_frame_equal(result, expected)
993-
994-
def test_read_jsonl_unicode_chars(self):
995-
# GH15132: non-ascii unicode characters
996-
# \u201d == RIGHT DOUBLE QUOTATION MARK
997-
998-
# simulate file handle
999-
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1000-
json = StringIO(json)
1001-
result = read_json(json, lines=True)
1002-
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1003-
columns=['a', 'b'])
1004-
assert_frame_equal(result, expected)
1005-
1006-
# simulate string
1007-
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1008-
result = read_json(json, lines=True)
1009-
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1010-
columns=['a', 'b'])
1011-
assert_frame_equal(result, expected)
1012-
1013-
def test_to_jsonl(self):
1014-
# GH9180
1015-
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1016-
result = df.to_json(orient="records", lines=True)
1017-
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
1018-
assert result == expected
1019-
1020-
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
1021-
result = df.to_json(orient="records", lines=True)
1022-
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
1023-
assert result == expected
1024-
assert_frame_equal(pd.read_json(result, lines=True), df)
1025-
1026-
# GH15096: escaped characters in columns and data
1027-
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
1028-
columns=["a\\", 'b'])
1029-
result = df.to_json(orient="records", lines=True)
1030-
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
1031-
'{"a\\\\":"foo\\"","b":"bar"}')
1032-
assert result == expected
1033-
assert_frame_equal(pd.read_json(result, lines=True), df)
1034-
1035988
def test_latin_encoding(self):
1036989
if compat.PY2:
1037990
tm.assert_raises_regex(

0 commit comments

Comments
 (0)