Skip to content

Commit 68063d8

Browse files
committed
Make read_json with lines=True more memory-efficient
Instead of reading the whole file into memory and then manipulating it, read and parse it 10k lines at a time. This only covers some kinds of input to read_json. It is also much slower than the previous implementation.
1 parent c55dbf0 commit 68063d8

File tree

1 file changed

+42
-4
lines changed

1 file changed

+42
-4
lines changed

pandas/io/json/json.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# pylint: disable-msg=E1101,W0613,W0603
2+
from itertools import islice
3+
from pandas import concat
24
import os
35
import numpy as np
46

@@ -174,7 +176,7 @@ def write(self):
174176
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
175177
convert_axes=True, convert_dates=True, keep_default_dates=True,
176178
numpy=False, precise_float=False, date_unit=None, encoding=None,
177-
lines=False):
179+
lines=False, chunksize=None):
178180
"""
179181
Convert a JSON string to pandas object
180182
@@ -263,6 +265,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
263265
264266
.. versionadded:: 0.19.0
265267
268+
chunksize: integer, default None
269+
If `lines=True`, how many lines to read into memory at a time.
270+
If this is None, the file will be read into memory all at once.
271+
Passing a chunksize helps with memory usage, but is slower.
272+
Also note this is different from the `chunksize` parameter in
273+
`read_csv`, which returns a TextFileReader.
274+
If the JSON input is a string, this argument has no effect.
275+
266276
Returns
267277
-------
268278
result : Series or DataFrame, depending on the value of `typ`.
@@ -334,12 +344,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
334344
if exists:
335345
fh, handles = _get_handle(filepath_or_buffer, 'r',
336346
encoding=encoding)
337-
json = fh.read()
338-
fh.close()
347+
if lines and chunksize:
348+
return _read_json_as_lines(fh, chunksize)
349+
else:
350+
json = fh.read()
351+
fh.close()
339352
else:
340353
json = filepath_or_buffer
341354
elif hasattr(filepath_or_buffer, 'read'):
342-
json = filepath_or_buffer.read()
355+
if lines and chunksize:
356+
return _read_json_as_lines(fh, chunksize)
357+
else:
358+
json = filepath_or_buffer.read()
343359
else:
344360
json = filepath_or_buffer
345361

@@ -349,6 +365,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
349365
lines = list(StringIO(json.strip()))
350366
json = '[' + ','.join(lines) + ']'
351367

368+
return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
369+
370+
def _read_json_as_lines(fh, chunksize, typ='frame', orient=None, dtype=True,
                        convert_axes=True, convert_dates=True,
                        keep_default_dates=True, numpy=False,
                        precise_float=False, date_unit=None):
    """
    Parse a line-delimited JSON handle `chunksize` lines at a time.

    Each batch of lines is wrapped into a JSON array, parsed with
    `_get_obj`, and the per-batch results are concatenated into a single
    object once the handle is exhausted.

    Parameters
    ----------
    fh : iterable of str
        Open handle yielding one JSON document per line. It is always
        closed before this function returns, including on error.
    chunksize : integer
        Maximum number of lines joined and parsed in one batch.
    typ, orient, dtype, convert_axes, convert_dates, keep_default_dates,
    numpy, precise_float, date_unit
        Passed through unchanged to `_get_obj`. The defaults mirror the
        defaults of `read_json` so that existing two-argument calls keep
        working.

    Returns
    -------
    result : whatever `_get_obj` returns for `typ`
        The concatenation of all parsed batches, re-indexed from 0. For
        input with no non-blank lines, the result of parsing an empty
        JSON array.
    """
    parser_args = (orient, dtype, convert_axes, convert_dates,
                   keep_default_dates, numpy, precise_float, date_unit)
    parsed_chunks = []
    try:
        while True:
            raw_lines = list(islice(fh, chunksize))
            if not raw_lines:
                # islice yields nothing only once the handle is exhausted.
                break

            # Blank lines would turn into empty array slots ("[a,,b]"),
            # which is invalid JSON, so drop them before joining.
            records = [line.strip() for line in raw_lines if line.strip()]
            if not records:
                continue

            lines_json = '[' + ','.join(records) + ']'
            parsed_chunks.append(_get_obj(typ, lines_json, *parser_args))
    finally:
        # Close the handle even when a batch fails to parse.
        fh.close()

    if not parsed_chunks:
        # Nothing to parse: fall back to parsing an empty array rather
        # than returning an unbound/None value.
        return _get_obj(typ, '[]', *parser_args)
    if len(parsed_chunks) == 1:
        return parsed_chunks[0]

    # Concatenate once at the end instead of pairwise inside the loop
    # (which re-copies the accumulated data on every batch). Each batch is
    # parsed independently, so its labels restart at 0; ignore_index gives
    # one continuous index across batches.
    return concat(parsed_chunks, ignore_index=True)
386+
387+
def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
388+
keep_default_dates, numpy, precise_float,
389+
date_unit):
352390
obj = None
353391
if typ == 'frame':
354392
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,

0 commit comments

Comments
 (0)