Skip to content

Commit 6efd743

Browse files
aterrel authored and jreback committed
ENH: Adding lines to read_json
closes #9180 closes #13356 closes #13351 Author: Andy R. Terrel <[email protected]>
1 parent ee6c0cd commit 6efd743

File tree

5 files changed

+142
-11
lines changed

5 files changed

+142
-11
lines changed

doc/source/io.rst

+23
Original file line numberDiff line numberDiff line change
@@ -1466,6 +1466,7 @@ with optional parameters:
14661466
- ``force_ascii`` : force encoded string to be ASCII, default True.
14671467
- ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
14681468
- ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
1469+
- ``lines`` : If ``records`` orient, then will write each record per line as json.
14691470

14701471
Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
14711472

@@ -1656,6 +1657,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
16561657
None. By default the timestamp precision will be detected, if this is not desired
16571658
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
16581659
seconds, milliseconds, microseconds or nanoseconds respectively.
1660+
- ``lines`` : reads file as one json object per line.
1661+
- ``encoding`` : The encoding to use to decode py3 bytes.
16591662

16601663
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
16611664

@@ -1845,6 +1848,26 @@ into a flat table.
18451848
18461849
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
18471850
1851+
.. _io.jsonl:
1852+
1853+
Line delimited json
1854+
'''''''''''''''''''
1855+
1856+
.. versionadded:: 0.19.0
1857+
1858+
pandas is able to read and write line-delimited json files that are common in data processing pipelines
1859+
using Hadoop or Spark.
1860+
1861+
.. ipython:: python
1862+
1863+
jsonl = '''
1864+
{"a":1,"b":2}
1865+
{"a":3,"b":4}
1866+
'''
1867+
df = pd.read_json(jsonl, lines=True)
1868+
df
1869+
df.to_json(orient='records', lines=True)
1870+
18481871
HTML
18491872
----
18501873

doc/source/whatsnew/v0.19.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Other enhancements
325325

326326
.. _whatsnew_0190.api:
327327

328+
328329
API changes
329330
~~~~~~~~~~~
330331

@@ -344,7 +345,7 @@ API changes
344345
- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`)
345346
- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`)
346347
- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`)
347-
348+
- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`)
348349

349350
.. _whatsnew_0190.api.tolist:
350351

pandas/core/generic.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ def __setstate__(self, state):
10171017

10181018
def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10191019
double_precision=10, force_ascii=True, date_unit='ms',
1020-
default_handler=None):
1020+
default_handler=None, lines=False):
10211021
"""
10221022
Convert the object to a JSON string.
10231023
@@ -1065,6 +1065,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10651065
Handler to call if object cannot otherwise be converted to a
10661066
suitable format for JSON. Should receive a single argument which is
10671067
the object to convert and return a serialisable object.
1068+
lines : boolean, default False
1069+
If 'orient' is 'records' write out line delimited json format. Will
1070+
throw ValueError if incorrect 'orient' since others are not list
1071+
like.
1072+
1073+
.. versionadded:: 0.19.0
1074+
10681075
10691076
Returns
10701077
-------
@@ -1077,7 +1084,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10771084
date_format=date_format,
10781085
double_precision=double_precision,
10791086
force_ascii=force_ascii, date_unit=date_unit,
1080-
default_handler=default_handler)
1087+
default_handler=default_handler,
1088+
lines=lines)
10811089

10821090
def to_hdf(self, path_or_buf, key, **kwargs):
10831091
"""Activate the HDFStore.

pandas/io/json.py

+55-8
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,25 @@
77

88
import pandas.json as _json
99
from pandas.tslib import iNaT
10-
from pandas.compat import long, u
10+
from pandas.compat import StringIO, long, u
1111
from pandas import compat, isnull
1212
from pandas import Series, DataFrame, to_datetime
13-
from pandas.io.common import get_filepath_or_buffer
13+
from pandas.io.common import get_filepath_or_buffer, _get_handle
1414
from pandas.core.common import AbstractMethodError
1515
from pandas.formats.printing import pprint_thing
1616

1717
loads = _json.loads
1818
dumps = _json.dumps
1919

20-
# interface to/from
21-
2220

21+
# interface to/from
2322
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
2423
double_precision=10, force_ascii=True, date_unit='ms',
25-
default_handler=None):
24+
default_handler=None, lines=False):
25+
26+
if lines and orient != 'records':
27+
raise ValueError(
28+
"'lines' keyword only valid when 'orient' is records")
2629

2730
if isinstance(obj, Series):
2831
s = SeriesWriter(
@@ -37,6 +40,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
3740
else:
3841
raise NotImplementedError("'obj' should be a Series or a DataFrame")
3942

43+
if lines:
44+
s = _convert_to_line_delimits(s)
45+
4046
if isinstance(path_or_buf, compat.string_types):
4147
with open(path_or_buf, 'w') as fh:
4248
fh.write(s)
@@ -105,7 +111,8 @@ def _format_axes(self):
105111

106112
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
107113
convert_axes=True, convert_dates=True, keep_default_dates=True,
108-
numpy=False, precise_float=False, date_unit=None):
114+
numpy=False, precise_float=False, date_unit=None, encoding=None,
115+
lines=False):
109116
"""
110117
Convert a JSON string to pandas object
111118
@@ -178,13 +185,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
178185
is to try and detect the correct precision, but if this is not desired
179186
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
180187
milliseconds, microseconds or nanoseconds respectively.
188+
lines : boolean, default False
189+
Read the file as a json object per line.
190+
191+
.. versionadded:: 0.19.0
192+
193+
encoding : str, default is 'utf-8'
194+
The encoding to use to decode py3 bytes.
195+
196+
.. versionadded:: 0.19.0
181197
182198
Returns
183199
-------
184200
result : Series or DataFrame
185201
"""
186202

187-
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
203+
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
204+
encoding=encoding)
188205
if isinstance(filepath_or_buffer, compat.string_types):
189206
try:
190207
exists = os.path.exists(filepath_or_buffer)
@@ -195,7 +212,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
195212
exists = False
196213

197214
if exists:
198-
with open(filepath_or_buffer, 'r') as fh:
215+
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
199216
json = fh.read()
200217
else:
201218
json = filepath_or_buffer
@@ -204,6 +221,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
204221
else:
205222
json = filepath_or_buffer
206223

224+
if lines:
225+
# If given a json lines file, we break the string into lines, add
226+
# commas and put it in a json list to make a valid json object.
227+
lines = list(StringIO(json.strip()))
228+
json = u'[' + u','.join(lines) + u']'
229+
207230
obj = None
208231
if typ == 'frame':
209232
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
@@ -574,6 +597,30 @@ def is_ok(col):
574597
# JSON normalization routines
575598

576599

600+
def _convert_to_line_delimits(s):
601+
"""Helper function that converts json lists to line delimited json."""
602+
603+
# Determine we have a JSON list to turn to lines otherwise just return the
604+
# json object, only lists can
605+
if not s[0] == '[' and s[-1] == ']':
606+
return s
607+
s = s[1:-1]
608+
num_open_brackets_seen = 0
609+
commas_to_replace = []
610+
for idx, char in enumerate(s): # iter through to find all
611+
if char == ',': # commas that should be \n
612+
if num_open_brackets_seen == 0:
613+
commas_to_replace.append(idx)
614+
elif char == '{':
615+
num_open_brackets_seen += 1
616+
elif char == '}':
617+
num_open_brackets_seen -= 1
618+
s_arr = np.array(list(s)) # Turn to an array to set
619+
s_arr[commas_to_replace] = '\n' # all commas at once.
620+
s = ''.join(s_arr)
621+
return s
622+
623+
577624
def nested_to_record(ds, prefix="", level=0):
578625
"""a simplified json_normalize
579626

pandas/io/tests/json/test_pandas.py

+52
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,58 @@ def test_tz_range_is_utc(self):
948948
df = DataFrame({'DT': dti})
949949
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))
950950

951+
def test_read_jsonl(self):
952+
# GH9180
953+
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
954+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
955+
assert_frame_equal(result, expected)
956+
957+
def test_to_jsonl(self):
958+
# GH9180
959+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
960+
result = df.to_json(orient="records", lines=True)
961+
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
962+
self.assertEqual(result, expected)
963+
964+
    def test_latin_encoding(self):
        """Round-trip Series with non-ASCII (latin-1) data through
        to_json/read_json using the new ``encoding`` keyword."""
        if compat.PY2:
            # NOTE(review): assertRaisesRegexp is called here without a
            # callable, so it only builds a context manager and asserts
            # nothing before returning -- effectively this skips the test
            # on Python 2.  The message also looks copied from an HDF
            # test; confirm intent.
            self.assertRaisesRegexp(
                TypeError, '\[unicode\] is not implemented as a table column')
            return

        # Byte strings containing latin-1 characters (plus np.nan entries
        # to exercise null handling).
        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'a', b'b', b'c'],
                  [b'EE, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'', b'a', b'b', b'c'],
                  [b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
                  [np.nan, b'', b'b', b'c'],
                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]

        def _try_decode(x, encoding='latin-1'):
            # Decode byte strings; pass through non-bytes (e.g. np.nan),
            # which have no .decode attribute.
            try:
                return x.decode(encoding)
            except AttributeError:
                return x

        # not sure how to remove latin-1 from code in python 2 and 3
        values = [[_try_decode(x) for x in y] for y in values]

        # Build every value list as both a categorical and an object Series.
        examples = []
        for dtype in ['category', object]:
            for val in values:
                examples.append(Series(val, dtype=dtype))

        def roundtrip(s, encoding='latin-1'):
            # Write to a temp file with the given encoding, read it back,
            # and require equality (categorical dtype is allowed to decay,
            # hence check_categorical=False).
            with ensure_clean('test.json') as path:
                s.to_json(path, encoding=encoding)
                retr = read_json(path, encoding=encoding)
                assert_series_equal(s, retr, check_categorical=False)

        for s in examples:
            roundtrip(s)
1002+
9511003

9521004
if __name__ == '__main__':
9531005
import nose

0 commit comments

Comments
 (0)