-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Adding json line parsing to pd.read_json #9180 #13351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
47c60a6
a8dd0ef
f71d011
fc865c4
e8f10ea
3c796a9
6861a71
c76dafe
b20798a
f547b0d
ae19f04
ac7b687
f7c3bbf
37252c6
e635318
32a2f8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1016,7 +1016,7 @@ def __setstate__(self, state): | |
|
||
def to_json(self, path_or_buf=None, orient=None, date_format='epoch', | ||
double_precision=10, force_ascii=True, date_unit='ms', | ||
default_handler=None): | ||
default_handler=None, lines=False): | ||
""" | ||
Convert the object to a JSON string. | ||
|
||
|
@@ -1064,6 +1064,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', | |
Handler to call if object cannot otherwise be converted to a | ||
suitable format for JSON. Should receive a single argument which is | ||
the object to convert and return a serialisable object. | ||
lines : boolean, defalut False | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. defalut -> default |
||
If 'orient' is 'records' write out line delimited json format. Will | ||
throw ValueError if incorrect 'orient' since others are not list | ||
like. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
|
||
Returns | ||
------- | ||
|
@@ -1076,7 +1083,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', | |
date_format=date_format, | ||
double_precision=double_precision, | ||
force_ascii=force_ascii, date_unit=date_unit, | ||
default_handler=default_handler) | ||
default_handler=default_handler, | ||
lines=lines) | ||
|
||
def to_hdf(self, path_or_buf, key, **kwargs): | ||
"""Activate the HDFStore. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,22 +7,49 @@ | |
|
||
import pandas.json as _json | ||
from pandas.tslib import iNaT | ||
from pandas.compat import long, u | ||
from pandas.compat import StringIO, long, u | ||
from pandas import compat, isnull | ||
from pandas import Series, DataFrame, to_datetime | ||
from pandas.io.common import get_filepath_or_buffer | ||
from pandas.io.common import get_filepath_or_buffer, _get_handle | ||
from pandas.core.common import AbstractMethodError | ||
from pandas.formats.printing import pprint_thing | ||
|
||
loads = _json.loads | ||
dumps = _json.dumps | ||
|
||
|
||
# interface to/from | ||
def _convert_to_line_delimits(s): | ||
"""Helper function that converts json lists to line delimited json.""" | ||
|
||
# Determine we have a JSON list to turn to lines otherwise just return the | ||
# json object, only lists can | ||
if not s[0] == '[' and s[-1] == ']': | ||
return s | ||
s = s[1:-1] | ||
num_open_brackets_seen = 0 | ||
commas_to_replace = [] | ||
for idx, char in enumerate(s): # iter through to find all | ||
if char == ',': # commas that should be \n | ||
if num_open_brackets_seen == 0: | ||
commas_to_replace.append(idx) | ||
elif char == '{': | ||
num_open_brackets_seen += 1 | ||
elif char == '}': | ||
num_open_brackets_seen -= 1 | ||
s_arr = np.array(list(s)) # Turn to an array to set | ||
s_arr[commas_to_replace] = '\n' # all commas at once. | ||
s = ''.join(s_arr) | ||
return s | ||
|
||
|
||
def to_json(path_or_buf, obj, orient=None, date_format='epoch', | ||
double_precision=10, force_ascii=True, date_unit='ms', | ||
default_handler=None): | ||
default_handler=None, lines=False): | ||
|
||
if lines and orient != 'records': | ||
raise ValueError( | ||
"'lines' keyword only valid when 'orient' is records") | ||
|
||
if isinstance(obj, Series): | ||
s = SeriesWriter( | ||
|
@@ -37,6 +64,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', | |
else: | ||
raise NotImplementedError("'obj' should be a Series or a DataFrame") | ||
|
||
if lines: | ||
s = _convert_to_line_delimits(s) | ||
|
||
if isinstance(path_or_buf, compat.string_types): | ||
with open(path_or_buf, 'w') as fh: | ||
fh.write(s) | ||
|
@@ -105,7 +135,8 @@ def _format_axes(self): | |
|
||
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | ||
convert_axes=True, convert_dates=True, keep_default_dates=True, | ||
numpy=False, precise_float=False, date_unit=None): | ||
numpy=False, precise_float=False, date_unit=None, encoding=None, | ||
lines=False): | ||
""" | ||
Convert a JSON string to pandas object | ||
|
||
|
@@ -178,13 +209,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | |
is to try and detect the correct precision, but if this is not desired | ||
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, | ||
milliseconds, microseconds or nanoseconds respectively. | ||
lines : boolean, default False | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. doesn't this also need records format? |
||
Read the file as a json object per line. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add a versionadded 0.18.2 |
||
|
||
.. versionadded:: 0.19.0 | ||
|
||
encoding : str, default is 'utf-8' | ||
The encoding to use to decode py3 bytes. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
Returns | ||
------- | ||
result : Series or DataFrame | ||
""" | ||
|
||
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf) | ||
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, | ||
encoding=encoding) | ||
if isinstance(filepath_or_buffer, compat.string_types): | ||
try: | ||
exists = os.path.exists(filepath_or_buffer) | ||
|
@@ -195,7 +236,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | |
exists = False | ||
|
||
if exists: | ||
with open(filepath_or_buffer, 'r') as fh: | ||
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh: | ||
json = fh.read() | ||
else: | ||
json = filepath_or_buffer | ||
|
@@ -204,6 +245,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | |
else: | ||
json = filepath_or_buffer | ||
|
||
if lines: | ||
# If given a json lines file, we break the string into lines, add | ||
# commas and put it in a json list to make a valid json object. | ||
lines = list(StringIO(json.strip())) | ||
json = u'[' + u','.join(lines) + u']' | ||
|
||
obj = None | ||
if typ == 'frame': | ||
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -948,6 +948,58 @@ def test_tz_range_is_utc(self): | |
df = DataFrame({'DT': dti}) | ||
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) | ||
|
||
def test_read_jsonl(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you add some tests that assert ValueError if invalid combination of lines=True and orient? |
||
# GH9180 | ||
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) | ||
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_to_jsonl(self): | ||
# GH9180 | ||
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
result = df.to_json(orient="records", lines=True) | ||
expected = '{"a":1,"b":2}\n{"a":1,"b":2}' | ||
self.assertEqual(result, expected) | ||
|
||
def test_latin_encoding(self): | ||
if compat.PY2: | ||
self.assertRaisesRegexp( | ||
TypeError, '\[unicode\] is not implemented as a table column') | ||
return | ||
|
||
values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], | ||
[b'E\xc9, 17', b'a', b'b', b'c'], | ||
[b'EE, 17', b'', b'a', b'b', b'c'], | ||
[b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], | ||
[b'', b'a', b'b', b'c'], | ||
[b'\xf8\xfc', b'a', b'b', b'c'], | ||
[b'A\xf8\xfc', b'', b'a', b'b', b'c'], | ||
[np.nan, b'', b'b', b'c'], | ||
[b'A\xf8\xfc', np.nan, b'', b'b', b'c']] | ||
|
||
def _try_decode(x, encoding='latin-1'): | ||
try: | ||
return x.decode(encoding) | ||
except AttributeError: | ||
return x | ||
|
||
# not sure how to remove latin-1 from code in python 2 and 3 | ||
values = [[_try_decode(x) for x in y] for y in values] | ||
|
||
examples = [] | ||
for dtype in ['category', object]: | ||
for val in values: | ||
examples.append(Series(val, dtype=dtype)) | ||
|
||
def roundtrip(s, encoding='latin-1'): | ||
with ensure_clean('test.json') as path: | ||
s.to_json(path, encoding=encoding) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am confused because it is already used here (encoding keyword), while I don't see it in the docstring/signature of to_json. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. that is a good point! |
||
retr = read_json(path, encoding=encoding) | ||
assert_series_equal(s, retr, check_categorical=False) | ||
|
||
for s in examples: | ||
roundtrip(s) | ||
|
||
|
||
if __name__ == '__main__': | ||
import nose | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose writing should have encoding as well........?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nah, encodings just confuse people =P