diff --git a/doc/source/io.rst b/doc/source/io.rst index da0444a8b8df9..58a3d03a9b73a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1466,6 +1466,7 @@ with optional parameters: - ``force_ascii`` : force encoded string to be ASCII, default True. - ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. - ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. +- ``lines`` : If ``records`` orient, then will write each record per line as json. Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. @@ -1656,6 +1657,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` None. By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds, microseconds or nanoseconds respectively. +- ``lines`` : reads file as one json object per line. +- ``encoding`` : The encoding to use to decode py3 bytes. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -1845,6 +1848,26 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +.. _io.jsonl: + +Line delimited json +''''''''''''''''''' + +.. versionadded:: 0.19.0 + +pandas is able to read and write line-delimited json files that are common in data processing pipelines +using Hadoop or Spark. + +.. 
ipython:: python + + jsonl = ''' + {"a":1,"b":2} + {"a":3,"b":4} + ''' + df = pd.read_json(jsonl, lines=True) + df + df.to_json(orient='records', lines=True) + HTML ---- diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..f549d7361ea5f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -254,6 +254,7 @@ Other enhancements .. _whatsnew_0190.api: + API changes ~~~~~~~~~~~ @@ -271,7 +272,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - +- ``pd.read_json`` and ``DataFrame.to_json`` have gained support for reading and writing line-delimited json with the ``lines`` option, see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`) .. _whatsnew_0190.api.tolist: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c1676fbdd7f4..cf5e99bd52993 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1016,7 +1016,7 @@ def __setstate__(self, state): def to_json(self, path_or_buf=None, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): """ Convert the object to a JSON string. @@ -1064,6 +1064,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. + lines : boolean, default False + If 'orient' is 'records' write out line delimited json format. Will + throw ValueError if incorrect 'orient' since others are not list + like. + + ..
versionadded:: 0.19.0 + Returns ------- @@ -1076,7 +1083,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler) + default_handler=default_handler, + lines=lines) def to_hdf(self, path_or_buf, key, **kwargs): """Activate the HDFStore. diff --git a/pandas/io/json.py b/pandas/io/json.py index fd97e51208f7e..5d937856ae06d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -7,22 +7,49 @@ import pandas.json as _json from pandas.tslib import iNaT -from pandas.compat import long, u +from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, _get_handle from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing loads = _json.loads dumps = _json.dumps + # interface to/from +def _convert_to_line_delimits(s): + """Helper function that converts json lists to line delimited json.""" + + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == '[' and s[-1] == ']': + return s + s = s[1:-1] + num_open_brackets_seen = 0 + commas_to_replace = [] + for idx, char in enumerate(s): # iter through to find all + if char == ',': # commas that should be \n + if num_open_brackets_seen == 0: + commas_to_replace.append(idx) + elif char == '{': + num_open_brackets_seen += 1 + elif char == '}': + num_open_brackets_seen -= 1 + s_arr = np.array(list(s)) # Turn to an array to set + s_arr[commas_to_replace] = '\n' # all commas at once. 
+ s = ''.join(s_arr) + return s def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): + + if lines and orient != 'records': + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -37,6 +64,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', else: raise NotImplementedError("'obj' should be a Series or a DataFrame") + if lines: + s = _convert_to_line_delimits(s) + if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: fh.write(s) @@ -105,7 +135,8 @@ def _format_axes(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None): + numpy=False, precise_float=False, date_unit=None, encoding=None, + lines=False): """ Convert a JSON string to pandas object @@ -178,13 +209,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds respectively. + lines : boolean, default False + Read the file as a json object per line. + + .. versionadded:: 0.19.0 + + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. + + .. 
def test_read_jsonl(self):
    # GH9180
    # Two records, one per line, with differing key order and spacing; the
    # call passes ``lines=True`` and the result is compared against a
    # two-row frame with columns 'a' and 'b'.
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)

def test_to_jsonl(self):
    # GH9180
    # With orient="records" and lines=True the expected output is one JSON
    # object per row, separated by a newline and without a trailing newline.
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
    self.assertEqual(result, expected)

def test_latin_encoding(self):
    # Round-trips Series holding latin-1 decoded text through a JSON file
    # using the ``encoding`` keyword.
    if compat.PY2:
        # NOTE(review): assertRaisesRegexp is called here without a callable
        # and is not used as a context manager, so this branch appears to
        # assert nothing and simply returns early on Python 2 -- confirm
        # the intended check.
        self.assertRaisesRegexp(
            TypeError, '\[unicode\] is not implemented as a table column')
        return

    # Rows of latin-1 encoded byte strings, mixed with empty strings and NaN.
    values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
              [b'E\xc9, 17', b'a', b'b', b'c'],
              [b'EE, 17', b'', b'a', b'b', b'c'],
              [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
              [b'', b'a', b'b', b'c'],
              [b'\xf8\xfc', b'a', b'b', b'c'],
              [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
              [np.nan, b'', b'b', b'c'],
              [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]

    def _try_decode(x, encoding='latin-1'):
        # Decode bytes to text; objects without ``decode`` (e.g. the NaN
        # entries) are passed through unchanged.
        try:
            return x.decode(encoding)
        except AttributeError:
            return x

    # not sure how to remove latin-1 from code in python 2 and 3
    values = [[_try_decode(x) for x in y] for y in values]

    # One Series per row for each dtype under test.
    examples = []
    for dtype in ['category', object]:
        for val in values:
            examples.append(Series(val, dtype=dtype))

    def roundtrip(s, encoding='latin-1'):
        # Write the Series to a temporary file and read it back.
        # NOTE(review): ``encoding`` is passed to ``to_json`` here; verify
        # that the ``to_json`` being called accepts that keyword.
        # NOTE(review): ``read_json`` is called without ``typ`` while the
        # result is compared with assert_series_equal -- confirm the
        # returned type is the intended one.
        with ensure_clean('test.json') as path:
            s.to_json(path, encoding=encoding)
            retr = read_json(path, encoding=encoding)
            assert_series_equal(s, retr, check_categorical=False)

    for s in examples:
        roundtrip(s)