Skip to content

Commit 6efd743

Browse files
aterrel authored and jreback committed
ENH: Adding lines to read_json
closes #9180 closes #13356 closes #13351 Author: Andy R. Terrel <[email protected]>
1 parent ee6c0cd commit 6efd743

File tree

5 files changed

+142
-11
lines changed

5 files changed

+142
-11
lines changed

doc/source/io.rst

+23
Original file line numberDiff line numberDiff line change
@@ -1466,6 +1466,7 @@ with optional parameters:
14661466
- ``force_ascii`` : force encoded string to be ASCII, default True.
14671467
- ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
14681468
- ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
1469+
- ``lines`` : If ``records`` orient, then will write each record per line as json.
14691470

14701471
Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
14711472

@@ -1656,6 +1657,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
16561657
None. By default the timestamp precision will be detected, if this is not desired
16571658
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
16581659
seconds, milliseconds, microseconds or nanoseconds respectively.
1660+
- ``lines`` : reads file as one json object per line.
1661+
- ``encoding`` : The encoding to use to decode py3 bytes.
16591662

16601663
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
16611664

@@ -1845,6 +1848,26 @@ into a flat table.
18451848
18461849
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
18471850
1851+
.. _io.jsonl:
1852+
1853+
Line delimited json
1854+
'''''''''''''''''''
1855+
1856+
.. versionadded:: 0.19.0
1857+
1858+
pandas is able to read and write line-delimited json files that are common in data processing pipelines
1859+
using Hadoop or Spark.
1860+
1861+
.. ipython:: python
1862+
1863+
jsonl = '''
1864+
{"a":1,"b":2}
1865+
{"a":3,"b":4}
1866+
'''
1867+
df = pd.read_json(jsonl, lines=True)
1868+
df
1869+
df.to_json(orient='records', lines=True)
1870+
18481871
HTML
18491872
----
18501873

doc/source/whatsnew/v0.19.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Other enhancements
325325

326326
.. _whatsnew_0190.api:
327327

328+
328329
API changes
329330
~~~~~~~~~~~
330331

@@ -344,7 +345,7 @@ API changes
344345
- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`)
345346
- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`)
346347
- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`)
347-
348+
- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`)
348349

349350
.. _whatsnew_0190.api.tolist:
350351

pandas/core/generic.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ def __setstate__(self, state):
10171017

10181018
def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10191019
double_precision=10, force_ascii=True, date_unit='ms',
1020-
default_handler=None):
1020+
default_handler=None, lines=False):
10211021
"""
10221022
Convert the object to a JSON string.
10231023
@@ -1065,6 +1065,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10651065
Handler to call if object cannot otherwise be converted to a
10661066
suitable format for JSON. Should receive a single argument which is
10671067
the object to convert and return a serialisable object.
1068+
lines : boolean, default False
1069+
If 'orient' is 'records' write out line delimited json format. Will
1070+
throw ValueError if incorrect 'orient' since others are not list
1071+
like.
1072+
1073+
.. versionadded:: 0.19.0
1074+
10681075
10691076
Returns
10701077
-------
@@ -1077,7 +1084,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
10771084
date_format=date_format,
10781085
double_precision=double_precision,
10791086
force_ascii=force_ascii, date_unit=date_unit,
1080-
default_handler=default_handler)
1087+
default_handler=default_handler,
1088+
lines=lines)
10811089

10821090
def to_hdf(self, path_or_buf, key, **kwargs):
10831091
"""Activate the HDFStore.

pandas/io/json.py

+55-8
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,25 @@
77

88
import pandas.json as _json
99
from pandas.tslib import iNaT
10-
from pandas.compat import long, u
10+
from pandas.compat import StringIO, long, u
1111
from pandas import compat, isnull
1212
from pandas import Series, DataFrame, to_datetime
13-
from pandas.io.common import get_filepath_or_buffer
13+
from pandas.io.common import get_filepath_or_buffer, _get_handle
1414
from pandas.core.common import AbstractMethodError
1515
from pandas.formats.printing import pprint_thing
1616

1717
loads = _json.loads
1818
dumps = _json.dumps
1919

20-
# interface to/from
21-
2220

21+
# interface to/from
2322
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
2423
double_precision=10, force_ascii=True, date_unit='ms',
25-
default_handler=None):
24+
default_handler=None, lines=False):
25+
26+
if lines and orient != 'records':
27+
raise ValueError(
28+
"'lines' keyword only valid when 'orient' is records")
2629

2730
if isinstance(obj, Series):
2831
s = SeriesWriter(
@@ -37,6 +40,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
3740
else:
3841
raise NotImplementedError("'obj' should be a Series or a DataFrame")
3942

43+
if lines:
44+
s = _convert_to_line_delimits(s)
45+
4046
if isinstance(path_or_buf, compat.string_types):
4147
with open(path_or_buf, 'w') as fh:
4248
fh.write(s)
@@ -105,7 +111,8 @@ def _format_axes(self):
105111

106112
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
107113
convert_axes=True, convert_dates=True, keep_default_dates=True,
108-
numpy=False, precise_float=False, date_unit=None):
114+
numpy=False, precise_float=False, date_unit=None, encoding=None,
115+
lines=False):
109116
"""
110117
Convert a JSON string to pandas object
111118
@@ -178,13 +185,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
178185
is to try and detect the correct precision, but if this is not desired
179186
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
180187
milliseconds, microseconds or nanoseconds respectively.
188+
lines : boolean, default False
189+
Read the file as a json object per line.
190+
191+
.. versionadded:: 0.19.0
192+
193+
encoding : str, default is 'utf-8'
194+
The encoding to use to decode py3 bytes.
195+
196+
.. versionadded:: 0.19.0
181197
182198
Returns
183199
-------
184200
result : Series or DataFrame
185201
"""
186202

187-
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
203+
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
204+
encoding=encoding)
188205
if isinstance(filepath_or_buffer, compat.string_types):
189206
try:
190207
exists = os.path.exists(filepath_or_buffer)
@@ -195,7 +212,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
195212
exists = False
196213

197214
if exists:
198-
with open(filepath_or_buffer, 'r') as fh:
215+
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
199216
json = fh.read()
200217
else:
201218
json = filepath_or_buffer
@@ -204,6 +221,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
204221
else:
205222
json = filepath_or_buffer
206223

224+
if lines:
225+
# If given a json lines file, we break the string into lines, add
226+
# commas and put it in a json list to make a valid json object.
227+
lines = list(StringIO(json.strip()))
228+
json = u'[' + u','.join(lines) + u']'
229+
207230
obj = None
208231
if typ == 'frame':
209232
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
@@ -574,6 +597,30 @@ def is_ok(col):
574597
# JSON normalization routines
575598

576599

600+
def _convert_to_line_delimits(s):
601+
"""Helper function that converts json lists to line delimited json."""
602+
603+
# Determine we have a JSON list to turn to lines otherwise just return the
604+
# json object, only lists can
605+
if not s[0] == '[' and s[-1] == ']':
606+
return s
607+
s = s[1:-1]
608+
num_open_brackets_seen = 0
609+
commas_to_replace = []
610+
for idx, char in enumerate(s): # iter through to find all
611+
if char == ',': # commas that should be \n
612+
if num_open_brackets_seen == 0:
613+
commas_to_replace.append(idx)
614+
elif char == '{':
615+
num_open_brackets_seen += 1
616+
elif char == '}':
617+
num_open_brackets_seen -= 1
618+
s_arr = np.array(list(s)) # Turn to an array to set
619+
s_arr[commas_to_replace] = '\n' # all commas at once.
620+
s = ''.join(s_arr)
621+
return s
622+
623+
577624
def nested_to_record(ds, prefix="", level=0):
578625
"""a simplified json_normalize
579626

pandas/io/tests/json/test_pandas.py

+52
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,58 @@ def test_tz_range_is_utc(self):
948948
df = DataFrame({'DT': dti})
949949
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))
950950

951+
def test_read_jsonl(self):
952+
# GH9180
953+
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
954+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
955+
assert_frame_equal(result, expected)
956+
957+
def test_to_jsonl(self):
958+
# GH9180
959+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
960+
result = df.to_json(orient="records", lines=True)
961+
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
962+
self.assertEqual(result, expected)
963+
964+
    def test_latin_encoding(self):
        """Round-trip Series with non-ASCII (latin-1) data through
        to_json/read_json using the new ``encoding`` keyword."""
        if compat.PY2:
            # NOTE(review): assertRaisesRegexp is called here without a
            # callable, so it only builds a context manager and asserts
            # nothing before returning -- effectively this skips the test
            # on Python 2.  The message also looks copied from an HDF
            # test; confirm intent.
            self.assertRaisesRegexp(
                TypeError, '\[unicode\] is not implemented as a table column')
            return

        # Byte strings containing latin-1 characters (plus np.nan entries
        # to exercise null handling).
        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'a', b'b', b'c'],
                  [b'EE, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'', b'a', b'b', b'c'],
                  [b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
                  [np.nan, b'', b'b', b'c'],
                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]

        def _try_decode(x, encoding='latin-1'):
            # Decode byte strings; pass through non-bytes (e.g. np.nan),
            # which have no .decode attribute.
            try:
                return x.decode(encoding)
            except AttributeError:
                return x

        # not sure how to remove latin-1 from code in python 2 and 3
        values = [[_try_decode(x) for x in y] for y in values]

        # Build every value list as both a categorical and an object Series.
        examples = []
        for dtype in ['category', object]:
            for val in values:
                examples.append(Series(val, dtype=dtype))

        def roundtrip(s, encoding='latin-1'):
            # Write to a temp file with the given encoding, read it back,
            # and require equality (categorical dtype is allowed to decay,
            # hence check_categorical=False).
            with ensure_clean('test.json') as path:
                s.to_json(path, encoding=encoding)
                retr = read_json(path, encoding=encoding)
                assert_series_equal(s, retr, check_categorical=False)

        for s in examples:
            roundtrip(s)
1002+
9511003

9521004
if __name__ == '__main__':
9531005
import nose

0 commit comments

Comments
 (0)