Commit fc893bd

read_json support for orient="table"
1 parent 6552718 commit fc893bd

5 files changed (+226, -3 lines)

doc/source/io.rst (+2, -1)

@@ -1833,7 +1833,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 
   DataFrame
     - default is ``columns``
-    - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``}
+    - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``}
 
   The format of the JSON string
 
@@ -1846,6 +1846,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
   ``index``; dict like {index -> {column -> value}}
   ``columns``; dict like {column -> {index -> value}}
   ``values``; just the values array
+  ``table``; adhering to the JSON `Table Schema`_
 
 - ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data
 - ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True
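
With this change the Table Schema output of ``DataFrame.to_json`` can be round-tripped through ``read_json``. A minimal sketch of the documented usage (the frame and its ``idx`` index name are illustrative, mirroring the new tests below):

    import pandas as pd
    import pandas.util.testing as tm

    # An integer column with a named index; names are illustrative.
    df = pd.DataFrame({'A': [1, 2, 3, 4]},
                      index=pd.Index(range(4), name='idx'))

    out = df.to_json(orient='table')            # Table Schema writer (0.20+)
    result = pd.read_json(out, orient='table')  # reader added by this commit

    tm.assert_frame_equal(df, result)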

doc/source/whatsnew/v0.23.0.txt (+1)

@@ -145,6 +145,7 @@ Other Enhancements
 - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
   Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
 - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
+- :func:`read_json` now supports ``table`` as a value to the ``orient`` argument (:issue:`18912`)
 
 .. _whatsnew_0230.api_breaking:

pandas/io/json/json.py (+8, -2)

@@ -16,7 +16,7 @@
 from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
-from .table_schema import build_table_schema
+from .table_schema import build_table_schema, parse_table_schema
 from pandas.core.dtypes.common import is_period_dtype
 
 loads = json.loads

@@ -261,13 +261,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         * when ``typ == 'frame'``,
 
           - allowed orients are ``{'split','records','index',
-            'columns','values'}``
+            'columns','values', 'table'}``
           - default is ``'columns'``
           - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
           - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.
 
+          .. versionadded:: 0.23.0
+             'table' as an allowed value for the ``orient`` argument
+
     typ : type of object to recover (series or frame), default 'frame'
     dtype : boolean or dict, default True
         If True, infer dtypes, if a dict of column to dtype, then use those,

@@ -839,6 +842,9 @@ def _parse_no_numpy(self):
         elif orient == "index":
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None).T
+        elif orient == 'table':
+            self.obj = parse_table_schema(json,
+                                          precise_float=self.precise_float)
         else:
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None)
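
The new branch in ``_parse_no_numpy`` hands the raw JSON string to ``parse_table_schema``, which expects the payload that ``to_json(orient='table')`` produces: a top-level ``schema`` (fields, primaryKey, pandas_version) plus ``data`` as a list of records. A small sketch of that shape, using an illustrative two-row frame:

    import json  # standard-library json, only used here to inspect the payload
    import pandas as pd

    df = pd.DataFrame({'A': [1, 2]}, index=pd.Index([0, 1], name='idx'))
    payload = json.loads(df.to_json(orient='table'))

    # Roughly the structure parse_table_schema receives for this frame:
    # {'schema': {'fields': [{'name': 'idx', 'type': 'integer'},
    #                        {'name': 'A', 'type': 'integer'}],
    #             'primaryKey': ['idx'],
    #             'pandas_version': '0.20.0'},
    #  'data': [{'idx': 0, 'A': 1}, {'idx': 1, 'A': 2}]}
    print(payload['schema']['primaryKey'])   # ['idx']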

pandas/io/json/table_schema.py (+91)

@@ -3,13 +3,18 @@
 
 http://specs.frictionlessdata.io/json-table-schema/
 """
+import pandas._libs.json as json
+from pandas import DataFrame
+from pandas.api.types import CategoricalDtype
 from pandas.core.common import _all_not_none
 from pandas.core.dtypes.common import (
     is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
     is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
     is_categorical_dtype, is_period_dtype, is_string_dtype
 )
 
+loads = json.loads
+
 
 def as_json_table_type(x):
     """

@@ -103,6 +108,40 @@ def make_field(arr, dtype=None):
     return field
 
 
+def revert_field(field):
+    '''
+    Converts a JSON field descriptor into its corresponding NumPy / pandas type
+
+    Parameters
+    ----------
+    field
+        A JSON field descriptor
+
+    Returns
+    -------
+    dtype
+    '''
+    typ = field['type']
+    if typ == 'integer':
+        return 'int64'
+    elif typ == 'number':
+        return 'float64'
+    elif typ == 'boolean':
+        return 'bool'
+    elif typ == 'duration':
+        return 'timedelta64'
+    elif typ == 'datetime':
+        if field.get('tz'):
+            return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
+        else:
+            return 'datetime64[ns]'
+    elif typ == 'any':
+        if 'constraints' in field and 'ordered' in field:
+            return CategoricalDtype(categories=field['constraints']['enum'],
+                                    ordered=field['ordered'])
+    return 'object'
+
+
 def build_table_schema(data, index=True, primary_key=None, version=True):
     """
     Create a Table schema from ``data``.

@@ -180,3 +219,55 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
     if version:
         schema['pandas_version'] = '0.20.0'
     return schema
+
+
+def parse_table_schema(json, precise_float):
+    """
+    Builds a DataFrame from a given schema
+
+    Parameters
+    ----------
+    json :
+        A JSON table schema
+    precise_float : boolean
+        Flag controlling precision when decoding string to double values, as
+        dictated by ``read_json``
+
+    Returns
+    -------
+    df : DataFrame
+
+    Raises
+    ------
+    NotImplementedError
+        If the JSON table schema contains either timezone or timedelta data
+
+    See also
+    --------
+    build_table_schema : inverse function
+    pandas.read_json
+    """
+    table = loads(json, precise_float=precise_float)
+    col_order = [field['name'] for field in table['schema']['fields']]
+    df = DataFrame(table['data'])[col_order]
+
+    dtypes = {field['name']: revert_field(field)
+              for field in table['schema']['fields']}
+
+    # Cannot directly use astype with timezone data on object; raise for now
+    if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
+        raise NotImplementedError('table="orient" can not yet read timezone '
+                                  'data')
+
+    # No ISO constructor for Timedelta as of yet, so need to raise
+    if 'timedelta64' in dtypes.values():
+        raise NotImplementedError('table="orient" can not yet read '
+                                  'ISO-formatted Timedelta data')
+
+    df = df.astype(dtypes)
+
+    df = df.set_index(table['schema']['primaryKey'])
+    if all(x.startswith('level_') for x in df.index.names):
+        df.index.names = [None] * len(df.index.names)
+
+    return df
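
Two details of the new reading path are worth calling out: ``revert_field`` maps each Table Schema field descriptor back to a pandas dtype, and ``parse_table_schema`` then restores the index from ``primaryKey``, blanking auto-generated ``level_*`` names. A minimal sketch of the dtype mapping, with illustrative field descriptors in the shape ``make_field`` writes them (not taken verbatim from the commit):

    from pandas.api.types import CategoricalDtype
    from pandas.io.json.table_schema import revert_field

    # Illustrative descriptors; names and values are made up for this sketch.
    fields = [
        {'name': 'idx', 'type': 'integer'},
        {'name': 'price', 'type': 'number'},
        {'name': 'when', 'type': 'datetime'},
        {'name': 'grade', 'type': 'any',
         'constraints': {'enum': ['a', 'b', 'c']}, 'ordered': True},
    ]

    dtypes = {f['name']: revert_field(f) for f in fields}
    # {'idx': 'int64', 'price': 'float64', 'when': 'datetime64[ns]',
    #  'grade': CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)}
    assert isinstance(dtypes['grade'], CategoricalDtype)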

pandas/tests/io/json/test_json_table_schema.py (+124)

@@ -14,6 +14,7 @@
     build_table_schema,
     make_field,
     set_default_names)
+import pandas.util.testing as tm
 
 
 class TestBuildSchema(object):

@@ -471,3 +472,126 @@ def test_mi_falsey_name(self):
                                              ('a', 'b')]))
         result = [x['name'] for x in build_table_schema(df)['fields']]
         assert result == ['level_0', 'level_1', 0, 1, 2, 3]
+
+
+class TestTableOrientReader(object):
+
+    def test_integer(self):
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_object(self):
+        df = DataFrame(
+            {'B': ['a', 'b', 'c', 'c'],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_date_range(self):
+        df = DataFrame(
+            {'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_timedelta_raises(self):
+        df = DataFrame(
+            {'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'ISO-formatted Timedelta data'):
+            pd.read_json(out, orient="table")
+
+    def test_categorical(self):
+        df = DataFrame(
+            {'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("float_vals", [
+        pytest.param([1., 2., 3., 4.], marks=pytest.mark.xfail),
+        [1.1, 2.2, 3.3, 4.4]])
+    def test_float(self, float_vals):
+        df = DataFrame(
+            {'G': float_vals,
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table", convert_axes=False)
+        tm.assert_frame_equal(df, result)
+
+    def test_timezone_raises(self):
+        df = DataFrame(
+            {'H': pd.date_range('2016-01-01', freq='d', periods=4,
+                                tz='US/Central'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'timezone data'):
+            pd.read_json(out, orient="table")
+
+    def test_bool(self):
+        df = DataFrame(
+            {'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_comprehensive(self):
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             'G': [1.1, 2.2, 3.3, 4.4],
+             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+             #                    tz='US/Central'),
+             'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
+    def test_multiindex(self, index_names):
+        # GH 18912
+        df = pd.DataFrame(
+            [["Arr", "alpha", [1, 2, 3, 4]],
+             ["Bee", "Beta", [10, 20, 30, 40]]],
+            index=[["A", "B"], ["Null", "Eins"]],
+            columns=["Aussprache", "Griechisch", "Args"]
+        )
+        df.index.names = index_names
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
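
The two commented-out columns in ``test_comprehensive`` correspond to the ``NotImplementedError`` paths in ``parse_table_schema``. A minimal sketch of the timezone case outside the test suite (frame contents are illustrative):

    import pandas as pd
    import pytest

    df = pd.DataFrame(
        {'H': pd.date_range('2016-01-01', periods=2, tz='US/Central')},
        index=pd.Index(range(2), name='idx'))

    out = df.to_json(orient='table')

    # Reading tz-aware datetimes back is not yet supported by this commit,
    # so the reader raises; the tests assert the same message.
    with pytest.raises(NotImplementedError):
        pd.read_json(out, orient='table')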
