read_json support for orient="table"

WillAyd · WillAyd · commit 4319335aadd7 · 2018-01-02T13:02:21.000-08:00
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -16,7 +16,7 @@
 from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
-from .table_schema import build_table_schema
+from .table_schema import build_table_schema, parse_table_schema
 from pandas.core.dtypes.common import is_period_dtype
 
 loads = json.loads
@@ -839,6 +839,9 @@ def _parse_no_numpy(self):
         elif orient == "index":
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None).T
+        elif orient == 'table':
+            self.obj = parse_table_schema(json,
+                                          precise_float=self.precise_float)
         else:
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None)
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
@@ -3,13 +3,20 @@
 
 http://specs.frictionlessdata.io/json-table-schema/
 """
+from collections import OrderedDict
+
+import pandas._libs.json as json
+from pandas import DataFrame
+from pandas.api.types import CategoricalDtype
 from pandas.core.common import _all_not_none
 from pandas.core.dtypes.common import (
     is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
     is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
     is_categorical_dtype, is_period_dtype, is_string_dtype
 )
 
+# Decoder taken from pandas._libs.json (imported above as ``json``); it is
+# called in this module with the ``precise_float`` keyword.
+loads = json.loads
+
 
 def as_json_table_type(x):
     """
@@ -103,6 +110,28 @@ def make_field(arr, dtype=None):
     return field
 
 
+def revert_field(field):
+    """
+    Translate a Table Schema field descriptor into a pandas dtype.
+
+    Parameters
+    ----------
+    field : dict
+        One entry of a schema's ``fields`` list. The ``'type'`` key is
+        required; ``'tz'``, ``'constraints'`` and ``'ordered'`` are only
+        consulted for the types that use them.
+
+    Returns
+    -------
+    str or CategoricalDtype
+        A dtype string for integer, number, boolean, duration and datetime
+        fields; a ``CategoricalDtype`` for an ``'any'`` field carrying both
+        ``'constraints'`` and ``'ordered'``; ``'object'`` for anything else.
+
+    Raises
+    ------
+    KeyError
+        If ``field`` has no ``'type'`` key, or a categorical field's
+        ``'constraints'`` has no ``'enum'`` key.
+    """
+    kind = field['type']
+
+    # Types that map to a fixed dtype string whatever else the field holds.
+    fixed_dtypes = (('integer', 'int64'),
+                    ('number', 'float64'),
+                    ('boolean', 'bool'),
+                    ('duration', 'timedelta64'))
+    for schema_type, dtype in fixed_dtypes:
+        if kind == schema_type:
+            return dtype
+
+    if kind == 'datetime':
+        tz = field.get('tz')
+        if not tz:
+            return 'datetime64[ns]'
+        return 'datetime64[ns, {tz}]'.format(tz=tz)
+
+    # An 'any' field is only read back as a categorical when both the
+    # 'constraints' entry (holding the categories) and 'ordered' exist.
+    is_categorical = (kind == 'any' and
+                      'constraints' in field and
+                      'ordered' in field)
+    if is_categorical:
+        return CategoricalDtype(categories=field['constraints']['enum'],
+                                ordered=field['ordered'])
+
+    return 'object'
+
+
 def build_table_schema(data, index=True, primary_key=None, version=True):
     """
     Create a Table schema from ``data``.
@@ -180,3 +209,33 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
     if version:
         schema['pandas_version'] = '0.20.0'
     return schema
+
+
+def parse_table_schema(json, precise_float):
+    """
+    Build a DataFrame from JSON written with ``orient="table"``.
+
+    Parameters
+    ----------
+    json : str
+        JSON text holding a ``'schema'`` entry (with a ``'fields'`` list
+        and, optionally, a ``'primaryKey'``) and a ``'data'`` list of
+        records.
+    precise_float
+        Forwarded to the JSON decoder as its ``precise_float`` keyword.
+
+    Returns
+    -------
+    DataFrame
+        The records cast to the dtypes described by the schema fields. When
+        the schema has a ``'primaryKey'`` those columns become the index;
+        otherwise the default index is kept.
+
+    Raises
+    ------
+    NotImplementedError
+        If the schema describes timezone-aware datetime fields or
+        ``'duration'`` (timedelta) fields, which can not be read back yet.
+    """
+    table = loads(json, precise_float=precise_float)
+    schema = table['schema']
+
+    records = [OrderedDict(row) for row in table['data']]
+    df = DataFrame(records)
+
+    dtypes = {field['name']: revert_field(field)
+              for field in schema['fields']}
+
+    # Cannot directly use astype with timezone data on object; raise for now
+    if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
+        raise NotImplementedError('orient="table" can not yet read timezone '
+                                  'data')
+
+    # No ISO constructor for Timedelta as of yet, so need to raise
+    if 'timedelta64' in dtypes.values():
+        raise NotImplementedError('orient="table" can not yet read '
+                                  'ISO-formatted Timedelta data')
+
+    df = df.astype(dtypes)
+
+    # ``primaryKey`` is optional in a Table Schema; without it there is no
+    # index to restore, so the frame keeps its default index.
+    if 'primaryKey' in schema:
+        df = df.set_index(schema['primaryKey'])
+        # Index names that all look like 'level_N' are treated as
+        # placeholders for unnamed levels and are cleared again.
+        if all(x.startswith('level_') for x in df.index.names):
+            df.index.names = [None] * len(df.index.names)
+
+    return df
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
@@ -14,6 +14,7 @@
     build_table_schema,
     make_field,
     set_default_names)
+import pandas.util.testing as tm
 
 
 class TestBuildSchema(object):
@@ -471,3 +472,126 @@ def test_mi_falsey_name(self):
                                                             ('a', 'b')]))
         result = [x['name'] for x in build_table_schema(df)['fields']]
         assert result == ['level_0', 'level_1', 0, 1, 2, 3]
+
+
+class TestTableOrientReader(object):
+    """
+    Round-trip tests for ``read_json(..., orient="table")``.
+
+    Each test serialises a frame with ``to_json(orient="table")`` and reads
+    the output back, then either compares the result with the original
+    frame or checks that the expected ``NotImplementedError`` is raised.
+    """
+
+    def test_integer(self):
+        """Round-trip a single integer column with a named index."""
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_object(self):
+        """Round-trip a single column of strings."""
+        df = DataFrame(
+            {'B': ['a', 'b', 'c', 'c'],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_date_range(self):
+        """Round-trip a single column built from a daily ``date_range``."""
+        df = DataFrame(
+            {'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_timedelta_raises(self):
+        """Reading back a timedelta column raises NotImplementedError."""
+        df = DataFrame(
+            {'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'ISO-formatted Timedelta data'):
+            pd.read_json(out, orient="table")
+
+    def test_categorical(self):
+        """Round-trip an unordered and an ordered categorical column."""
+        df = DataFrame(
+            {'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    # NOTE(review): the whole-number float case is marked xfail, and this is
+    # the only test that passes convert_axes=False; the reasons are not
+    # visible here -- confirm before changing either.
+    @pytest.mark.parametrize("float_vals", [
+        pytest.param([1., 2., 3., 4.], marks=pytest.mark.xfail),
+        [1.1, 2.2, 3.3, 4.4]])
+    def test_float(self, float_vals):
+        """Round-trip a single float column for each parametrized input."""
+        df = DataFrame(
+            {'G': float_vals,
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table", convert_axes=False)
+        tm.assert_frame_equal(df, result)
+
+    def test_timezone_raises(self):
+        """Reading back a tz-aware datetime column raises NotImplementedError."""
+        df = DataFrame(
+            {'H': pd.date_range('2016-01-01', freq='d', periods=4,
+                                tz='US/Central'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'timezone data'):
+            pd.read_json(out, orient="table")
+
+    def test_bool(self):
+        """Round-trip a single boolean column."""
+        df = DataFrame(
+            {'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_comprehensive(self):
+        """Round-trip one frame combining the column types tested above."""
+        # 'D' (timedelta) and 'H' (tz-aware datetime) stay commented out:
+        # reading them raises NotImplementedError, as asserted by
+        # test_timedelta_raises and test_timezone_raises.
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             'G': [1.1, 2.2, 3.3, 4.4],
+             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+             #                   tz='US/Central'),
+             'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
+    def test_multiindex(self, index_names):
+        """Round-trip a two-level index, once unnamed and once named."""
+        # GH 18912
+        df = pd.DataFrame(
+            [["Arr", "alpha", [1, 2, 3, 4]],
+             ["Bee", "Beta", [10, 20, 30, 40]]],
+            index=[["A", "B"], ["Null", "Eins"]],
+            columns=["Aussprache", "Griechisch", "Args"]
+        )
+        df.index.names = index_names
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)