read_json support for orient="table" (#19039)

WillAyd · jreback · commit e3251da38b62 · 2018-01-06T12:33:06.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1648,7 +1648,7 @@ with optional parameters:
 
   DataFrame
       - default is ``columns``
-      - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``}
+      - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``}
 
   The format of the JSON string
 
@@ -1732,6 +1732,9 @@ values, index and columns. Name is also included for ``Series``:
   dfjo.to_json(orient="split")
   sjo.to_json(orient="split")
 
+**Table oriented** serializes to the JSON `Table Schema`_, allowing for the
+preservation of metadata including but not limited to dtypes and index names.
+
 .. note::
 
   Any orient option that encodes to a JSON object will not preserve the ordering of
@@ -1833,7 +1836,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 
   DataFrame
       - default is ``columns``
-      - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``}
+      - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``}
 
   The format of the JSON string
 
@@ -1846,6 +1849,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
      ``index``; dict like {index -> {column -> value}}
      ``columns``; dict like {column -> {index -> value}}
      ``values``; just the values array
+     ``table``; adhering to the JSON `Table Schema`_
+
 
 - ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data
 - ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True
@@ -2202,7 +2207,39 @@ A few notes on the generated table schema:
     then ``level_<i>`` is used.
 
 
-_Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+.. versionadded:: 0.23.0
+
+``read_json`` also accepts ``orient='table'`` as an argument. This allows for
+the preservation of metadata such as dtypes and index names in a
+round-trippable manner.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'foo': [1, 2, 3, 4],
+		      'bar': ['a', 'b', 'c', 'd'],
+		      'baz': pd.date_range('2018-01-01', freq='d', periods=4),
+		      'qux': pd.Categorical(['a', 'b', 'c', 'c'])
+		      }, index=pd.Index(range(4), name='idx'))
+   df
+   df.dtypes
+
+   df.to_json('test.json', orient='table')
+   new_df = pd.read_json('test.json', orient='table')
+   new_df
+   new_df.dtypes
+
+Please note that the literal string ``index`` is not supported as an index
+name with the round trip format, as ``to_json`` writes it by default to
+indicate a missing index name.
+
+.. ipython:: python
+
+   df.index.name = 'index'
+   df.to_json('test.json', orient='table')
+   new_df = pd.read_json('test.json', orient='table')
+   print(new_df.index.name)
+
+.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
 
 HTML
 ----
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -145,6 +145,37 @@ Current Behavior
 
     s.rank(na_option='top')
 
+.. _whatsnew_0230.enhancements.round-trippable_json:
+
+JSON read/write round-trippable with ``orient='table'``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'foo': [1, 2, 3, 4],
+		      'bar': ['a', 'b', 'c', 'd'],
+		      'baz': pd.date_range('2018-01-01', freq='d', periods=4),
+		      'qux': pd.Categorical(['a', 'b', 'c', 'c'])
+		      }, index=pd.Index(range(4), name='idx'))
+   df
+   df.dtypes
+   df.to_json('test.json', orient='table')
+   new_df = pd.read_json('test.json', orient='table')
+   new_df
+   new_df.dtypes
+
+Please note that the literal string ``index`` is not supported as an index name with the round trip format, as ``to_json`` writes it by default to indicate a missing index name.
+
+.. ipython:: python
+
+   df.index.name = 'index'
+   df.to_json('test.json', orient='table')
+   new_df = pd.read_json('test.json', orient='table')
+   new_df
+   print(new_df.index.name)
+
 .. _whatsnew_0230.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -16,7 +16,7 @@
 from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
-from .table_schema import build_table_schema
+from .table_schema import build_table_schema, parse_table_schema
 from pandas.core.dtypes.common import is_period_dtype
 
 loads = json.loads
@@ -261,13 +261,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         * when ``typ == 'frame'``,
 
           - allowed orients are ``{'split','records','index',
-            'columns','values'}``
+            'columns','values', 'table'}``
           - default is ``'columns'``
           - The DataFrame index must be unique for orients ``'index'`` and
             ``'columns'``.
           - The DataFrame columns must be unique for orients ``'index'``,
             ``'columns'``, and ``'records'``.
 
+        .. versionadded:: 0.23.0
+           'table' as an allowed value for the ``orient`` argument
+
     typ : type of object to recover (series or frame), default 'frame'
     dtype : boolean or dict, default True
         If True, infer dtypes, if a dict of column to dtype, then use those,
@@ -336,6 +339,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     -------
     result : Series or DataFrame, depending on the value of `typ`.
 
+    Notes
+    -----
+    Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index``
+    name of `index` gets written with ``write_json``, the subsequent read
+    operation will incorrectly set the ``Index`` name to ``None``. This is
+    because `index` is also used by ``write_json`` to denote a missing
+    ``Index`` name, and the subsequent ``read_json`` operation cannot
+    distinguish between the two.
+
     See Also
     --------
     DataFrame.to_json
@@ -839,6 +851,9 @@ def _parse_no_numpy(self):
         elif orient == "index":
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None).T
+        elif orient == 'table':
+            self.obj = parse_table_schema(json,
+                                          precise_float=self.precise_float)
         else:
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None)
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
@@ -3,13 +3,18 @@
 
 http://specs.frictionlessdata.io/json-table-schema/
 """
+import pandas._libs.json as json
+from pandas import DataFrame
+from pandas.api.types import CategoricalDtype
 from pandas.core.common import _all_not_none
 from pandas.core.dtypes.common import (
     is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
     is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
     is_categorical_dtype, is_period_dtype, is_string_dtype
 )
 
+loads = json.loads
+
 
 def as_json_table_type(x):
     """
@@ -75,7 +80,7 @@ def set_default_names(data):
     return data
 
 
-def make_field(arr, dtype=None):
+def convert_pandas_type_to_json_field(arr, dtype=None):
     dtype = dtype or arr.dtype
     if arr.name is None:
         name = 'values'
@@ -103,6 +108,69 @@ def make_field(arr, dtype=None):
     return field
 
 
+def convert_json_field_to_pandas_type(field):
+    """
+    Map a Table Schema field descriptor onto a NumPy / pandas dtype.
+
+    Parameters
+    ----------
+    field : dict
+        A JSON field descriptor; its ``type`` entry selects the dtype
+
+    Returns
+    -------
+    dtype : str or CategoricalDtype
+
+    Raises
+    ------
+    ValueError
+        If the type of the provided field is unknown or currently unsupported
+
+    Examples
+    --------
+    >>> convert_json_field_to_pandas_type({'name': 'an_int',
+    ...                                    'type': 'integer'})
+    'int64'
+    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
+    ...                                    'type': 'any',
+    ...                                    'constraints': {'enum': [
+    ...                                                    'a', 'b', 'c']},
+    ...                                    'ordered': True})
+    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
+    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
+    ...                                    'type': 'datetime'})
+    'datetime64[ns]'
+    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
+    ...                                    'type': 'datetime',
+    ...                                    'tz': 'US/Central'})
+    'datetime64[ns, US/Central]'
+    """
+    # Field types whose dtype depends on nothing but the type name
+    plain_dtypes = (('string', 'object'), ('integer', 'int64'),
+                    ('number', 'float64'), ('boolean', 'bool'),
+                    ('duration', 'timedelta64'))
+
+    kind = field['type']
+    for json_type, pandas_dtype in plain_dtypes:
+        if kind == json_type:
+            return pandas_dtype
+
+    if kind == 'datetime':
+        tz = field.get('tz')
+        if not tz:
+            return 'datetime64[ns]'
+        return 'datetime64[ns, {tz}]'.format(tz=tz)
+
+    if kind == 'any':
+        # Both an enum constraint and an order flag mark a categorical
+        if 'constraints' not in field or 'ordered' not in field:
+            return 'object'
+        return CategoricalDtype(categories=field['constraints']['enum'],
+                                ordered=field['ordered'])
+
+    raise ValueError("Unsupported or invalid field type: {}".format(kind))
+
+
 def build_table_schema(data, index=True, primary_key=None, version=True):
     """
     Create a Table schema from ``data``.
@@ -158,15 +226,15 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
     if index:
         if data.index.nlevels > 1:
             for level in data.index.levels:
-                fields.append(make_field(level))
+                fields.append(convert_pandas_type_to_json_field(level))
         else:
-            fields.append(make_field(data.index))
+            fields.append(convert_pandas_type_to_json_field(data.index))
 
     if data.ndim > 1:
         for column, s in data.iteritems():
-            fields.append(make_field(s))
+            fields.append(convert_pandas_type_to_json_field(s))
     else:
-        fields.append(make_field(data))
+        fields.append(convert_pandas_type_to_json_field(data))
 
     schema['fields'] = fields
     if index and data.index.is_unique and primary_key is None:
@@ -180,3 +248,65 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
     if version:
         schema['pandas_version'] = '0.20.0'
     return schema
+
+
+def parse_table_schema(json, precise_float):
+    """
+    Build a DataFrame from a JSON string written with ``orient='table'``.
+
+    Parameters
+    ----------
+    json : str
+        JSON document with a Table Schema (``schema``) and records (``data``)
+    precise_float : boolean
+        Flag controlling precision when decoding string to double values, as
+        dictated by ``read_json``
+
+    Returns
+    -------
+    df : DataFrame
+
+    Raises
+    ------
+    NotImplementedError
+        If the JSON table schema contains either timezone or timedelta data
+
+    Notes
+    -----
+    ``to_json`` writes the string ``index`` for a name-less ``Index`` and
+    ``level_<i>`` for name-less ``MultiIndex`` levels, so those names are read
+    back as ``None``; using them intentionally is not supported.
+
+    See Also
+    --------
+    build_table_schema : inverse function
+    pandas.read_json
+    """
+    table = loads(json, precise_float=precise_float)
+    col_order = [field['name'] for field in table['schema']['fields']]
+    # Passing ``columns`` keeps the schema's columns when ``data`` is empty
+    df = DataFrame(table['data'], columns=col_order)[col_order]
+
+    dtypes = {field['name']: convert_json_field_to_pandas_type(field)
+              for field in table['schema']['fields']}
+
+    # Cannot directly use as_type with timezone data on object; raise for now
+    if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
+        raise NotImplementedError('orient="table" can not yet read timezone '
+                                  'data')
+
+    # No ISO constructor for Timedelta as of yet, so need to raise
+    if 'timedelta64' in dtypes.values():
+        raise NotImplementedError('orient="table" can not yet read '
+                                  'ISO-formatted Timedelta data')
+
+    df = df.astype(dtypes)
+    # No ``primaryKey`` in the schema means there is no index to restore
+    if 'primaryKey' in table['schema']:
+        df = df.set_index(table['schema']['primaryKey'])
+        if len(df.index.names) == 1 and df.index.name == 'index':
+            df.index.name = None
+        elif len(df.index.names) > 1:
+            df.index.names = [None if x.startswith('level_') else x
+                              for x in df.index.names]
+    return df
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py