ENH: Added to_json_schema

TomAugspurger · TomAugspurger · commit 68c1bd3167e2 · 2017-02-01T20:26:15.000-06:00
Lays the groundwork for pandas-dev#14386 This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup. DOC: More notes in prose docs Move files use isoformat updates Moved to to_json
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -63,6 +63,11 @@ JSON
 
 .. currentmodule:: pandas
 
+.. autosummary::
+   :toctree: generated/
+
+   to_json_schema
+
 HTML
 ~~~~
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2033,6 +2033,46 @@ using Hadoop or Spark.
   df
   df.to_json(orient='records', lines=True)
 
+
+JSON Table Schema
+-----------------
+
+`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
+object. The JSON includes information on the field names, types, and
+other attributes. The :func:`pd.to_json_schema` function will build a
+JSON Table Schema compatible dict, which can be easily serialized.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+
+   pd.to_json_schema(df)
+
+The full list of supported types is described in the JSON Table Schema
+spec. This table shows the mapping from pandas types:
+
+===============  ======================
+Pandas type      JSON Table Schema type
+===============  ======================
+int64            integer
+float64          number
+bool             boolean
+datetime64[ns]   date
+timedelta64[ns]  duration
+===============  ======================
+
+By default, the ``primary_key`` attribute is set to the index name
+(or the list of level names for a MultiIndex) when the index is unique.
+This behavior can be overridden with the ``index`` and ``primary_key``
+arguments.
+
+.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 HTML
 ----
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -114,6 +114,26 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
+
+JSON Table Schema Output
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The new top-level function :func:`pd.to_json_schema` will generate
+a `JSON Table Schema`_ compatible dict describing the DataFrame.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+
+   pd.to_json_schema(df)
+
+.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -2,6 +2,7 @@
 
 import os
 import copy
+import json
 from collections import defaultdict
 import numpy as np
 
@@ -11,9 +12,13 @@
 from pandas import compat, isnull
 from pandas import Series, DataFrame, to_datetime
 from pandas.io.common import get_filepath_or_buffer, _get_handle
+from pandas.core import config
 from pandas.core.common import AbstractMethodError
 from pandas.formats.printing import pprint_thing
-
+from pandas.types.common import (
+    is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
+    is_bool_dtype, is_datetime64_dtype
+)
 loads = _json.loads
 dumps = _json.dumps
 
@@ -61,6 +66,22 @@ def __init__(self, obj, orient, date_format, double_precision,
         if orient is None:
             orient = self._default_orient
 
+        self.is_jsontable_schema = orient == 'jsontable_schema'
+        if self.is_jsontable_schema:
+            self.schema = to_json_schema(obj)
+
+            # XXX: Do this timedelta properly in to_json
+            sample = obj.head(
+                config.get_option('display.max_rows')).reset_index()
+            timedeltas = sample.select_dtypes(include=['timedelta']).columns
+            sample[timedeltas] = sample[timedeltas].applymap(isoformat)
+            self.obj = sample
+            date_format = 'iso'  # ignoring user input, but epoch not allowed
+            orient = 'records'
+
+        else:
+            self.schema = None
+
         self.orient = orient
         self.date_format = date_format
         self.double_precision = double_precision
@@ -75,14 +96,19 @@ def _format_axes(self):
         raise AbstractMethodError(self)
 
     def write(self):
-        return dumps(
+        serialized = dumps(
             self.obj,
             orient=self.orient,
             double_precision=self.double_precision,
             ensure_ascii=self.ensure_ascii,
             date_unit=self.date_unit,
             iso_dates=self.date_format == 'iso',
-            default_handler=self.default_handler)
+            default_handler=self.default_handler
+        )
+        if self.is_jsontable_schema:
+            serialized = '{{"schema": {}, "data": {}}}'.format(
+                json.dumps(self.schema), serialized)
+        return serialized
 
 
 class SeriesWriter(Writer):
@@ -884,3 +910,135 @@ def _recursive_extract(data, path, seen_meta, level=0):
         result[k] = np.array(v).repeat(lengths)
 
     return result
+
+
+# ---------------------------------------------------------------------
+# JSON-Table Schema routines
+# http://specs.frictionlessdata.io/json-table-schema/
+
+
+def as_jsontable_type(x):
+    """
+    Convert a NumPy / pandas dtype to its JSON Table Schema type name.
+
+    ===============  ======================
+    Pandas type      JSON Table Schema type
+    ===============  ======================
+    int64            integer
+    float64          number
+    bool             boolean
+    datetime64[ns]   date
+    timedelta64[ns]  duration
+    ===============  ======================
+    """
+    if is_integer_dtype(x):
+        return 'integer'
+    elif is_bool_dtype(x):
+        return 'boolean'
+    elif is_numeric_dtype(x):  # integer and bool dtypes already returned above
+        return 'number'
+    elif is_datetime64_dtype(x):
+        return 'date'
+    elif is_timedelta64_dtype(x):
+        return 'duration'
+    elif is_string_dtype(x):
+        return 'string'
+    else:
+        return 'any'  # fallback when none of the checks above matched
+
+
+def _set_default_names(data):
+    """Name unnamed index levels 'index' (flat) or 'level_<i>' (Multi)."""
+    if all(name is not None for name in data.index.names):
+        return data  # every level is already named; input returned as-is
+
+    data = data.copy()  # names are assigned on a copy, not on the input
+    if data.index.nlevels > 1:
+        names = [name if name is not None else 'level_{}'.format(i)
+                 for i, name in enumerate(data.index.names)]
+        data.index.names = names
+    else:
+        data.index.name = 'index'
+    return data
+
+
+def to_json_schema(data, index=True, primary_key=None):
+    """
+    Create a JSON Table schema from ``data``.
+
+    Parameters
+    ----------
+    data : Series, DataFrame
+    index : bool
+        Whether to include ``data.index`` in the schema.
+    primary_key : str, list of str, or None
+        Column name(s) to designate as the primary key; stored as given.
+        The default `None` will set `'primary_key'` to the index
+        level name or names if ``index`` is truthy and the index is unique.
+
+    Returns
+    -------
+    schema : dict
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {'A': [1, 2, 3],
+    ...      'B': ['a', 'b', 'c'],
+    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
+    ...     }, index=pd.Index(range(3), name='idx'))
+    >>> pd.to_json_schema(df)
+    {'fields': [{'name': 'idx', 'type': 'integer'},
+                {'name': 'A', 'type': 'integer'},
+                {'name': 'B', 'type': 'string'},
+                {'name': 'C', 'type': 'date'}],
+     'primary_key': 'idx'}
+
+    Notes
+    -----
+    See `as_jsontable_type` for conversion types.
+    Timedeltas are converted to ISO8601 duration format with
+    9 decimal places after the seconds field for nanosecond precision.
+    """
+    if index is True:  # only the literal True fills in default index names
+        data = _set_default_names(data)
+
+    schema = {}
+    fields = []
+
+    if index:
+        if data.index.nlevels > 1:
+            for level in data.index.levels:
+                fields.append({'name': level.name,
+                               'type': as_jsontable_type(level.dtype)})
+        else:
+            fields.append({'name': data.index.name,
+                           'type': as_jsontable_type(data.index.dtype)})
+
+    if data.ndim > 1:
+        for column, type_ in data.dtypes.iteritems():
+            fields.append({'name': column,
+                           'type': as_jsontable_type(type_)})
+    else:
+        fields.append({
+            'name': data.name if data.name is not None else 'values',
+            'type': as_jsontable_type(data.dtype)})
+
+    schema['fields'] = fields
+    if index and data.index.is_unique and primary_key is None:
+        # TODO: Always a list, spec allows for a string scalar.
+        if data.index.nlevels == 1:
+            schema['primary_key'] = data.index.name
+        else:
+            schema['primary_key'] = data.index.names
+    elif primary_key is not None:
+        schema['primary_key'] = primary_key  # caller's value, unvalidated
+    return schema
+
+
+def publish_tableschema(data):
+    """Temporary helper: display ``data`` as a table-schema MIME bundle."""
+    from IPython.display import display  # imported at call time
+    mimetype = 'application/vnd.tableschema.v1+json'
+    payload = data.to_json(orient='jsontable_schema')
+    display({mimetype: payload}, raw=True)
diff --git a/pandas/io/tests/json/test_json_table_schema.py b/pandas/io/tests/json/test_json_table_schema.py