ENH: Added to_json_schema

TomAugspurger · TomAugspurger · commit 85c8d661c532 · 2016-12-17T12:15:30.000-06:00
Lays the groundwork for pandas-dev#14386. This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup.
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -63,6 +63,11 @@ JSON
 
 .. currentmodule:: pandas
 
+.. autosummary::
+   :toctree: generated/
+
+   to_json_schema
+
 HTML
 ~~~~
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1998,6 +1998,29 @@ using Hadoop or Spark.
   df
   df.to_json(orient='records', lines=True)
 
+
+JSON Table Schema
+-----------------
+
+`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
+object. The JSON includes information on the field names, types, and
+other attributes. The :func:`pd.to_json_schema` function will build a
+JSON Table Schema compatible dict, which can be easily serialized.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+
+   pd.to_json_schema(df)
+
+
+.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 HTML
 ----
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -65,6 +65,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
    df.groupby(['second', 'A']).sum()
 
 
+.. _whatsnew_0200.enhancements.json_table_schema:
+
+JSON Table Schema Output
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The new top-level function :func:`pd.to_json_schema` will generate
+a `JSON Table Schema`_ compatible dict describing the DataFrame.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+
+   pd.to_json_schema(df)
+
+.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
@@ -83,7 +83,8 @@ class TestPDApi(Base, tm.TestCase):
              'pivot', 'pivot_table', 'plot_params', 'qcut',
              'scatter_matrix',
              'show_versions', 'timedelta_range', 'unique',
-             'value_counts', 'wide_to_long']
+             'value_counts', 'wide_to_long',
+             'to_json_schema']
 
     # top-level option funcs
     funcs_option = ['reset_option', 'describe_option', 'get_option',
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -9,6 +9,7 @@
 from pandas.core.categorical import Categorical
 from pandas.core.groupby import Grouper
 from pandas.formats.format import set_eng_float_format
+from pandas.formats.json import to_json_schema
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
                                RangeIndex, Float64Index, MultiIndex)
 
diff --git a/pandas/formats/json.py b/pandas/formats/json.py
@@ -0,0 +1,148 @@
+"""
+http://specs.frictionlessdata.io/json-table-schema/
+"""
+import json
+from collections import OrderedDict
+
+from pandas.core import config
+from pandas.types.common import (
+    is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
+    is_bool_dtype, is_datetime64_dtype
+)
+
+
+def as_jsontable_type(x):
+    """
+    Convert a NumPy / pandas type to its corresponding jsontable type
+
+    Parameters
+    ----------
+    x : dtype or type
+        Anything the ``is_*_dtype`` predicates accept.
+
+    Returns
+    -------
+    str
+        The JSON Table Schema type name, as listed below.
+
+    Notes
+    -----
+    ===============  ======================
+    Pandas type      JSON Table Schema type
+    ===============  ======================
+    int64            integer
+    bool             boolean
+    float64          number
+    datetime64[ns]   date
+    timedelta64[ns]  duration
+    object           string
+    (anything else)  any
+    ===============  ======================
+    """
+    # The checks run from most to least specific. 'boolean' is tested
+    # before the generic numeric check, presumably because
+    # is_numeric_dtype also accepts bool -- confirm before reordering.
+    if is_integer_dtype(x):
+        return 'integer'
+    elif is_bool_dtype(x):
+        return 'boolean'
+    elif is_numeric_dtype(x):
+        return 'number'
+    elif is_datetime64_dtype(x):
+        return 'date'
+    elif is_timedelta64_dtype(x):
+        return 'duration'
+    elif is_string_dtype(x):
+        return 'string'
+    else:
+        return 'any'
+
+
+def _set_default_names(data):
+    """
+    Return ``data`` with every index level named.
+
+    An unnamed single-level index is called 'index'; unnamed levels of a
+    multi-level index are called 'level_<position>'. When every level is
+    already named, ``data`` itself is returned; otherwise the names are
+    set on a copy and the copy is returned.
+    """
+    if not any(label is None for label in data.index.names):
+        return data
+
+    renamed = data.copy()
+    if renamed.index.nlevels <= 1:
+        renamed.index.name = 'index'
+        return renamed
+
+    renamed.index.names = [
+        'level_{}'.format(pos) if label is None else label
+        for pos, label in enumerate(renamed.index.names)]
+    return renamed
+
+
+def to_json_schema(data, index=True, primary_key=None):
+    """
+    Create a JSON Table schema from ``data``.
+
+    Parameters
+    ----------
+    data : Series, DataFrame
+    index : bool, default True
+        Whether to include ``data.index`` in the schema.
+    primary_key : str, list of str or None, default None
+        Column name(s) to designate as the primary key.
+        The default `None` will set `'primary_key'` to the index
+        level or levels if ``index`` is true and the index is unique;
+        otherwise no `'primary_key'` entry is written.
+
+    Returns
+    -------
+    schema : dict
+        Holds a `'fields'` list of ``{'name': ..., 'type': ...}`` dicts
+        (index levels first, then the columns or the Series values) and,
+        when one applies, a `'primary_key'` entry.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {'A': [1, 2, 3],
+    ...      'B': ['a', 'b', 'c'],
+    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
+    ...     }, index=pd.Index(range(3), name='idx'))
+    >>> pd.to_json_schema(df)
+    {'fields': [{'name': 'idx', 'type': 'integer'},
+                {'name': 'A', 'type': 'integer'},
+                {'name': 'B', 'type': 'string'},
+                {'name': 'C', 'type': 'date'}],
+     'primary_key': 'idx'}
+    """
+    schema = {}
+    fields = []
+
+    if index:
+        # Unnamed index levels get default names first so that every
+        # index field (and the primary key) has a usable name. This is
+        # keyed on truthiness, the same test used for emitting the fields.
+        data = _set_default_names(data)
+
+        if data.index.nlevels > 1:
+            for level in data.index.levels:
+                fields.append({'name': level.name,
+                               'type': as_jsontable_type(level.dtype)})
+        else:
+            fields.append({'name': data.index.name,
+                           'type': as_jsontable_type(data.index.dtype)})
+
+    if data.ndim > 1:
+        # Pair each column label with its dtype, in column order.
+        for column, type_ in zip(data.columns, data.dtypes):
+            fields.append({'name': column,
+                           'type': as_jsontable_type(type_)})
+    else:
+        fields.append({
+            'name': data.name if data.name is not None else 'values',
+            'type': as_jsontable_type(data.dtype)})
+
+    schema['fields'] = fields
+    if primary_key is not None:
+        # An explicit primary key always wins over the index default.
+        schema['primary_key'] = primary_key
+    elif index and data.index.is_unique:
+        # TODO: Always a list, spec allows for a string scalar.
+        if data.index.nlevels == 1:
+            schema['primary_key'] = data.index.name
+        else:
+            # Plain list rather than the index's own names container.
+            schema['primary_key'] = list(data.index.names)
+    return schema
+
+
+def _build_payload(data):
+    """
+    Bundle the schema of ``data`` with a JSON-safe sample of its rows.
+
+    For testing. The result is a dict with a 'schema' entry built by
+    ``to_json_schema`` and a 'data' entry holding the first
+    ``display.max_rows`` rows (index included) as ``OrderedDict`` records.
+    """
+    table_schema = to_json_schema(data)
+    row_limit = config.get_option('display.max_rows')
+    # XXX: timedelta64 types ruin this...
+    # XXX: using to_json to do the conversion to serializable
+    # types. Better to fix `to_dict` to return python types,
+    # or make a to_json that doesn't write to strings...
+    encoded = data.head(row_limit).reset_index().to_json(
+        orient='records', date_format='iso')
+    records = json.loads(encoded, object_pairs_hook=OrderedDict)
+    return {'schema': table_schema, 'data': records}
+
+
+def publish_tableschema(data):
+    """
+    Display ``data`` as a table-schema payload via IPython.
+
+    Temporary helper for testing w/ frontend.
+    """
+    from IPython.display import display
+    bundle = {'application/vnd.tableschema.v1+json': _build_payload(data)}
+    display(bundle, raw=True)
diff --git a/pandas/tests/formats/test_json.py b/pandas/tests/formats/test_json.py
@@ -0,0 +1,118 @@
+"""Tests for JSON Table Schema integration."""
+# import datetime
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+from pandas.formats.json import (
+    as_jsontable_type, to_json_schema, _build_payload)
+
+
+class TestJSONTableSchema(tm.TestCase):
+    """Checks the schema and payload built for Series and DataFrames."""
+
+    def setUp(self):
+        # One column per supported family: integer, string, datetime
+        # and timedelta, under a named integer index.
+        self.df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             'D': pd.timedelta_range('1H', periods=4),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+    def test_to_json_schema(self):
+        result = to_json_schema(self.df)
+        expected = {
+            'fields': [{'name': 'idx', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'date'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primary_key': 'idx'
+        }
+        self.assertEqual(result, expected)
+
+    def test_series(self):
+        result = to_json_schema(pd.Series([1, 2, 3], name='foo'))
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'foo', 'type': 'integer'}],
+                    'primary_key': 'index'}
+        self.assertEqual(result, expected)
+
+    def test_series_unnamed(self):
+        # An unnamed Series falls back to the field name 'values'.
+        result = to_json_schema(pd.Series([1, 2, 3]))
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'values', 'type': 'integer'}],
+                    'primary_key': 'index'}
+        self.assertEqual(result, expected)
+
+    def test_multiindex(self):
+        df = self.df.copy()
+        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
+        df.index = idx
+
+        # Fully unnamed levels are called level_0, level_1, ...
+        result = to_json_schema(df)
+        expected = {
+            'fields': [{'name': 'level_0', 'type': 'string'},
+                       {'name': 'level_1', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'date'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primary_key': ['level_0', 'level_1']
+        }
+        self.assertEqual(result, expected)
+
+        # Partially named: only the missing level gets a default name.
+        df.index.names = ['idx0', None]
+        expected['fields'][0]['name'] = 'idx0'
+        expected['primary_key'] = ['idx0', 'level_1']
+        result = to_json_schema(df)
+        self.assertEqual(result, expected)
+
+    def test_as_jsontable_type(self):
+        # The builtins stand in for the np.int / np.float / np.bool
+        # aliases, which are the same objects.
+        integers = [int, np.int16, np.int32, np.int64]
+        for t in integers:
+            self.assertEqual(as_jsontable_type(t), 'integer')
+
+        floats = [float, np.float16, np.float32, np.float64]
+        for t in floats:
+            self.assertEqual(as_jsontable_type(t), 'number')
+
+        bools = [bool, np.bool_]
+        for t in bools:
+            self.assertEqual(as_jsontable_type(t), 'boolean')
+
+        # TODO: datetime.date? datetime.time?
+        dates = [np.datetime64, np.dtype("<M8[ns]")]
+        for t in dates:
+            self.assertEqual(as_jsontable_type(t), 'date')
+
+        durations = [np.timedelta64, np.dtype("<m8[ns]")]
+        for t in durations:
+            self.assertEqual(as_jsontable_type(t), 'duration')
+
+        strings = [object]  # TODO
+        for t in strings:
+            self.assertEqual(as_jsontable_type(t), 'string')
+
+    def test_build_payload(self):
+        # The timedelta column is dropped first; see the XXX note in
+        # _build_payload about timedelta64 data.
+        result = _build_payload(self.df.drop('D', axis=1))
+        expected = {'data': [
+            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
+                         ('C', '2016-01-01T00:00:00.000Z')]),
+            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
+                         ('C', '2016-01-02T00:00:00.000Z')]),
+            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
+                         ('C', '2016-01-03T00:00:00.000Z')]),
+            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
+                         ('C', '2016-01-04T00:00:00.000Z')]),
+        ], 'schema': {'primary_key': 'idx',
+                      'fields': [{'name': 'idx', 'type': 'integer'},
+                                 {'name': 'A', 'type': 'integer'},
+                                 {'name': 'B', 'type': 'string'},
+                                 {'name': 'C', 'type': 'date'}]}}
+        self.assertEqual(result, expected)