ENH: Added to_json_schema (#14904)

TomAugspurger · jorisvandenbossche · commit 07ac39e95565 · 2017-03-04T12:50:04.000+01:00
Lays the groundwork for #14386 This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup. Added publish to dataframe repr
diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
@@ -4,3 +4,5 @@ pathlib
 backports.lzma
 py
 PyCrypto
+mock
+ipython
diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run
@@ -18,3 +18,4 @@ pymysql
 psycopg2
 s3fs
 beautifulsoup4
+ipython
diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run
@@ -18,3 +18,4 @@ pymysql
 beautifulsoup4
 s3fs
 xarray
+ipython
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -60,6 +60,7 @@ JSON
    :toctree: generated/
 
    json_normalize
+   build_table_schema
 
 .. currentmodule:: pandas
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2033,6 +2033,126 @@ using Hadoop or Spark.
   df
   df.to_json(orient='records', lines=True)
 
+
+.. _io.table_schema:
+
+Table Schema
+''''''''''''
+
+.. versionadded:: 0.20.0
+
+`Table Schema`_ is a spec for describing tabular datasets as a JSON
+object. The JSON includes information on the field names, types, and
+other attributes. You can use the orient ``table`` to build
+a JSON string with two fields, ``schema`` and ``data``.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+   df.to_json(orient='table', date_format="iso")
+
+The ``schema`` field contains the ``fields`` key, which itself contains
+a list of column name to type pairs, including the ``Index`` or ``MultiIndex``
+(see below for a list of types).
+The ``schema`` field also contains a ``primaryKey`` field if the (Multi)index
+is unique.
+
+The second field, ``data``, contains the serialized data with the ``records``
+orient.
+The index is included, and any datetimes are ISO 8601 formatted, as required
+by the Table Schema spec.
+
+The full list of supported types is described in the Table Schema
+spec. This table shows the mapping from pandas types:
+
+===============  =================
+Pandas type      Table Schema type
+===============  =================
+int64            integer
+float64          number
+bool             boolean
+datetime64[ns]   datetime
+timedelta64[ns]  duration
+categorical      any
+object           string
+===============  =================
+
+A few notes on the generated table schema:
+
+- The ``schema`` object contains a ``pandas_version`` field. This contains
+  the version of pandas' dialect of the schema, and will be incremented
+  with each revision.
+- All dates are converted to UTC when serializing, even timezone naïve values,
+  which are treated as UTC with an offset of 0.
+
+  .. ipython:: python
+
+     from pandas.io.json import build_table_schema
+     s = pd.Series(pd.date_range('2016', periods=4))
+     build_table_schema(s)
+
+- datetimes with a timezone (before serializing), include an additional field
+  ``tz`` with the time zone name (e.g. ``'US/Central'``).
+
+  .. ipython:: python
+
+     s_tz = pd.Series(pd.date_range('2016', periods=12,
+                                    tz='US/Central'))
+     build_table_schema(s_tz)
+
+- Periods are converted to timestamps before serialization, and so have the
+  same behavior of being converted to UTC. In addition, periods will contain
+  an additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'``.
+
+  .. ipython:: python
+
+     s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC',
+                                                periods=4))
+     build_table_schema(s_per)
+
+- Categoricals use the ``any`` type and an ``enum`` constraint listing
+  the set of possible values. Additionally, an ``ordered`` field is included.
+
+  .. ipython:: python
+
+     s_cat = pd.Series(pd.Categorical(['a', 'b', 'a']))
+     build_table_schema(s_cat)
+
+- A ``primaryKey`` field, containing an array of labels, is included
+  *if the index is unique*:
+
+  .. ipython:: python
+
+     s_dupe = pd.Series([1, 2], index=[1, 1])
+     build_table_schema(s_dupe)
+
+- The ``primaryKey`` behavior is the same with MultiIndexes, but in this
+  case the ``primaryKey`` is an array:
+
+  .. ipython:: python
+
+     s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'),
+                                                              (0, 1)]))
+     build_table_schema(s_multi)
+
+- The default naming roughly follows these rules:
+
+  + For series, the ``object.name`` is used. If that's ``None``, then the
+    name is ``values``
+  + For DataFrames, the stringified version of the column name is used
+  + For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a
+    fallback to ``index`` if that is None.
+  + For ``MultiIndex``, ``mi.names`` is used. If any level has no name,
+    then ``level_<i>`` is used.
+
+
+.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+
 HTML
 ----
 
diff --git a/doc/source/options.rst b/doc/source/options.rst
@@ -397,6 +397,9 @@ display.width                       80           Width of the display in charact
                                                  IPython qtconsole, or IDLE do not run in a
                                                  terminal and hence it is not possible
                                                  to correctly detect the width.
+display.html.table_schema           False        Whether to publish a Table Schema
+                                                 representation for frontends that
+                                                 support it.
 html.border                         1            A ``border=value`` attribute is
                                                  inserted in the ``<table>`` tag
                                                  for the DataFrame HTML repr.
@@ -424,6 +427,7 @@ mode.use_inf_as_null                False        True means treat None, NaN, -IN
                                                  are not null (new way).
 =================================== ============ ==================================
 
+
 .. _basics.console_output:
 
 Number Formatting
@@ -512,3 +516,20 @@ Enabling ``display.unicode.ambiguous_as_wide`` lets pandas to figure these chara
 
    pd.set_option('display.unicode.east_asian_width', False)
    pd.set_option('display.unicode.ambiguous_as_wide', False)
+
+.. _options.table_schema:
+
+Table Schema Display
+--------------------
+
+.. versionadded:: 0.20.0
+
+``DataFrame`` and ``Series`` can publish a Table Schema representation
+for frontends that support it. This is disabled by default, and can be
+enabled globally with the ``display.html.table_schema`` option:
+
+.. ipython:: python
+
+  pd.set_option('display.html.table_schema', True)
+
+Only the first ``display.max_rows`` rows are serialized and published.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -12,6 +12,7 @@ Highlights include:
 - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
 - The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
 - Switched the test framework to `pytest`_ (:issue:`13097`)
+- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here <whatsnew_0200.enhancements.table_schema>`
 
 .. _pytest: http://doc.pytest.org/en/latest/
 
@@ -154,6 +155,40 @@ New Behavior:
 
   df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
 
+.. _whatsnew_0200.enhancements.table_schema:
+
+Table Schema Output
+^^^^^^^^^^^^^^^^^^^
+
+The new orient ``'table'`` for :meth:`DataFrame.to_json`
+will generate a `Table Schema`_ compatible string representation of
+the data.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {'A': [1, 2, 3],
+        'B': ['a', 'b', 'c'],
+        'C': pd.date_range('2016-01-01', freq='d', periods=3),
+       }, index=pd.Index(range(3), name='idx'))
+   df
+   df.to_json(orient='table')
+
+
+See :ref:`IO: Table Schema <io.table_schema>` for more.
+
+Additionally, the repr for ``DataFrame`` and ``Series`` can now publish
+this JSON Table schema representation of the Series or DataFrame if you are
+using IPython (or another frontend like `nteract`_ using the Jupyter messaging
+protocol).
+This gives frontends like the Jupyter notebook and `nteract`_
+more flexibility in how they display pandas objects, since they have
+more information about the data.
+You must enable this by setting the ``display.html.table_schema`` option to True.
+
+.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
+.. _nteract: http://nteract.io/
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -164,6 +164,13 @@
     (default: False)
 """
 
+pc_table_schema_doc = """
+: boolean
+    Whether to publish a Table Schema representation for frontends
+    that support it.
+    (default: False)
+"""
+
 pc_line_width_deprecation_warning = """\
 line_width has been deprecated, use display.width instead (currently both are
 identical)
@@ -366,6 +373,9 @@ def mpl_style_cb(key):
                        validator=is_text)
     cf.register_option('latex.multirow', False, pc_latex_multirow,
                        validator=is_bool)
+    cf.register_option('html.table_schema', False, pc_table_schema_doc,
+                       validator=is_bool)
+
 
 cf.deprecate_option('display.line_width',
                     msg=pc_line_width_deprecation_warning,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4,6 +4,7 @@
 import operator
 import weakref
 import gc
+import json
 
 import numpy as np
 import pandas.lib as lib
@@ -129,6 +130,37 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
         object.__setattr__(self, '_data', data)
         object.__setattr__(self, '_item_cache', {})
 
+    def _ipython_display_(self):
+        try:
+            from IPython.display import display
+        except ImportError:
+            return None
+
+        # Series doesn't define _repr_html_ or _repr_latex_
+        latex = self._repr_latex_() if hasattr(self, '_repr_latex_') else None
+        html = self._repr_html_() if hasattr(self, '_repr_html_') else None
+        table_schema = self._repr_table_schema_()
+        # We need the initial newline since we aren't going through the
+        # usual __repr__. See
+        # https://github.com/pandas-dev/pandas/pull/14904#issuecomment-277829277
+        text = "\n" + repr(self)
+
+        reprs = {"text/plain": text, "text/html": html, "text/latex": latex,
+                 "application/vnd.dataresource+json": table_schema}
+        reprs = {k: v for k, v in reprs.items() if v}
+        display(reprs, raw=True)
+
+    def _repr_table_schema_(self):
+        """
+        Not a real Jupyter special repr method, but we use the same
+        naming convention.
+        """
+        if config.get_option("display.html.table_schema"):
+            data = self.head(config.get_option('display.max_rows'))
+            payload = json.loads(data.to_json(orient='table'),
+                                 object_pairs_hook=collections.OrderedDict)
+            return payload
+
     def _validate_dtype(self, dtype):
         """ validate the passed dtype """
 
@@ -1094,7 +1126,7 @@ def __setstate__(self, state):
     strings before writing.
     """
 
-    def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
+    def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
                 default_handler=None, lines=False):
         """
@@ -1129,10 +1161,17 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
               - index : dict like {index -> {column -> value}}
               - columns : dict like {column -> {index -> value}}
               - values : just the values array
+              - table : dict like {'schema': {schema}, 'data': {data}}
+                describing the data, and the data component is
+                like ``orient='records'``.
 
-        date_format : {'epoch', 'iso'}
+                .. versionchanged:: 0.20.0
+
+        date_format : {None, 'epoch', 'iso'}
             Type of date conversion. `epoch` = epoch milliseconds,
-            `iso`` = ISO8601, default is epoch.
+            `iso` = ISO8601. The default depends on the `orient`. For
+            `orient='table'`, the default is `'iso'`. For all other orients,
+            the default is `'epoch'`.
         double_precision : The number of decimal places to use when encoding
             floating point values, default 10.
         force_ascii : force encoded string to be ASCII, default True.
@@ -1151,14 +1190,53 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
 
             .. versionadded:: 0.19.0
 
-
         Returns
         -------
         same type as input object with filtered info axis
 
+        See Also
+        --------
+        pd.read_json
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
+        ...                   index=['row 1', 'row 2'],
+        ...                   columns=['col 1', 'col 2'])
+        >>> df.to_json(orient='split')
+        '{"columns":["col 1","col 2"],
+          "index":["row 1","row 2"],
+          "data":[["a","b"],["c","d"]]}'
+
+        Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
+
+        >>> df.to_json(orient='index')
+        '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
+
+        Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
+        Note that index labels are not preserved with this encoding.
+
+        >>> df.to_json(orient='records')
+        '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
+
+        Encoding with Table Schema
+
+        >>> df.to_json(orient='table')
+        '{"schema": {"fields": [{"name": "index", "type": "string"},
+                                {"name": "col 1", "type": "string"},
+                                {"name": "col 2", "type": "string"}],
+                     "primaryKey": ["index"],
+                     "pandas_version": "0.20.0"},
+          "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
+                   {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
         """
 
         from pandas.io import json
+        if date_format is None and orient == 'table':
+            date_format = 'iso'
+        elif date_format is None:
+            date_format = 'epoch'
         return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient,
                             date_format=date_format,
                             double_precision=double_precision,
diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py
@@ -1,4 +1,5 @@
 from .json import to_json, read_json, loads, dumps  # noqa
 from .normalize import json_normalize  # noqa
+from .table_schema import build_table_schema  # noqa
 
-del json, normalize  # noqa
+del json, normalize, table_schema  # noqa
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
diff --git a/pandas/util/testing.py b/pandas/util/testing.py

-Original file line number
+Diff line change
 beautifulsoup4
 s3fs
 xarray
 +ipython