Skip to content

Commit 68c1bd3

Browse files
committed
ENH: Added to_json_schema
Lays the groundwork for pandas-dev#14386 This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup. DOC: More notes in prose docs Move files use isoformat updates Moved to to_json
1 parent 8452080 commit 68c1bd3

File tree

5 files changed

+352
-3
lines changed

5 files changed

+352
-3
lines changed

doc/source/api.rst

+5
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ JSON
6363

6464
.. currentmodule:: pandas
6565

66+
.. autosummary::
67+
:toctree: generated/
68+
69+
to_json_schema
70+
6671
HTML
6772
~~~~
6873

doc/source/io.rst

+40
Original file line numberDiff line numberDiff line change
@@ -2033,6 +2033,46 @@ using Hadoop or Spark.
20332033
df
20342034
df.to_json(orient='records', lines=True)
20352035
2036+
2037+
JSON Table Schema
2038+
-----------------
2039+
2040+
`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
2041+
object. The JSON includes information on the field names, types, and
2042+
other attributes. The :func:`pd.to_json_schema` function will build a
2043+
JSON Table Schema compatible dict, which can be easily serialized.
2044+
2045+
.. ipython:: python
2046+
2047+
df = pd.DataFrame(
2048+
{'A': [1, 2, 3],
2049+
'B': ['a', 'b', 'c'],
2050+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
2051+
}, index=pd.Index(range(3), name='idx'))
2052+
df
2053+
2054+
pd.to_json_schema(df)
2055+
2056+
The full list of supported types is described in the JSON Table Schema
2057+
spec. This table shows the mapping from pandas types:
2058+
2059+
=============== ======================
2060+
Pandas type     JSON Table Schema type
2061+
=============== ======================
2062+
int64           integer
2063+
float64         number
2064+
bool            boolean
2065+
datetime64[ns]  date
2066+
timedelta64[ns] duration
2067+
=============== ======================
2068+
2069+
By default, the `primary_key` attribute is set to the index when
2070+
the index (or MultiIndex) has a name (or names) and is unique.
2071+
This behavior can be overridden with the `index` and `primary_key`
2072+
arguments.
2073+
2074+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
2075+
20362076
HTML
20372077
----
20382078

doc/source/whatsnew/v0.20.0.txt

+20
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,26 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
114114
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
115115
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
116116

117+
118+
JSON Table Schema Output
119+
^^^^^^^^^^^^^^^^^^^^^^^^
120+
121+
The new top-level method :func:`pd.to_json_schema` will generate
122+
a `JSON Table Schema`_ compatible dict describing the DataFrame.
123+
124+
.. ipython:: python
125+
126+
df = pd.DataFrame(
127+
{'A': [1, 2, 3],
128+
'B': ['a', 'b', 'c'],
129+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
130+
}, index=pd.Index(range(3), name='idx'))
131+
df
132+
133+
pd.to_json_schema(df)
134+
135+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
136+
117137
.. _whatsnew_0200.enhancements.other:
118138

119139
Other enhancements

pandas/io/json.py

+161-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import copy
5+
import json
56
from collections import defaultdict
67
import numpy as np
78

@@ -11,9 +12,13 @@
1112
from pandas import compat, isnull
1213
from pandas import Series, DataFrame, to_datetime
1314
from pandas.io.common import get_filepath_or_buffer, _get_handle
15+
from pandas.core import config
1416
from pandas.core.common import AbstractMethodError
1517
from pandas.formats.printing import pprint_thing
16-
18+
from pandas.types.common import (
19+
is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
20+
is_bool_dtype, is_datetime64_dtype
21+
)
1722
loads = _json.loads
1823
dumps = _json.dumps
1924

@@ -61,6 +66,22 @@ def __init__(self, obj, orient, date_format, double_precision,
6166
if orient is None:
6267
orient = self._default_orient
6368

69+
self.is_jsontable_schema = orient == 'jsontable_schema'
70+
if self.is_jsontable_schema:
71+
self.schema = to_json_schema(obj)
72+
73+
# XXX: Do this timedelta properly in to_json
74+
sample = obj.head(
75+
config.get_option('display.max_rows')).reset_index()
76+
timedeltas = sample.select_dtypes(include=['timedelta']).columns
77+
sample[timedeltas] = sample[timedeltas].applymap(isoformat)
78+
self.obj = sample
79+
date_format = 'iso' # ignoring user input, but epoch not allowed
80+
orient = 'records'
81+
82+
else:
83+
self.schema = None
84+
6485
self.orient = orient
6586
self.date_format = date_format
6687
self.double_precision = double_precision
@@ -75,14 +96,19 @@ def _format_axes(self):
7596
raise AbstractMethodError(self)
7697

7798
def write(self):
    """
    Serialize ``self.obj`` to a JSON string.

    When the writer was created for the ``'jsontable_schema'`` orient the
    serialized records are wrapped in an object of the form
    ``{"schema": ..., "data": ...}``; otherwise the serialized object is
    returned unchanged.
    """
    dump_options = dict(
        orient=self.orient,
        double_precision=self.double_precision,
        ensure_ascii=self.ensure_ascii,
        date_unit=self.date_unit,
        iso_dates=self.date_format == 'iso',
        default_handler=self.default_handler,
    )
    body = dumps(self.obj, **dump_options)

    if not self.is_jsontable_schema:
        return body

    # The schema is a plain dict, so the standard-library encoder is used
    # for it and the two JSON fragments are spliced together textually.
    schema_text = json.dumps(self.schema)
    return '{{"schema": {}, "data": {}}}'.format(schema_text, body)
86112

87113

88114
class SeriesWriter(Writer):
@@ -884,3 +910,135 @@ def _recursive_extract(data, path, seen_meta, level=0):
884910
result[k] = np.array(v).repeat(lengths)
885911

886912
return result
913+
914+
915+
# ---------------------------------------------------------------------
916+
# JSON-Table Schema routines
917+
# http://specs.frictionlessdata.io/json-table-schema/
918+
919+
920+
def as_jsontable_type(x):
    """
    Map a NumPy / pandas dtype to the name of its JSON Table Schema type.

    =============== ======================
    Pandas type     JSON Table Schema type
    =============== ======================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  date
    timedelta64[ns] duration
    =============== ======================

    Dtypes accepted by ``is_string_dtype`` map to ``'string'``; anything
    that matches none of the checks is reported as ``'any'``.
    """
    # The first matching predicate wins, so the sequence is significant:
    # the integer and boolean checks run before the general numeric check.
    dispatch = (
        (is_integer_dtype, 'integer'),
        (is_bool_dtype, 'boolean'),
        (is_numeric_dtype, 'number'),
        (is_datetime64_dtype, 'date'),
        (is_timedelta64_dtype, 'duration'),
        (is_string_dtype, 'string'),
    )
    for matches, jsontable_name in dispatch:
        if matches(x):
            return jsontable_name
    return 'any'
948+
949+
950+
def _set_default_names(data):
951+
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
952+
if all(name is not None for name in data.index.names):
953+
return data
954+
955+
data = data.copy()
956+
if data.index.nlevels > 1:
957+
names = [name if name is not None else 'level_{}'.format(i)
958+
for i, name in enumerate(data.index.names)]
959+
data.index.names = names
960+
else:
961+
data.index.name = 'index'
962+
return data
963+
964+
965+
def to_json_schema(data, index=True, primary_key=None):
    """
    Create a JSON Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : str, list of str or None, default None
        Column name(s) to designate as the primary key. Any value other
        than ``None`` is stored unchanged under ``'primary_key'``.
        The default ``None`` sets ``'primary_key'`` to the index level
        (or the list of levels for a MultiIndex) when ``index`` is true
        and the index is unique; otherwise no ``'primary_key'`` entry is
        written.

    Returns
    -------
    schema : dict

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> pd.to_json_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
                {'name': 'A', 'type': 'integer'},
                {'name': 'B', 'type': 'string'},
                {'name': 'C', 'type': 'date'}],
     'primary_key': 'idx'}

    Notes
    -----
    See `as_jsontable_type` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.
    """
    fields = []

    if index:
        # Use the same truthiness test for naming and for emitting the
        # index fields, so an included index never has a ``None`` name.
        data = _set_default_names(data)
        if data.index.nlevels > 1:
            for level in data.index.levels:
                fields.append({'name': level.name,
                               'type': as_jsontable_type(level.dtype)})
        else:
            fields.append({'name': data.index.name,
                           'type': as_jsontable_type(data.index.dtype)})

    if data.ndim > 1:
        # Pair labels with dtypes directly rather than through
        # ``Series.iteritems``, which newer pandas releases no longer have.
        dtypes = data.dtypes
        for column, type_ in zip(dtypes.index, dtypes):
            fields.append({'name': column,
                           'type': as_jsontable_type(type_)})
    else:
        fields.append({
            'name': data.name if data.name is not None else 'values',
            'type': as_jsontable_type(data.dtype)})

    schema = {'fields': fields}

    if primary_key is not None:
        schema['primary_key'] = primary_key
    elif index and data.index.is_unique:
        # TODO: Always a list, spec allows for a string scalar.
        if data.index.nlevels == 1:
            schema['primary_key'] = data.index.name
        else:
            # Plain list, so the schema holds only builtin containers.
            schema['primary_key'] = list(data.index.names)
    return schema
1037+
1038+
1039+
def publish_tableschema(data):
    """
    Temporary helper for testing with the frontend.

    Serializes ``data`` with ``to_json(orient='jsontable_schema')`` and
    hands the result to IPython's ``display`` as a raw bundle keyed by the
    table-schema MIME type.
    """
    from IPython.display import display

    bundle = {
        'application/vnd.tableschema.v1+json':
            data.to_json(orient='jsontable_schema'),
    }
    display(bundle, raw=True)

0 commit comments

Comments
 (0)