Skip to content

Commit 85c8d66

Browse files
committed
ENH: Added to_json_schema
Lays the groundwork for pandas-dev#14386. This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup.
1 parent d1b1720 commit 85c8d66

File tree

7 files changed

+318
-1
lines changed

7 files changed

+318
-1
lines changed

doc/source/api.rst

+5
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ JSON
6363

6464
.. currentmodule:: pandas
6565

66+
.. autosummary::
67+
:toctree: generated/
68+
69+
to_json_schema
70+
6671
HTML
6772
~~~~
6873

doc/source/io.rst

+23
Original file line numberDiff line numberDiff line change
@@ -1998,6 +1998,29 @@ using Hadoop or Spark.
19981998
df
19991999
df.to_json(orient='records', lines=True)
20002000
2001+
2002+
JSON Table Schema
2003+
-----------------
2004+
2005+
`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
2006+
object. The JSON includes information on the field names, types, and
2007+
other attributes. The :func:`pd.to_json_schema` function will build a
2008+
JSON Table Schema compatible dict, which can be easily serialized.
2009+
2010+
.. ipython:: python
2011+
2012+
df = pd.DataFrame(
2013+
{'A': [1, 2, 3],
2014+
'B': ['a', 'b', 'c'],
2015+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
2016+
}, index=pd.Index(range(3), name='idx'))
2017+
df
2018+
2019+
pd.to_json_schema(df)
2020+
2021+
2022+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
2023+
20012024
HTML
20022025
----
20032026

doc/source/whatsnew/v0.20.0.txt

+21
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
6565
df.groupby(['second', 'A']).sum()
6666

6767

68+
.. _whatsnew_0200.enhancements.json_table_schema:
69+
70+
JSON Table Schema Output
71+
^^^^^^^^^^^^^^^^^^^^^^^^
72+
73+
The new top-level function :func:`pd.to_json_schema` will generate
74+
a `JSON Table Schema`_ compatible dict describing the DataFrame.
75+
76+
.. ipython:: python
77+
78+
df = pd.DataFrame(
79+
{'A': [1, 2, 3],
80+
'B': ['a', 'b', 'c'],
81+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
82+
}, index=pd.Index(range(3), name='idx'))
83+
df
84+
85+
pd.to_json_schema(df)
86+
87+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
88+
6889
.. _whatsnew_0200.enhancements.other:
6990

7091
Other enhancements

pandas/api/tests/test_api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ class TestPDApi(Base, tm.TestCase):
8383
'pivot', 'pivot_table', 'plot_params', 'qcut',
8484
'scatter_matrix',
8585
'show_versions', 'timedelta_range', 'unique',
86-
'value_counts', 'wide_to_long']
86+
'value_counts', 'wide_to_long',
87+
'to_json_schema']
8788

8889
# top-level option funcs
8990
funcs_option = ['reset_option', 'describe_option', 'get_option',

pandas/core/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pandas.core.categorical import Categorical
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
12+
from pandas.formats.json import to_json_schema
1213
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1314
RangeIndex, Float64Index, MultiIndex)
1415

pandas/formats/json.py

+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""
2+
http://specs.frictionlessdata.io/json-table-schema/
3+
"""
4+
import json
5+
from collections import OrderedDict
6+
7+
from pandas.core import config
8+
from pandas.types.common import (
9+
is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
10+
is_bool_dtype, is_datetime64_dtype
11+
)
12+
13+
14+
def as_jsontable_type(x):
    """
    Convert a NumPy / pandas type to its corresponding jsontable type

    =============== ======================
    Pandas type     JSON Table Schema type
    =============== ======================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  date
    timedelta64[ns] duration
    object          string
    =============== ======================

    Anything that matches none of the checks below is reported as
    ``'any'``.

    Parameters
    ----------
    x : dtype or type
        Whatever the ``is_*_dtype`` helpers accept.

    Returns
    -------
    kind : str
        One of ``'integer'``, ``'boolean'``, ``'number'``, ``'date'``,
        ``'duration'``, ``'string'`` or ``'any'``.
    """
    # The order of the checks matters: the first matching predicate wins,
    # so the more specific integer / boolean checks come before the
    # general numeric one.
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    # NOTE(review): tz-aware datetime dtypes may not satisfy
    # ``is_datetime64_dtype`` and would then fall through -- confirm.
    elif is_datetime64_dtype(x):
        return 'date'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
42+
43+
44+
def _set_default_names(data):
45+
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
46+
if all(name is not None for name in data.index.names):
47+
return data
48+
49+
data = data.copy()
50+
if data.index.nlevels > 1:
51+
names = [name if name is not None else 'level_{}'.format(i)
52+
for i, name in enumerate(data.index.names)]
53+
data.index.names = names
54+
else:
55+
data.index.name = 'index'
56+
return data
57+
58+
59+
def to_json_schema(data, index=True, primary_key=None):
    """
    Create a JSON Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool
        Whether to include ``data.index`` in the schema.
    primary_key : str, list of str, or None
        Column name(s) to designate as the primary key; stored in the
        schema as given.
        The default `None` will set `'primary_key'` to the index
        level or levels if the index is included and unique.

    Returns
    -------
    schema : dict
        Always has a ``'fields'`` list of ``{'name': ..., 'type': ...}``
        dicts (index levels first, then the columns, or the single
        Series field, named ``'values'`` when the Series is unnamed).
        ``'primary_key'`` is present only when one was given or could be
        derived from the index.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> pd.to_json_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
                {'name': 'A', 'type': 'integer'},
                {'name': 'B', 'type': 'string'},
                {'name': 'C', 'type': 'date'}],
     'primary_key': 'idx'}
    """
    # Use the same truthiness test as the field-building code below so
    # that any truthy ``index`` gets default level names, not just ``True``.
    if index:
        data = _set_default_names(data)

    fields = []

    if index:
        if data.index.nlevels > 1:
            # Take the names from ``index.names`` rather than relying on
            # each level object carrying its name.
            for name, level in zip(data.index.names, data.index.levels):
                fields.append({'name': name,
                               'type': as_jsontable_type(level.dtype)})
        else:
            fields.append({'name': data.index.name,
                           'type': as_jsontable_type(data.index.dtype)})

    if data.ndim > 1:
        # Pair labels with dtypes directly instead of ``Series.iteritems``.
        for column, type_ in zip(data.columns, data.dtypes):
            fields.append({'name': column,
                           'type': as_jsontable_type(type_)})
    else:
        fields.append({
            'name': data.name if data.name is not None else 'values',
            'type': as_jsontable_type(data.dtype)})

    schema = {'fields': fields}

    if primary_key is not None:
        schema['primary_key'] = primary_key
    elif index and data.index.is_unique:
        # TODO: Always a list, spec allows for a string scalar.
        if data.index.nlevels == 1:
            schema['primary_key'] = data.index.name
        else:
            # Plain list so the schema holds only built-in containers.
            schema['primary_key'] = list(data.index.names)
    return schema
125+
126+
127+
def _build_payload(data):
    """
    Bundle the schema of ``data`` with a sample of its rows (for testing).

    Returns a dict with two keys: ``'schema'`` (the result of
    ``to_json_schema(data)``) and ``'data'`` (the first
    ``display.max_rows`` rows, index included, as a list of
    ``OrderedDict`` records).
    """
    schema = to_json_schema(data)
    row_limit = config.get_option('display.max_rows')
    head_rows = data.head(row_limit)
    # XXX: timedelta64 types ruin this...
    # XXX: using to_json to do the conversion to serializable
    # types. Better to fix `to_dict` to return python types,
    # or make a to_json that doesn't write to strings...
    records_json = head_rows.reset_index().to_json(orient='records',
                                                   date_format='iso')
    records = json.loads(records_json, object_pairs_hook=OrderedDict)
    return {'schema': schema, 'data': records}
141+
142+
143+
def publish_tableschema(data):
    """
    Temporary helper for testing w/ frontend.

    Builds the schema-plus-data payload for ``data`` and hands it to
    IPython's ``display`` as a raw bundle keyed by the table schema
    mimetype.
    """
    # Imported lazily so IPython is only needed when this helper is called.
    from IPython.display import display
    bundle = {'application/vnd.tableschema.v1+json': _build_payload(data)}
    display(bundle, raw=True)

pandas/tests/formats/test_json.py

+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""Tests for JSON Table Schema integration."""
2+
# import datetime
3+
from collections import OrderedDict
4+
5+
import numpy as np
6+
import pandas as pd
7+
8+
from pandas import DataFrame
9+
import pandas.util.testing as tm
10+
from pandas.formats.json import (
11+
as_jsontable_type, to_json_schema, _build_payload)
12+
13+
14+
class TestJSONTableSchema(tm.TestCase):
    """Tests for the JSON Table Schema helpers in ``pandas.formats.json``."""

    def setUp(self):
        # One column per supported kind: integer, string, date, duration,
        # with a named integer index.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4),
             },
            index=pd.Index(range(4), name='idx'))

    def test_to_json_schema(self):
        result = to_json_schema(self.df)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'date'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primary_key': 'idx'
        }
        self.assertEqual(result, expected)

    def test_series(self):
        result = to_json_schema(pd.Series([1, 2, 3], name='foo'))
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primary_key': 'index'}
        self.assertEqual(result, expected)

    def test_series_unnamed(self):
        # An unnamed Series falls back to the field name 'values'.
        result = to_json_schema(pd.Series([1, 2, 3]))
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primary_key': 'index'}
        self.assertEqual(result, expected)

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx

        # Fully unnamed levels get 'level_<i>' names.
        result = to_json_schema(df)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'date'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primary_key': ['level_0', 'level_1']
        }
        self.assertEqual(result, expected)

        # Partially named: only the unnamed level is defaulted.
        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primary_key'] = ['idx0', 'level_1']
        result = to_json_schema(df)
        self.assertEqual(result, expected)

    def test_as_jsontable_type(self):
        # Builtins are used instead of the ``np.int`` / ``np.float`` /
        # ``np.bool`` aliases, which were only aliases for them.
        integers = [int, np.int16, np.int32, np.int64]
        for t in integers:
            self.assertEqual(as_jsontable_type(t), 'integer')

        floats = [float, np.float16, np.float32, np.float64]
        for t in floats:
            self.assertEqual(as_jsontable_type(t), 'number')

        bools = [bool, np.bool_]
        for t in bools:
            self.assertEqual(as_jsontable_type(t), 'boolean')

        # TODO: datetime.date? datetime.time?
        dates = [np.datetime64, np.dtype("<M8[ns]")]
        for t in dates:
            self.assertEqual(as_jsontable_type(t), 'date')

        durations = [np.timedelta64, np.dtype("<m8[ns]")]
        for t in durations:
            self.assertEqual(as_jsontable_type(t), 'duration')

        strings = [object]  # TODO
        for t in strings:
            self.assertEqual(as_jsontable_type(t), 'string')

    def test_build_payload(self):
        # The timedelta column is dropped first (see the XXX note in
        # ``_build_payload``).
        result = _build_payload(self.df.drop('D', axis=1))
        expected = {'data': [
            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
                         ('C', '2016-01-01T00:00:00.000Z')]),
            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
                         ('C', '2016-01-02T00:00:00.000Z')]),
            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
                         ('C', '2016-01-03T00:00:00.000Z')]),
            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
                         ('C', '2016-01-04T00:00:00.000Z')]),
        ], 'schema': {'primary_key': 'idx',
                      'fields': [{'name': 'idx', 'type': 'integer'},
                                 {'name': 'A', 'type': 'integer'},
                                 {'name': 'B', 'type': 'string'},
                                 {'name': 'C', 'type': 'date'}]}}
        self.assertEqual(result, expected)

0 commit comments

Comments
 (0)