Skip to content

Commit 041495a

Browse files
committed
ENH: Added to_json_schema
Lays the groundwork for pandas-dev#14386 This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup.
1 parent d1b1720 commit 041495a

File tree

7 files changed

+316
-1
lines changed

7 files changed

+316
-1
lines changed

doc/source/api.rst

+5
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ JSON
6363

6464
.. currentmodule:: pandas
6565

66+
.. autosummary::
67+
:toctree: generated/
68+
69+
to_json_schema
70+
6671
HTML
6772
~~~~
6873

doc/source/io.rst

+23
Original file line numberDiff line numberDiff line change
@@ -1998,6 +1998,29 @@ using Hadoop or Spark.
19981998
df
19991999
df.to_json(orient='records', lines=True)
20002000
2001+
2002+
JSON Table Schema
2003+
-----------------
2004+
2005+
`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
2006+
object. The JSON includes information on the field names, types, and
2007+
other attributes. The :func:`pd.to_json_schema` function will build a
2008+
JSON Table Schema compatible dict, which can be easily serialized.
2009+
2010+
.. ipython:: python
2011+
2012+
df = pd.DataFrame(
2013+
{'A': [1, 2, 3],
2014+
'B': ['a', 'b', 'c'],
2015+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
2016+
}, index=pd.Index(range(3), name='idx'))
2017+
df
2018+
2019+
pd.to_json_schema(df)
2020+
2021+
2022+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
2023+
20012024
HTML
20022025
----
20032026

doc/source/whatsnew/v0.20.0.txt

+21
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
6565
df.groupby(['second', 'A']).sum()
6666

6767

68+
.. _whatsnew_0200.enhancements.json_table_schema:
69+
70+
JSON Table Schema Output
71+
^^^^^^^^^^^^^^^^^^^^^^^^
72+
73+
The new top-level function :func:`pd.to_json_schema` will generate
74+
a `JSON Table Schema`_ compatible dict describing the DataFrame.
75+
76+
.. ipython:: python
77+
78+
df = pd.DataFrame(
79+
{'A': [1, 2, 3],
80+
'B': ['a', 'b', 'c'],
81+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
82+
}, index=pd.Index(range(3), name='idx'))
83+
df
84+
85+
pd.to_json_schema(df)
86+
87+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
88+
6889
.. _whatsnew_0200.enhancements.other:
6990

7091
Other enhancements

pandas/api/tests/test_api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ class TestPDApi(Base, tm.TestCase):
8383
'pivot', 'pivot_table', 'plot_params', 'qcut',
8484
'scatter_matrix',
8585
'show_versions', 'timedelta_range', 'unique',
86-
'value_counts', 'wide_to_long']
86+
'value_counts', 'wide_to_long',
87+
'to_json_schema']
8788

8889
# top-level option funcs
8990
funcs_option = ['reset_option', 'describe_option', 'get_option',

pandas/core/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pandas.core.categorical import Categorical
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
12+
from pandas.formats.json import to_json_schema
1213
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1314
RangeIndex, Float64Index, MultiIndex)
1415

pandas/formats/json.py

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""
2+
http://specs.frictionlessdata.io/json-table-schema/
3+
"""
4+
import json
5+
from collections import OrderedDict
6+
7+
from pandas.core import config
8+
from pandas.types.common import (
9+
is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
10+
is_bool_dtype, is_datetime64_dtype
11+
)
12+
13+
def as_jsontable_type(x):
    """
    Convert a NumPy / pandas type to its corresponding jsontable type.

    The checks run in the order of the table below and the first match
    wins, so integers and booleans are classified before the generic
    numeric check is reached.

    =============== ======================
    Pandas type     JSON Table Schema type
    =============== ======================
    int64           integer
    bool            boolean
    float64         number
    datetime64[ns]  date
    timedelta64[ns] duration
    object          string
    =============== ======================

    Parameters
    ----------
    x : dtype or type
        The NumPy / pandas dtype (or type object) to classify.

    Returns
    -------
    str
        One of ``'integer'``, ``'boolean'``, ``'number'``, ``'date'``,
        ``'duration'``, ``'string'``; ``'any'`` when no check matches.
    """
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    # Integers and booleans were handled above, so whatever is still
    # numeric here is reported as a plain 'number'.
    if is_numeric_dtype(x):
        return 'number'
    if is_datetime64_dtype(x):
        return 'date'
    if is_timedelta64_dtype(x):
        return 'duration'
    if is_string_dtype(x):
        return 'string'
    return 'any'
41+
42+
43+
def _set_default_names(data):
44+
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
45+
if all(name is not None for name in data.index.names):
46+
return data
47+
48+
data = data.copy()
49+
if data.index.nlevels > 1:
50+
names = [name if name is not None else 'level_{}'.format(i)
51+
for i, name in enumerate(data.index.names)]
52+
data.index.names = names
53+
else:
54+
data.index.name = 'index'
55+
return data
56+
57+
58+
def to_json_schema(data, index=True, primary_key=None):
    """
    Create a JSON Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool
        Whether to include ``data.index`` in the schema. Unnamed index
        levels are given default names before they are described.
    primary_key : str, list of str, or None
        Column names to designate as the primary key; when given it is
        stored unchanged under ``'primary_key'``.
        The default `None` will set `'primary_key'` to the index
        level or levels if ``index`` is true and the index is unique.

    Returns
    -------
    schema : dict
        Has a ``'fields'`` list with one ``{'name': ..., 'type': ...}``
        entry per index level (when requested) and per column, and
        optionally a ``'primary_key'`` entry.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> pd.to_json_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
                {'name': 'A', 'type': 'integer'},
                {'name': 'B', 'type': 'string'},
                {'name': 'C', 'type': 'date'}],
     'primary_key': 'idx'}
    """
    # Use plain truthiness here, matching the checks below, so any truthy
    # ``index`` flag gets default names rather than a field named ``None``.
    if index:
        data = _set_default_names(data)

    fields = []

    if index:
        if data.index.nlevels > 1:
            # Take the names from the MultiIndex itself rather than from the
            # individual level objects.
            for name, level in zip(data.index.names, data.index.levels):
                fields.append({'name': name,
                               'type': as_jsontable_type(level.dtype)})
        else:
            fields.append({'name': data.index.name,
                           'type': as_jsontable_type(data.index.dtype)})

    if data.ndim > 1:
        # ``dtypes`` holds one entry per column, in column order.
        for column, type_ in zip(data.columns, data.dtypes):
            fields.append({'name': column,
                           'type': as_jsontable_type(type_)})
    else:
        fields.append({
            'name': data.name if data.name is not None else 'values',
            'type': as_jsontable_type(data.dtype)})

    schema = {'fields': fields}
    if primary_key is not None:
        schema['primary_key'] = primary_key
    elif index and data.index.is_unique:
        # TODO: Always a list, spec allows for a string scalar.
        if data.index.nlevels == 1:
            schema['primary_key'] = data.index.name
        else:
            # Plain list so the schema contains only built-in containers.
            schema['primary_key'] = list(data.index.names)
    return schema
124+
125+
126+
def _build_payload(data):
    """
    Assemble a ``{'schema': ..., 'data': ...}`` payload for ``data``.

    For testing. The rows are limited to the ``display.max_rows`` option
    and returned as a list of ``OrderedDict`` records, index included.
    """
    table_schema = to_json_schema(data)
    row_limit = config.get_option('display.max_rows')
    preview = data.head(row_limit)
    # XXX: timedelta64 types ruin this...
    # XXX: using to_json to do the conversion to serializable
    # types. Better to fix `to_dict` to return python types,
    # or make a to_json that doesn't write to strings...
    records_json = preview.reset_index().to_json(orient='records',
                                                 date_format='iso')
    records = json.loads(records_json, object_pairs_hook=OrderedDict)
    return {'schema': table_schema, 'data': records}
140+
141+
def publish_tableschema(data):
    """Temporary helper for testing w/ frontend"""
    # Imported here so that IPython is only needed when this helper is called.
    from IPython.display import display

    bundle = {'application/vnd.tableschema.v1+json': _build_payload(data)}
    display(bundle, raw=True)

pandas/tests/formats/test_json.py

+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""Tests for JSON Table Schema integration."""
2+
# import datetime
3+
from collections import OrderedDict
4+
5+
import numpy as np
6+
import pandas as pd
7+
8+
from pandas import DataFrame
9+
import pandas.util.testing as tm
10+
from pandas.formats.json import (
11+
as_jsontable_type, to_json_schema, _build_payload)
12+
13+
14+
class TestJSONTableSchema(tm.TestCase):
    """Checks for the JSON Table Schema helpers in ``pandas.formats.json``."""

    def setUp(self):
        # Four rows with integer, string, datetime and timedelta columns
        # and a named, unique integer index.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4),
             },
            index=pd.Index(range(4), name='idx'))

    def test_to_json_schema(self):
        result = to_json_schema(self.df)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'date'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primary_key': 'idx'
        }
        self.assertEqual(result, expected)

    def test_series(self):
        result = to_json_schema(pd.Series([1, 2, 3], name='foo'))
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primary_key': 'index'}
        self.assertEqual(result, expected)

    def test_series_unnamed(self):
        # Previously spelled ``tets_...`` and therefore never collected.
        result = to_json_schema(pd.Series([1, 2, 3]))
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primary_key': 'index'}
        self.assertEqual(result, expected)

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx

        result = to_json_schema(df)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'date'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primary_key': ['level_0', 'level_1']
        }
        self.assertEqual(result, expected)

        # A partially named MultiIndex keeps the given name and only
        # fills in the missing one.
        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primary_key'] = ['idx0', 'level_1']
        result = to_json_schema(df)
        self.assertEqual(result, expected)

    def test_as_jsontable_type(self):
        # The builtins replace the ``np.int`` / ``np.float`` / ``np.bool``
        # aliases, which were the builtin types under another name.
        integers = [int, np.int16, np.int32, np.int64]
        for t in integers:
            self.assertEqual(as_jsontable_type(t), 'integer')

        floats = [float, np.float16, np.float32, np.float64]
        for t in floats:
            self.assertEqual(as_jsontable_type(t), 'number')

        bools = [bool, np.bool_]
        for t in bools:
            self.assertEqual(as_jsontable_type(t), 'boolean')

        # TODO: datetime.date? datetime.time?
        dates = [np.datetime64, np.dtype("<M8[ns]")]
        for t in dates:
            self.assertEqual(as_jsontable_type(t), 'date')

        durations = [np.timedelta64, np.dtype("<m8[ns]")]
        for t in durations:
            self.assertEqual(as_jsontable_type(t), 'duration')

        strings = [object]  # TODO
        for t in strings:
            self.assertEqual(as_jsontable_type(t), 'string')

    def test_build_payload(self):
        # Column 'D' is dropped: timedelta values are not handled by the
        # payload builder yet.
        result = _build_payload(self.df.drop('D', axis=1))
        expected = {'data': [
            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
                         ('C', '2016-01-01T00:00:00.000Z')]),
            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
                         ('C', '2016-01-02T00:00:00.000Z')]),
            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
                         ('C', '2016-01-03T00:00:00.000Z')]),
            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
                         ('C', '2016-01-04T00:00:00.000Z')]),
        ], 'schema': {'primary_key': 'idx',
                      'fields': [{'name': 'idx', 'type': 'integer'},
                                 {'name': 'A', 'type': 'integer'},
                                 {'name': 'B', 'type': 'string'},
                                 {'name': 'C', 'type': 'date'}]}}
        self.assertEqual(result, expected)

0 commit comments

Comments
 (0)