Skip to content

Commit dc4daa4

Browse files
committed
ENH: Added to_json_schema
Lays the groundwork for #14386 This handles the schema part of the request there. We'll still need to do the work to publish the data to the frontend, but that can be done as a followup. DOC: More notes in prose docs Move files use isoformat updates Moved to to_json
1 parent 4b37274 commit dc4daa4

File tree

5 files changed

+352
-7
lines changed

5 files changed

+352
-7
lines changed

doc/source/api.rst

+5
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ JSON
6363

6464
.. currentmodule:: pandas
6565

66+
.. autosummary::
67+
:toctree: generated/
68+
69+
to_json_schema
70+
6671
HTML
6772
~~~~
6873

doc/source/io.rst

+40
Original file line numberDiff line numberDiff line change
@@ -2052,6 +2052,46 @@ using Hadoop or Spark.
20522052
df
20532053
df.to_json(orient='records', lines=True)
20542054
2055+
2056+
JSON Table Schema
2057+
-----------------
2058+
2059+
`JSON Table Schema`_ is a spec for describing tabular datasets as a JSON
2060+
object. The JSON includes information on the field names, types, and
2061+
other attributes. The :func:`pd.to_json_schema` function will build a
2062+
JSON Table Schema compatible dict, which can be easily serialized.
2063+
2064+
.. ipython:: python
2065+
2066+
df = pd.DataFrame(
2067+
{'A': [1, 2, 3],
2068+
'B': ['a', 'b', 'c'],
2069+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
2070+
}, index=pd.Index(range(3), name='idx'))
2071+
df
2072+
2073+
pd.to_json_schema(df)
2074+
2075+
The full list of supported types is described in the JSON Table Schema
2076+
spec. This table shows the mapping from pandas types:
2077+
2078+
=============== ======================
Pandas type     JSON Table Schema type
=============== ======================
int64           integer
float64         number
bool            boolean
datetime64[ns]  date
timedelta64[ns] duration
=============== ======================
2087+
2088+
By default, the `primary_key` attribute is set to the index when
2089+
the index (or MultiIndex) has a name (or names) and is unique.
2090+
This behavior can be overridden with the `index` and `primary_key`
2091+
arguments.
2092+
2093+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
2094+
20552095
HTML
20562096
----
20572097

doc/source/whatsnew/v0.20.0.txt

+21
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,27 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
9191
df = pd.read_table(url, compression='bz2') # explicitly specify compression
9292
df.head(2)
9393

94+
.. _whatsnew_0200.enhancements.json_table_schema:
95+
96+
JSON Table Schema Output
97+
^^^^^^^^^^^^^^^^^^^^^^^^
98+
99+
The new top-level method :func:`pd.to_json_schema` will generate
100+
a `JSON Table Schema`_ compatible dict describing the DataFrame.
101+
102+
.. ipython:: python
103+
104+
df = pd.DataFrame(
105+
{'A': [1, 2, 3],
106+
'B': ['a', 'b', 'c'],
107+
'C': pd.date_range('2016-01-01', freq='d', periods=3),
108+
}, index=pd.Index(range(3), name='idx'))
109+
df
110+
111+
pd.to_json_schema(df)
112+
113+
.. _JSON Table Schema: http://specs.frictionlessdata.io/json-table-schema/
114+
94115
.. _whatsnew_0200.enhancements.other:
95116

96117
Other enhancements

pandas/io/json.py

+160-7
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import copy
5+
import json
56
from collections import defaultdict
67
import numpy as np
78

@@ -11,9 +12,13 @@
1112
from pandas import compat, isnull
1213
from pandas import Series, DataFrame, to_datetime
1314
from pandas.io.common import get_filepath_or_buffer, _get_handle
15+
from pandas.core import config
1416
from pandas.core.common import AbstractMethodError
1517
from pandas.formats.printing import pprint_thing
16-
18+
from pandas.types.common import (
19+
is_integer_dtype, is_timedelta64_dtype, is_string_dtype, is_numeric_dtype,
20+
is_bool_dtype, is_datetime64_dtype
21+
)
1722
loads = _json.loads
1823
dumps = _json.dumps
1924

@@ -61,6 +66,22 @@ def __init__(self, obj, orient, date_format, double_precision,
6166
if orient is None:
6267
orient = self._default_orient
6368

69+
self.is_jsontable_schema = orient == 'jsontable_schema'
70+
if self.is_jsontable_schema:
71+
self.schema = to_json_schema(obj)
72+
73+
# XXX: Do this timedelta properly in to_json
74+
sample = obj.head(
75+
config.get_option('display.max_rows')).reset_index()
76+
timedeltas = sample.select_dtypes(include=['timedelta']).columns
77+
sample[timedeltas] = sample[timedeltas].applymap(isoformat)
78+
self.obj = sample
79+
date_format = 'iso' # ignoring user input, but epoch not allowed
80+
orient = 'records'
81+
82+
else:
83+
self.schema = None
84+
6485
self.orient = orient
6586
self.date_format = date_format
6687
self.double_precision = double_precision
@@ -75,14 +96,19 @@ def _format_axes(self):
7596
raise AbstractMethodError(self)
7697

7798
def write(self):
    """
    Serialize ``self.obj`` to a JSON string.

    Returns
    -------
    str
        The output of ``dumps`` for ``self.obj``.  When
        ``self.is_jsontable_schema`` is set, that output is embedded as the
        ``"data"`` member of a JSON object whose ``"schema"`` member is
        ``self.schema`` serialized with the standard-library ``json``
        module.
    """
    payload = dumps(
        self.obj,
        orient=self.orient,
        double_precision=self.double_precision,
        ensure_ascii=self.ensure_ascii,
        date_unit=self.date_unit,
        iso_dates=self.date_format == 'iso',
        default_handler=self.default_handler,
    )
    if not self.is_jsontable_schema:
        return payload

    # ``payload`` is already JSON text, so it is spliced in verbatim rather
    # than being re-encoded as a string.
    schema_text = json.dumps(self.schema)
    return '{{"schema": {}, "data": {}}}'.format(schema_text, payload)
86112

87113

88114
class SeriesWriter(Writer):
@@ -884,10 +910,6 @@ def _recursive_extract(data, path, seen_meta, level=0):
884910

885911
return result
886912

887-
# ---------------------------------------------------------------------
888-
# JSON-Table Schema routines
889-
# http://specs.frictionlessdata.io/json-table-schema/
890-
891913

892914
# TODO: Make method on Timedelta?
893915
def isoformat(x):
@@ -909,3 +931,134 @@ def isoformat(x):
909931
tpl = 'Pn{td.days}Tn{td.hours}n{td.minutes}n{seconds}'.format(
910932
td=components, seconds=seconds)
911933
return tpl
934+
935+
# ---------------------------------------------------------------------
936+
# JSON-Table Schema routines
937+
# http://specs.frictionlessdata.io/json-table-schema/
938+
939+
940+
def as_jsontable_type(x):
    """
    Convert a NumPy / pandas type to its corresponding jsontable type

    Parameters
    ----------
    x : dtype or array-like
        Anything accepted by the pandas ``is_*_dtype`` helpers.

    Returns
    -------
    str
        The JSON Table Schema type name, ``'any'`` when nothing matches.

    ======================= ======================
    Pandas type             JSON Table Schema type
    ======================= ======================
    int64                   integer
    bool                    boolean
    float64                 number
    datetime64[ns]          date
    datetime64[ns, tz]      date
    timedelta64[ns]         duration
    string-like             string
    anything else           any
    ======================= ======================
    """
    # Imported here so the block does not depend on the module-level import
    # list; ``is_datetime64_any_dtype`` also matches tz-aware datetimes,
    # which ``is_datetime64_dtype`` does not.
    from pandas.api.types import is_datetime64_any_dtype

    # Integer and boolean checks come before the generic numeric check,
    # which would otherwise report them as 'number'.
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif is_datetime64_any_dtype(x):
        return 'date'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
968+
969+
970+
def _set_default_names(data):
971+
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
972+
if all(name is not None for name in data.index.names):
973+
return data
974+
975+
data = data.copy()
976+
if data.index.nlevels > 1:
977+
names = [name if name is not None else 'level_{}'.format(i)
978+
for i, name in enumerate(data.index.names)]
979+
data.index.names = names
980+
else:
981+
data.index.name = 'index'
982+
return data
983+
984+
985+
def to_json_schema(data, index=True, primary_key=None):
    """
    Create a JSON Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool
        Whether to include ``data.index`` in the schema.
    primary_key : str, list of str, or None
        Column name(s) to designate as the primary key. Any value other
        than `None` is stored in the schema as given.
        The default `None` will set `'primary_key'` to the index
        level or levels if the index is included and unique.

    Returns
    -------
    schema : dict

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> pd.to_json_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
                {'name': 'A', 'type': 'integer'},
                {'name': 'B', 'type': 'string'},
                {'name': 'C', 'type': 'date'}],
     'primary_key': 'idx'}

    Notes
    -----
    See `as_jsontable_type` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.
    """
    fields = []

    if index:
        # Plain truthiness (not ``index is True``) so that default names
        # are applied whenever the index is emitted; otherwise a truthy
        # non-``True`` flag would produce index fields named ``None``.
        data = _set_default_names(data)

        if data.index.nlevels > 1:
            for name, level in zip(data.index.names, data.index.levels):
                fields.append({'name': name,
                               'type': as_jsontable_type(level.dtype)})
        else:
            fields.append({'name': data.index.name,
                           'type': as_jsontable_type(data.index.dtype)})

    if data.ndim > 1:
        # zip() instead of ``dtypes.iteritems()``, which newer pandas
        # releases no longer provide.
        for column, type_ in zip(data.columns, data.dtypes):
            fields.append({'name': column,
                           'type': as_jsontable_type(type_)})
    else:
        fields.append({
            'name': data.name if data.name is not None else 'values',
            'type': as_jsontable_type(data.dtype)})

    schema = {'fields': fields}

    if primary_key is not None:
        schema['primary_key'] = primary_key
    elif index and data.index.is_unique:
        # TODO: Always a list, spec allows for a string scalar.
        if data.index.nlevels == 1:
            schema['primary_key'] = data.index.name
        else:
            # Plain list so the schema contains only built-in containers.
            schema['primary_key'] = list(data.index.names)
    return schema
1057+
1058+
1059+
def publish_tableschema(data):
    """
    Temporary helper for testing w/ frontend.

    Serializes ``data`` with ``to_json(orient='jsontable_schema')`` and
    hands the result to ``IPython.display.display`` as a raw bundle keyed
    by the table-schema mimetype.
    """
    from IPython.display import display

    bundle = {
        'application/vnd.tableschema.v1+json':
            data.to_json(orient='jsontable_schema'),
    }
    display(bundle, raw=True)

0 commit comments

Comments
 (0)