Commit fc893bd

read_json support for orient="table"
1 parent 6552718 commit fc893bd

5 files changed (+226, -3 lines)

doc/source/io.rst (+2, -1)

@@ -1833,7 +1833,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 
   DataFrame
     - default is ``columns``
-    - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``}
+    - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``}
 
   The format of the JSON string
 
@@ -1846,6 +1846,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
   ``index``; dict like {index -> {column -> value}}
   ``columns``; dict like {column -> {index -> value}}
   ``values``; just the values array
+  ``table``; adhering to the JSON `Table Schema`_
 
 - ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data
 - ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True
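
With this change the Table Schema output of ``DataFrame.to_json`` can be round-tripped through ``read_json``. A minimal sketch of the documented usage (the frame and its ``idx`` index name are illustrative, mirroring the new tests below):

    import pandas as pd
    import pandas.util.testing as tm

    # An integer column with a named index; names are illustrative.
    df = pd.DataFrame({'A': [1, 2, 3, 4]},
                      index=pd.Index(range(4), name='idx'))

    out = df.to_json(orient='table')            # Table Schema writer (0.20+)
    result = pd.read_json(out, orient='table')  # reader added by this commit

    tm.assert_frame_equal(df, result)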

doc/source/whatsnew/v0.23.0.txt (+1)

@@ -145,6 +145,7 @@ Other Enhancements
 - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
   Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
 - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
+- :func:`read_json` now supports ``table`` as a value to the ``orient`` argument (:issue:`18912`)
 
 .. _whatsnew_0230.api_breaking:

pandas/io/json/json.py (+8, -2)

@@ -16,7 +16,7 @@
 from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
-from .table_schema import build_table_schema
+from .table_schema import build_table_schema, parse_table_schema
 from pandas.core.dtypes.common import is_period_dtype
 
 loads = json.loads

@@ -261,13 +261,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         * when ``typ == 'frame'``,
 
           - allowed orients are ``{'split','records','index',
-            'columns','values'}``
+            'columns','values', 'table'}``
           - default is ``'columns'``
           - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
           - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.
 
+          .. versionadded:: 0.23.0
+             'table' as an allowed value for the ``orient`` argument
+
     typ : type of object to recover (series or frame), default 'frame'
     dtype : boolean or dict, default True
         If True, infer dtypes, if a dict of column to dtype, then use those,

@@ -839,6 +842,9 @@ def _parse_no_numpy(self):
         elif orient == "index":
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None).T
+        elif orient == 'table':
+            self.obj = parse_table_schema(json,
+                                          precise_float=self.precise_float)
         else:
             self.obj = DataFrame(
                 loads(json, precise_float=self.precise_float), dtype=None)
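
The new branch in ``_parse_no_numpy`` hands the raw JSON string to ``parse_table_schema``, which expects the payload that ``to_json(orient='table')`` produces: a top-level ``schema`` (fields, primaryKey, pandas_version) plus ``data`` as a list of records. A small sketch of that shape, using an illustrative two-row frame:

    import json  # standard-library json, only used here to inspect the payload
    import pandas as pd

    df = pd.DataFrame({'A': [1, 2]}, index=pd.Index([0, 1], name='idx'))
    payload = json.loads(df.to_json(orient='table'))

    # Roughly the structure parse_table_schema receives for this frame:
    # {'schema': {'fields': [{'name': 'idx', 'type': 'integer'},
    #                        {'name': 'A', 'type': 'integer'}],
    #             'primaryKey': ['idx'],
    #             'pandas_version': '0.20.0'},
    #  'data': [{'idx': 0, 'A': 1}, {'idx': 1, 'A': 2}]}
    print(payload['schema']['primaryKey'])   # ['idx']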

pandas/io/json/table_schema.py (+91)

@@ -3,13 +3,18 @@
 
 http://specs.frictionlessdata.io/json-table-schema/
 """
+import pandas._libs.json as json
+from pandas import DataFrame
+from pandas.api.types import CategoricalDtype
 from pandas.core.common import _all_not_none
 from pandas.core.dtypes.common import (
     is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
     is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
     is_categorical_dtype, is_period_dtype, is_string_dtype
 )
 
+loads = json.loads
+
 
 def as_json_table_type(x):
     """

@@ -103,6 +108,40 @@ def make_field(arr, dtype=None):
     return field
 
 
+def revert_field(field):
+    '''
+    Converts a JSON field descriptor into its corresponding NumPy / pandas type
+
+    Parameters
+    ----------
+    field
+        A JSON field descriptor
+
+    Returns
+    -------
+    dtype
+    '''
+    typ = field['type']
+    if typ == 'integer':
+        return 'int64'
+    elif typ == 'number':
+        return 'float64'
+    elif typ == 'boolean':
+        return 'bool'
+    elif typ == 'duration':
+        return 'timedelta64'
+    elif typ == 'datetime':
+        if field.get('tz'):
+            return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
+        else:
+            return 'datetime64[ns]'
+    elif typ == 'any':
+        if 'constraints' in field and 'ordered' in field:
+            return CategoricalDtype(categories=field['constraints']['enum'],
+                                    ordered=field['ordered'])
+    return 'object'
+
+
 def build_table_schema(data, index=True, primary_key=None, version=True):
     """
     Create a Table schema from ``data``.

@@ -180,3 +219,55 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
     if version:
         schema['pandas_version'] = '0.20.0'
     return schema
+
+
+def parse_table_schema(json, precise_float):
+    """
+    Builds a DataFrame from a given schema
+
+    Parameters
+    ----------
+    json :
+        A JSON table schema
+    precise_float : boolean
+        Flag controlling precision when decoding string to double values, as
+        dictated by ``read_json``
+
+    Returns
+    -------
+    df : DataFrame
+
+    Raises
+    ------
+    NotImplementedError
+        If the JSON table schema contains either timezone or timedelta data
+
+    See also
+    --------
+    build_table_schema : inverse function
+    pandas.read_json
+    """
+    table = loads(json, precise_float=precise_float)
+    col_order = [field['name'] for field in table['schema']['fields']]
+    df = DataFrame(table['data'])[col_order]
+
+    dtypes = {field['name']: revert_field(field)
+              for field in table['schema']['fields']}
+
+    # Cannot directly use astype with timezone data on object; raise for now
+    if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
+        raise NotImplementedError('table="orient" can not yet read timezone '
+                                  'data')
+
+    # No ISO constructor for Timedelta as of yet, so need to raise
+    if 'timedelta64' in dtypes.values():
+        raise NotImplementedError('table="orient" can not yet read '
+                                  'ISO-formatted Timedelta data')
+
+    df = df.astype(dtypes)
+
+    df = df.set_index(table['schema']['primaryKey'])
+    if all(x.startswith('level_') for x in df.index.names):
+        df.index.names = [None] * len(df.index.names)
+
+    return df
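
Two details of the new reading path are worth calling out: ``revert_field`` maps each Table Schema field descriptor back to a pandas dtype, and ``parse_table_schema`` then restores the index from ``primaryKey``, blanking auto-generated ``level_*`` names. A minimal sketch of the dtype mapping, with illustrative field descriptors in the shape ``make_field`` writes them (not taken verbatim from the commit):

    from pandas.api.types import CategoricalDtype
    from pandas.io.json.table_schema import revert_field

    # Illustrative descriptors; names and values are made up for this sketch.
    fields = [
        {'name': 'idx', 'type': 'integer'},
        {'name': 'price', 'type': 'number'},
        {'name': 'when', 'type': 'datetime'},
        {'name': 'grade', 'type': 'any',
         'constraints': {'enum': ['a', 'b', 'c']}, 'ordered': True},
    ]

    dtypes = {f['name']: revert_field(f) for f in fields}
    # {'idx': 'int64', 'price': 'float64', 'when': 'datetime64[ns]',
    #  'grade': CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)}
    assert isinstance(dtypes['grade'], CategoricalDtype)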

pandas/tests/io/json/test_json_table_schema.py (+124)

@@ -14,6 +14,7 @@
     build_table_schema,
     make_field,
     set_default_names)
+import pandas.util.testing as tm
 
 
 class TestBuildSchema(object):

@@ -471,3 +472,126 @@ def test_mi_falsey_name(self):
                                              ('a', 'b')]))
         result = [x['name'] for x in build_table_schema(df)['fields']]
         assert result == ['level_0', 'level_1', 0, 1, 2, 3]
+
+
+class TestTableOrientReader(object):
+
+    def test_integer(self):
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_object(self):
+        df = DataFrame(
+            {'B': ['a', 'b', 'c', 'c'],
+             },
+            index=pd.Index(range(4), name='idx'))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_date_range(self):
+        df = DataFrame(
+            {'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_timedelta_raises(self):
+        df = DataFrame(
+            {'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'ISO-formatted Timedelta data'):
+            pd.read_json(out, orient="table")
+
+    def test_categorical(self):
+        df = DataFrame(
+            {'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("float_vals", [
+        pytest.param([1., 2., 3., 4.], marks=pytest.mark.xfail),
+        [1.1, 2.2, 3.3, 4.4]])
+    def test_float(self, float_vals):
+        df = DataFrame(
+            {'G': float_vals,
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table", convert_axes=False)
+        tm.assert_frame_equal(df, result)
+
+    def test_timezone_raises(self):
+        df = DataFrame(
+            {'H': pd.date_range('2016-01-01', freq='d', periods=4,
+                                tz='US/Central'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '
+                                    'timezone data'):
+            pd.read_json(out, orient="table")
+
+    def test_bool(self):
+        df = DataFrame(
+            {'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    def test_comprehensive(self):
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             'G': [1.1, 2.2, 3.3, 4.4],
+             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+             #                    tz='US/Central'),
+             'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
+    def test_multiindex(self, index_names):
+        # GH 18912
+        df = pd.DataFrame(
+            [["Arr", "alpha", [1, 2, 3, 4]],
+             ["Bee", "Beta", [10, 20, 30, 40]]],
+            index=[["A", "B"], ["Null", "Eins"]],
+            columns=["Aussprache", "Griechisch", "Args"]
+        )
+        df.index.names = index_names
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
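
The two commented-out columns in ``test_comprehensive`` correspond to the ``NotImplementedError`` paths in ``parse_table_schema``. A minimal sketch of the timezone case outside the test suite (frame contents are illustrative):

    import pandas as pd
    import pytest

    df = pd.DataFrame(
        {'H': pd.date_range('2016-01-01', periods=2, tz='US/Central')},
        index=pd.Index(range(2), name='idx'))

    out = df.to_json(orient='table')

    # Reading tz-aware datetimes back is not yet supported by this commit,
    # so the reader raises; the tests assert the same message.
    with pytest.raises(NotImplementedError):
        pd.read_json(out, orient='table')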
