diff --git a/doc/source/io.rst b/doc/source/io.rst index 2ef7e6d3b64f4..3e1619d6e1578 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1648,7 +1648,7 @@ with optional parameters: DataFrame - default is ``columns`` - - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``} + - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} The format of the JSON string @@ -1732,6 +1732,9 @@ values, index and columns. Name is also included for ``Series``: dfjo.to_json(orient="split") sjo.to_json(orient="split") +**Table oriented** serializes to the JSON `Table Schema`_, allowing for the +preservation of metadata including but not limited to dtypes and index names. + .. note:: Any orient option that encodes to a JSON object will not preserve the ordering of @@ -1833,7 +1836,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` DataFrame - default is ``columns`` - - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``} + - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} The format of the JSON string @@ -1846,6 +1849,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` ``index``; dict like {index -> {column -> value}} ``columns``; dict like {column -> {index -> value}} ``values``; just the values array + ``table``; adhering to the JSON `Table Schema`_ + - ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data - ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True @@ -2202,7 +2207,39 @@ A few notes on the generated table schema: then ``level_`` is used. -_Table Schema: http://specs.frictionlessdata.io/json-table-schema/ +.. versionadded:: 0.23.0 + +``read_json`` also accepts ``orient='table'`` as an argument. 
This allows for +the preservation of metadata such as dtypes and index names in a +round-trippable manner. + + .. ipython:: python + + df = pd.DataFrame({'foo': [1, 2, 3, 4], + 'bar': ['a', 'b', 'c', 'd'], + 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + 'qux': pd.Categorical(['a', 'b', 'c', 'c']) + }, index=pd.Index(range(4), name='idx')) + df + df.dtypes + + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + new_df + new_df.dtypes + +Please note that the string `index` is not supported with the round trip +format, as it is used by default in ``to_json`` to indicate a missing index +name. + +.. ipython:: python + + df.index.name = 'index' + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + print(new_df.index.name) + +.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/ HTML ---- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 77de1851490b2..4edf8f8a62f61 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -145,6 +145,37 @@ Current Behavior s.rank(na_option='top') +.. _whatsnew_0230.enhancements.round-trippable_json: + +JSON read/write round-trippable with ``orient='table'`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. + +..
ipython:: python + + df = pd.DataFrame({'foo': [1, 2, 3, 4], + 'bar': ['a', 'b', 'c', 'd'], + 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + 'qux': pd.Categorical(['a', 'b', 'c', 'c']) + }, index=pd.Index(range(4), name='idx')) + df + df.dtypes + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + new_df + new_df.dtypes + +Please note that the string `index` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. + +.. ipython:: python + + df.index.name = 'index' + df.to_json('test.json', orient='table') + new_df = pd.read_json('test.json', orient='table') + new_df + print(new_df.index.name) + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 72ec5c59c90af..d1c83ad57f59d 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -16,7 +16,7 @@ from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits -from .table_schema import build_table_schema +from .table_schema import build_table_schema, parse_table_schema from pandas.core.dtypes.common import is_period_dtype loads = json.loads @@ -261,13 +261,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, * when ``typ == 'frame'``, - allowed orients are ``{'split','records','index', - 'columns','values'}`` + 'columns','values', 'table'}`` - default is ``'columns'`` - The DataFrame index must be unique for orients ``'index'`` and ``'columns'``. - The DataFrame columns must be unique for orients ``'index'``, ``'columns'``, and ``'records'``. + .. 
versionadded:: 0.23.0 + 'table' as an allowed value for the ``orient`` argument + typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True If True, infer dtypes, if a dict of column to dtype, then use those, @@ -336,6 +339,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, ------- result : Series or DataFrame, depending on the value of `typ`. + Notes + ----- + Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index`` + name of `index` gets written with ``write_json``, the subsequent read + operation will incorrectly set the ``Index`` name to ``None``. This is + because `index` is also used by ``write_json`` to denote a missing + ``Index`` name, and the subsequent ``read_json`` operation cannot + distinguish between the two. + See Also -------- DataFrame.to_json @@ -839,6 +851,9 @@ def _parse_no_numpy(self): elif orient == "index": self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None).T + elif orient == 'table': + self.obj = parse_table_schema(json, + precise_float=self.precise_float) else: self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 9cec5b3d6ba49..8da36b64b0914 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -3,6 +3,9 @@ http://specs.frictionlessdata.io/json-table-schema/ """ +import pandas._libs.json as json +from pandas import DataFrame +from pandas.api.types import CategoricalDtype from pandas.core.common import _all_not_none from pandas.core.dtypes.common import ( is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype, @@ -10,6 +13,8 @@ is_categorical_dtype, is_period_dtype, is_string_dtype ) +loads = json.loads + def as_json_table_type(x): """ @@ -75,7 +80,7 @@ def set_default_names(data): return data -def make_field(arr, dtype=None): +def convert_pandas_type_to_json_field(arr, 
dtype=None): dtype = dtype or arr.dtype if arr.name is None: name = 'values' @@ -103,6 +108,69 @@ return field +def convert_json_field_to_pandas_type(field): + """ + Converts a JSON field descriptor into its corresponding NumPy / pandas type + + Parameters + ---------- + field + A JSON field descriptor + + Returns + ------- + dtype + + Raises + ------ + ValueError + If the type of the provided field is unknown or currently unsupported + + Examples + -------- + >>> convert_json_field_to_pandas_type({'name': 'an_int', + 'type': 'integer'}) + 'int64' + >>> convert_json_field_to_pandas_type({'name': 'a_categorical', + 'type': 'any', + 'constraints': {'enum': [ + 'a', 'b', 'c']}, + 'ordered': True}) + 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)' + >>> convert_json_field_to_pandas_type({'name': 'a_datetime', + 'type': 'datetime'}) + 'datetime64[ns]' + >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz', + 'type': 'datetime', + 'tz': 'US/Central'}) + 'datetime64[ns, US/Central]' + """ + typ = field['type'] + if typ == 'string': + return 'object' + elif typ == 'integer': + return 'int64' + elif typ == 'number': + return 'float64' + elif typ == 'boolean': + return 'bool' + elif typ == 'duration': + return 'timedelta64' + elif typ == 'datetime': + if field.get('tz'): + return 'datetime64[ns, {tz}]'.format(tz=field['tz']) + else: + return 'datetime64[ns]' + elif typ == 'any': + if 'constraints' in field and 'ordered' in field: + return CategoricalDtype(categories=field['constraints']['enum'], + ordered=field['ordered']) + else: + return 'object' + + raise ValueError("Unsupported or invalid field type: {}".format(typ)) + + def build_table_schema(data, index=True, primary_key=None, version=True): """ Create a Table schema from ``data``. 
@@ -158,15 +226,15 @@ def build_table_schema(data, index=True, primary_key=None, version=True): if index: if data.index.nlevels > 1: for level in data.index.levels: - fields.append(make_field(level)) + fields.append(convert_pandas_type_to_json_field(level)) else: - fields.append(make_field(data.index)) + fields.append(convert_pandas_type_to_json_field(data.index)) if data.ndim > 1: for column, s in data.iteritems(): - fields.append(make_field(s)) + fields.append(convert_pandas_type_to_json_field(s)) else: - fields.append(make_field(data)) + fields.append(convert_pandas_type_to_json_field(data)) schema['fields'] = fields if index and data.index.is_unique and primary_key is None: @@ -180,3 +248,65 @@ def build_table_schema(data, index=True, primary_key=None, version=True): if version: schema['pandas_version'] = '0.20.0' return schema + + +def parse_table_schema(json, precise_float): + """ + Builds a DataFrame from a given schema + + Parameters + ---------- + json : + A JSON table schema + precise_float : boolean + Flag controlling precision when decoding string to double values, as + dictated by ``read_json`` + + Returns + ------- + df : DataFrame + + Raises + ------ + NotImplementedError + If the JSON table schema contains either timezone or timedelta data + + Notes + ----- + Because ``write_json`` uses the string `index` to denote a name-less + ``Index``, this function sets the name of the returned ``DataFrame`` to + ``None`` when said string is encountered. Therefore, intentional usage + of `index` as the ``Index`` name is not supported. 
+ + See also + -------- + build_table_schema : inverse function + pandas.read_json + """ + table = loads(json, precise_float=precise_float) + col_order = [field['name'] for field in table['schema']['fields']] + df = DataFrame(table['data'])[col_order] + + dtypes = {field['name']: convert_json_field_to_pandas_type(field) + for field in table['schema']['fields']} + + # Cannot directly use as_type with timezone data on object; raise for now + if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()): + raise NotImplementedError('table="orient" can not yet read timezone ' + 'data') + + # No ISO constructor for Timedelta as of yet, so need to raise + if 'timedelta64' in dtypes.values(): + raise NotImplementedError('table="orient" can not yet read ' + 'ISO-formatted Timedelta data') + + df = df.astype(dtypes) + + df = df.set_index(table['schema']['primaryKey']) + if len(df.index.names) == 1 and df.index.name == 'index': + df.index.name = None + else: + if all(x.startswith('level_') for x in df.index.names): + df.index.names = [None] * len(df.index.names) + + return df diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index dab56e264b955..76748f30e639b 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -12,8 +12,10 @@ from pandas.io.json.table_schema import ( as_json_table_type, build_table_schema, - make_field, + convert_pandas_type_to_json_field, + convert_json_field_to_pandas_type, set_default_names) +import pandas.util.testing as tm class TestBuildSchema(object): @@ -334,62 +336,89 @@ def test_date_format_raises(self): self.df.to_json(orient='table', date_format='iso') self.df.to_json(orient='table') - def test_make_field_int(self): + def test_convert_pandas_type_to_json_field_int(self): data = [1, 2, 3] kinds = [pd.Series(data, name='name'), pd.Index(data, name='name')] for kind in kinds: - result = make_field(kind) + result = 
convert_pandas_type_to_json_field(kind) expected = {"name": "name", "type": 'integer'} assert result == expected - def test_make_field_float(self): + def test_convert_pandas_type_to_json_field_float(self): data = [1., 2., 3.] kinds = [pd.Series(data, name='name'), pd.Index(data, name='name')] for kind in kinds: - result = make_field(kind) + result = convert_pandas_type_to_json_field(kind) expected = {"name": "name", "type": 'number'} assert result == expected - def test_make_field_datetime(self): + def test_convert_pandas_type_to_json_field_datetime(self): data = [1., 2., 3.] kinds = [pd.Series(pd.to_datetime(data), name='values'), pd.to_datetime(data)] for kind in kinds: - result = make_field(kind) + result = convert_pandas_type_to_json_field(kind) expected = {"name": "values", "type": 'datetime'} assert result == expected kinds = [pd.Series(pd.to_datetime(data, utc=True), name='values'), pd.to_datetime(data, utc=True)] for kind in kinds: - result = make_field(kind) + result = convert_pandas_type_to_json_field(kind) expected = {"name": "values", "type": 'datetime', "tz": "UTC"} assert result == expected arr = pd.period_range('2016', freq='A-DEC', periods=4) - result = make_field(arr) + result = convert_pandas_type_to_json_field(arr) expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"} assert result == expected - def test_make_field_categorical(self): + def test_convert_pandas_type_to_json_field_categorical(self): data = ['a', 'b', 'c'] ordereds = [True, False] for ordered in ordereds: arr = pd.Series(pd.Categorical(data, ordered=ordered), name='cats') - result = make_field(arr) + result = convert_pandas_type_to_json_field(arr) expected = {"name": "cats", "type": "any", "constraints": {"enum": data}, "ordered": ordered} assert result == expected arr = pd.CategoricalIndex(data, ordered=ordered, name='cats') - result = make_field(arr) + result = convert_pandas_type_to_json_field(arr) expected = {"name": "cats", "type": "any", "constraints": {"enum": 
data}, "ordered": ordered} assert result == expected + @pytest.mark.parametrize("inp,exp", [ + ({'type': 'integer'}, 'int64'), + ({'type': 'number'}, 'float64'), + ({'type': 'boolean'}, 'bool'), + ({'type': 'duration'}, 'timedelta64'), + ({'type': 'datetime'}, 'datetime64[ns]'), + ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'), + ({'type': 'any'}, 'object'), + ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']}, + 'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'], + ordered=False)), + ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']}, + 'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'], + ordered=True)), + ({'type': 'string'}, 'object')]) + def test_convert_json_field_to_pandas_type(self, inp, exp): + field = {'name': 'foo'} + field.update(inp) + assert convert_json_field_to_pandas_type(field) == exp + + @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"]) + def test_convert_json_field_to_pandas_type_raises(self, inp): + field = {'type': inp} + with tm.assert_raises_regex(ValueError, "Unsupported or invalid field " + "type: {}".format(inp)): + convert_json_field_to_pandas_type(field) + def test_categorical(self): s = pd.Series(pd.Categorical(['a', 'b', 'a'])) s.index.name = 'idx' @@ -471,3 +500,70 @@ def test_mi_falsey_name(self): ('a', 'b')])) result = [x['name'] for x in build_table_schema(df)['fields']] assert result == ['level_0', 'level_1', 0, 1, 2, 3] + + +class TestTableOrientReader(object): + + @pytest.mark.parametrize("index_nm", [ + None, "idx", pytest.param("index", marks=pytest.mark.xfail)]) + @pytest.mark.parametrize("vals", [ + {'ints': [1, 2, 3, 4]}, + {'objects': ['a', 'b', 'c', 'd']}, + {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)}, + {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))}, + {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], + ordered=True))}, + pytest.param({'floats': [1., 2., 3., 4.]}, 
marks=pytest.mark.xfail), + {'floats': [1.1, 2.2, 3.3, 4.4]}, + {'bools': [True, False, False, True]}]) + def test_read_json_table_orient(self, index_nm, vals): + df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + out = df.to_json(orient="table") + result = pd.read_json(out, orient="table") + tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("index_nm", [ + None, "idx", pytest.param("index", marks=pytest.mark.xfail)]) + @pytest.mark.parametrize("vals", [ + {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')}, + {'timezones': pd.date_range('2016-01-01', freq='d', periods=4, + tz='US/Central')}]) + def test_read_json_table_orient_raises(self, index_nm, vals): + df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + out = df.to_json(orient="table") + with tm.assert_raises_regex(NotImplementedError, 'can not yet read '): + pd.read_json(out, orient="table") + + def test_comprehensive(self): + df = DataFrame( + {'A': [1, 2, 3, 4], + 'B': ['a', 'b', 'c', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=4), + # 'D': pd.timedelta_range('1H', periods=4, freq='T'), + 'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])), + 'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], + ordered=True)), + 'G': [1.1, 2.2, 3.3, 4.4], + # 'H': pd.date_range('2016-01-01', freq='d', periods=4, + # tz='US/Central'), + 'I': [True, False, False, True], + }, + index=pd.Index(range(4), name='idx')) + + out = df.to_json(orient="table") + result = pd.read_json(out, orient="table") + tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']]) + def test_multiindex(self, index_names): + # GH 18912 + df = pd.DataFrame( + [["Arr", "alpha", [1, 2, 3, 4]], + ["Bee", "Beta", [10, 20, 30, 40]]], + index=[["A", "B"], ["Null", "Eins"]], + columns=["Aussprache", "Griechisch", "Args"] + ) + df.index.names = index_names + out = df.to_json(orient="table") + result = pd.read_json(out, orient="table") + 
tm.assert_frame_equal(df, result)