Skip to content

ENH: Add orient=tight format for dictionaries #35292

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 16, 2021
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,28 @@ Previously, negative arguments returned empty frames.
df.groupby("A").nth(slice(1, -1))
df.groupby("A").nth([slice(None, 1), slice(-1, None)])

.. _whatsnew_140.dict_tight:

DataFrame.from_dict and DataFrame.to_dict have new ``'tight'`` option
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A new ``'tight'`` dictionary format that preserves :class:`MultiIndex` entries and names
is now available with the :meth:`DataFrame.from_dict` and :meth:`DataFrame.to_dict` methods
and can be used with the standard ``json`` library to produce a tight
representation of :class:`DataFrame` objects (:issue:`4889`).

.. ipython:: python

df = pd.DataFrame.from_records(
[[1, 3], [2, 4]],
index=pd.MultiIndex.from_tuples([("a", "b"), ("a", "c")],
names=["n1", "n2"]),
columns=pd.MultiIndex.from_tuples([("x", 1), ("y", 2)],
names=["z1", "z2"]),
)
df
df.to_dict(orient='tight')

.. _whatsnew_140.enhancements.other:

Other enhancements
Expand Down
74 changes: 69 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1520,15 +1520,21 @@ def from_dict(
----------
data : dict
Of the form {field : array-like} or {field : dict}.
orient : {'columns', 'index'}, default 'columns'
orient : {'columns', 'index', 'tight'}, default 'columns'
The "orientation" of the data. If the keys of the passed dict
should be the columns of the resulting DataFrame, pass 'columns'
(default). Otherwise if the keys should be rows, pass 'index'.
If 'tight', assume a dict with keys ['index', 'columns', 'data',
'index_names', 'column_names'].

.. versionadded:: 1.4.0
'tight' as an allowed value for the ``orient`` argument

dtype : dtype, default None
Data type to force, otherwise infer.
columns : list, default None
Column labels to use when ``orient='index'``. Raises a ValueError
if used with ``orient='columns'``.
if used with ``orient='columns'`` or ``orient='tight'``.

Returns
-------
Expand All @@ -1539,6 +1545,7 @@ def from_dict(
DataFrame.from_records : DataFrame from structured ndarray, sequence
of tuples or dicts, or DataFrame.
DataFrame : DataFrame object creation using constructor.
DataFrame.to_dict : Convert the DataFrame to a dictionary.

Examples
--------
Expand Down Expand Up @@ -1569,6 +1576,21 @@ def from_dict(
A B C D
row_1 3 2 1 0
row_2 a b c d

Specify ``orient='tight'`` to create the DataFrame using a 'tight'
format:

>>> data = {'index': [('a', 'b'), ('a', 'c')],
... 'columns': [('x', 1), ('y', 2)],
... 'data': [[1, 3], [2, 4]],
... 'index_names': ['n1', 'n2'],
... 'column_names': ['z1', 'z2']}
>>> pd.DataFrame.from_dict(data, orient='tight')
z1 x y
z2 1 2
n1 n2
a b 1 3
c 2 4
"""
index = None
orient = orient.lower()
Expand All @@ -1579,13 +1601,28 @@ def from_dict(
data = _from_nested_dict(data)
else:
data, index = list(data.values()), list(data.keys())
elif orient == "columns":
elif orient == "columns" or orient == "tight":
if columns is not None:
raise ValueError("cannot use columns parameter with orient='columns'")
raise ValueError(f"cannot use columns parameter with orient='{orient}'")
else: # pragma: no cover
raise ValueError("only recognize index or columns for orient")

return cls(data, index=index, columns=columns, dtype=dtype)
if orient != "tight":
return cls(data, index=index, columns=columns, dtype=dtype)
else:
realdata = data["data"]

def create_index(indexlist, namelist):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ensure_index does this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Dr-Irv can you review this comment here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback ensure_index doesn't handle the names of the indexes (or columns), so you'd still have logic to handle that part anyway.

Copy link
Contributor

@jreback jreback Oct 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this work
e.g. just use Index

In [98]: pd.Index([(1, 0), (1, 1)], names=['foo', 'bar'])
Out[98]: 
MultiIndex([(1, 0),
            (1, 1)],
           names=['foo', 'bar'])

In [99]: pd.Index([1, 2], names=['foo'])
Out[99]: Int64Index([1, 2], dtype='int64')

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, because note that the argument to set the names of a MultiIndex is names, while for an Index it is name. In your example in cell 99, you lost the name of the index.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

grr this is a bug. Actually, is this still true on master? (This was a pretty old version I am using.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still an issue. Using pd.Index, with names, you get a FutureWarning. With name, you get a stack trace.

In [1]: import pandas as pd

In [2]: pd.__version__
Out[2]: '1.4.0.dev0+905.gdace93d694'

In [3]: pd.Index([(1,0), (1,1)], names=["foo", "bar"])
<ipython-input-3-99fe0b033485>:1: FutureWarning: Passing keywords other than 'data', 'dtype', 'copy', 'name', 'tupleize_cols' is deprecated and will raise TypeError in a future version.  Use the specific Index subclass directly instead.
  pd.Index([(1,0), (1,1)], names=["foo", "bar"])
Out[3]:
MultiIndex([(1, 0),
            (1, 1)],
           names=['foo', 'bar'])

In [4]: pd.Index([(1,0), (1,1)], name=["foo", "bar"])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-138c3fe61553> in <module>
----> 1 pd.Index([(1,0), (1,1)], name=["foo", "bar"])

c:\Code\pandas_dev\pandas\pandas\core\indexes\base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
    403         from pandas.core.indexes.range import RangeIndex
    404
--> 405         name = maybe_extract_name(name, data, cls)
    406
    407         if dtype is not None:

c:\Code\pandas_dev\pandas\pandas\core\indexes\base.py in maybe_extract_name(name, obj, cls)
   6876     # GH#29069
   6877     if not is_hashable(name):
-> 6878         raise TypeError(f"{cls.__name__}.name must be a hashable type")
   6879
   6880     return name

TypeError: Index.name must be a hashable type

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvm we removed names=. i still find this a bit of a footgun.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, I think we need to handle this a bit better, if you wouldn't mind opening an issue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Seems like we ought to have another issue with respect to pd.Index([(1,0), (1,1)], name=["foo", "bar"]) not returning a MultiIndex ??

index: Index
if len(namelist) > 1:
index = MultiIndex.from_tuples(indexlist, names=namelist)
else:
index = Index(indexlist, name=namelist[0])
return index

index = create_index(data["index"], data["index_names"])
columns = create_index(data["columns"], data["column_names"])
return cls(realdata, index=index, columns=columns, dtype=dtype)

def to_numpy(
self,
Expand Down Expand Up @@ -1675,13 +1712,19 @@ def to_dict(self, orient: str = "dict", into=dict):
- 'series' : dict like {column -> Series(values)}
- 'split' : dict like
{'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- 'tight' : dict like
{'index' -> [index], 'columns' -> [columns], 'data' -> [values],
'index_names' -> [index.names], 'column_names' -> [column.names]}
- 'records' : list like
[{column -> value}, ... , {column -> value}]
- 'index' : dict like {index -> {column -> value}}

Abbreviations are allowed. `s` indicates `series` and `sp`
indicates `split`.

.. versionadded:: 1.4.0
'tight' as an allowed value for the ``orient`` argument

into : class, default dict
The collections.abc.Mapping subclass used for all Mappings
in the return value. Can be the actual class or an empty
Expand Down Expand Up @@ -1731,6 +1774,10 @@ def to_dict(self, orient: str = "dict", into=dict):
>>> df.to_dict('index')
{'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

>>> df.to_dict('tight')
{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}

You can also specify the mapping type.

>>> from collections import OrderedDict, defaultdict
Expand Down Expand Up @@ -1807,6 +1854,23 @@ def to_dict(self, orient: str = "dict", into=dict):
)
)

elif orient == "tight":
return into_c(
(
("index", self.index.tolist()),
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
("index_names", list(self.index.names)),
("column_names", list(self.columns.names)),
)
)

elif orient == "series":
return into_c((k, v) for k, v in self.items())

Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/frame/methods/test_to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
)
Expand Down Expand Up @@ -312,3 +314,33 @@ def test_to_dict_mixed_numeric_frame(self):
result = df.reset_index().to_dict("records")
expected = [{"index": 0, "a": 1.0, "b": 9.0}]
assert result == expected

@pytest.mark.parametrize(
"index",
[
None,
Index(["aa", "bb"]),
Index(["aa", "bb"], name="cc"),
MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
],
)
@pytest.mark.parametrize(
"columns",
[
["x", "y"],
Index(["x", "y"]),
Index(["x", "y"], name="z"),
MultiIndex.from_tuples([("x", 1), ("y", 2)]),
MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
],
)
def test_to_dict_orient_tight(self, index, columns):
df = DataFrame.from_records(
[[1, 3], [2, 4]],
columns=columns,
index=index,
)
roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")

tm.assert_frame_equal(df, roundtrip)