Skip to content

ENH: Provide dict object for to_dict() #16122 #16220

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 16, 2017
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ New features

Other Enhancements
^^^^^^^^^^^^^^^^^^
- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`)
- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`)


Expand Down
38 changes: 38 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import warnings
from datetime import datetime, timedelta
from functools import partial
import inspect
import collections

import numpy as np
from pandas._libs import lib, tslib
Expand Down Expand Up @@ -479,6 +481,42 @@ def _dict_compat(d):
for key, value in iteritems(d))


def standardize_mapping(into):
    """
    Helper function to standardize a supplied mapping.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    into : instance or subclass of collections.Mapping
        Must be a class, an initialized collections.defaultdict,
        or an instance of a collections.Mapping subclass.

    Returns
    -------
    mapping : a collections.Mapping subclass or other constructor
        a callable object that can accept an iterator to create
        the desired Mapping.

    Raises
    ------
    TypeError
        If ``into`` (or its type, when an instance is given) is not a
        Mapping subclass, or if ``into`` is the ``collections.defaultdict``
        class itself rather than an initialized instance.

    See Also
    --------
    DataFrame.to_dict
    Series.to_dict
    """
    # The ABC aliases were removed from the top-level ``collections``
    # namespace in Python 3.10; resolve ``Mapping`` from ``collections.abc``
    # and fall back to the old location only on Python 2.
    try:
        from collections.abc import Mapping
    except ImportError:  # Python 2
        from collections import Mapping

    if not inspect.isclass(into):
        if isinstance(into, collections.defaultdict):
            # Keep the caller's default_factory: a bare ``defaultdict``
            # class cannot be built from an iterator of pairs alone.
            return partial(collections.defaultdict, into.default_factory)
        # Any other instance is only used to determine the mapping type.
        into = type(into)
    if not issubclass(into, Mapping):
        raise TypeError('unsupported type: {}'.format(into))
    elif into is collections.defaultdict:
        # The class alone carries no default_factory, so it is rejected.
        raise TypeError('to_dict() only accepts initialized defaultdicts')
    return into


def sentinel_factory():
class Sentinel(object):
pass
Expand Down
84 changes: 69 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@
_default_index,
_values_from_object,
_maybe_box_datetimelike,
_dict_compat)
_dict_compat,
standardize_mapping)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
Expand Down Expand Up @@ -860,7 +861,7 @@ def from_dict(cls, data, orient='columns', dtype=None):

return cls(data, index=index, columns=columns, dtype=dtype)

def to_dict(self, orient='dict'):
def to_dict(self, orient='dict', into=dict):
"""Convert DataFrame to dictionary.

Parameters
Expand All @@ -882,32 +883,85 @@ def to_dict(self, orient='dict'):
Abbreviations are allowed. `s` indicates `series` and `sp`
indicates `split`.

into : class, default dict
The collections.Mapping subclass used for all Mappings
in the return value. Can be the actual class or an empty
instance of the mapping type you want. If you want a
collections.defaultdict, you must pass it initialized.

.. versionadded:: 0.21.0

Returns
-------
result : dict like {column -> {index -> value}}
result : collections.Mapping like {column -> {index -> value}}

Examples
--------
>>> df = pd.DataFrame(
{'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['a', 'b'])
>>> df
col1 col2
a 1 0.50
b 2 0.75
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 0.1 and 0.2 are wrong, should be 0.5 and 0.75

>>> df.to_dict()
{'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}

You can specify the return orientation.

>>> df.to_dict('series')
{'col1': a 1
b 2
Name: col1, dtype: int64, 'col2': a 0.50
b 0.75
Name: col2, dtype: float64}
>>> df.to_dict('split')
{'columns': ['col1', 'col2'],
'data': [[1.0, 0.5], [2.0, 0.75]],
'index': ['a', 'b']}
>>> df.to_dict('records')
[{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]
>>> df.to_dict('index')
{'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}

You can also specify the mapping type.

>>> from collections import OrderedDict, defaultdict
>>> df.to_dict(into=OrderedDict)
OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])),
('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))])

If you want a `defaultdict`, you need to initialize it:

>>> dd = defaultdict(list)
>>> df.to_dict('records', into=dd)
[defaultdict(<type 'list'>, {'col2': 0.5, 'col1': 1.0}),
defaultdict(<type 'list'>, {'col2': 0.75, 'col1': 2.0})]
"""
if not self.columns.is_unique:
warnings.warn("DataFrame columns are not unique, some "
"columns will be omitted.", UserWarning)
# GH16122
into_c = standardize_mapping(into)
if orient.lower().startswith('d'):
return dict((k, v.to_dict()) for k, v in compat.iteritems(self))
return into_c(
(k, v.to_dict(into)) for k, v in compat.iteritems(self))
elif orient.lower().startswith('l'):
return dict((k, v.tolist()) for k, v in compat.iteritems(self))
return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
elif orient.lower().startswith('sp'):
return {'index': self.index.tolist(),
'columns': self.columns.tolist(),
'data': lib.map_infer(self.values.ravel(),
_maybe_box_datetimelike)
.reshape(self.values.shape).tolist()}
return into_c((('index', self.index.tolist()),
('columns', self.columns.tolist()),
('data', lib.map_infer(self.values.ravel(),
_maybe_box_datetimelike)
.reshape(self.values.shape).tolist())))
elif orient.lower().startswith('s'):
return dict((k, _maybe_box_datetimelike(v))
for k, v in compat.iteritems(self))
return into_c((k, _maybe_box_datetimelike(v))
for k, v in compat.iteritems(self))
elif orient.lower().startswith('r'):
return [dict((k, _maybe_box_datetimelike(v))
for k, v in zip(self.columns, row))
return [into_c((k, _maybe_box_datetimelike(v))
for k, v in zip(self.columns, row))
for row in self.values]
elif orient.lower().startswith('i'):
return dict((k, v.to_dict()) for k, v in self.iterrows())
return into_c((k, v.to_dict(into)) for k, v in self.iterrows())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct to use into here? What if the user passed an instance rather than a class? Wouldn't the values all be written into the same object?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is correct. v.to_dict(into) should call standardize_mapping again. Since standardize_mapping only returns a class, I don't think there is a danger of populating the same object twice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. I forgot that standardize_mapping always returned a class

else:
raise ValueError("orient '%s' not understood" % orient)

Expand Down
37 changes: 31 additions & 6 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
_maybe_match_name,
SettingWithCopyError,
_maybe_box_datetimelike,
_dict_compat)
_dict_compat,
standardize_mapping)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
Float64Index, _ensure_index)
from pandas.core.indexing import check_bool_indexer, maybe_convert_indices
Expand Down Expand Up @@ -1074,15 +1075,39 @@ def tolist(self):
""" Convert Series to a nested list """
return list(self.asobject)

def to_dict(self):
def to_dict(self, into=dict):
"""
Convert Series to {label -> value} dict
Convert Series to {label -> value} dict or dict-like object.

Parameters
----------
into : class, default dict
The collections.Mapping subclass to use as the return
object. Can be the actual class or an empty
instance of the mapping type you want. If you want a
collections.defaultdict, you must pass it initialized.

.. versionadded:: 0.21.0

Returns
-------
value_dict : dict
"""
return dict(compat.iteritems(self))
value_dict : collections.Mapping

Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
>>> s.to_dict()
{0: 1, 1: 2, 2: 3, 3: 4}
>>> from collections import OrderedDict, defaultdict
>>> s.to_dict(OrderedDict)
OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
>>> dd = defaultdict(list)
>>> s.to_dict(dd)
defaultdict(<type 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
"""
# GH16122
into_c = standardize_mapping(into)
return into_c(compat.iteritems(self))

def to_frame(self, name=None):
"""
Expand Down
133 changes: 79 additions & 54 deletions pandas/tests/frame/test_convert_to.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

import pytest
import collections
import numpy as np

from pandas import compat
Expand All @@ -13,50 +14,6 @@

class TestDataFrameConvertTo(TestData):

def test_to_dict(self):
test_data = {
'A': {'1': 1, '2': 2},
'B': {'1': '1', '2': '2', '3': '3'},
}
recons_data = DataFrame(test_data).to_dict()

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert v2 == recons_data[k][k2]

recons_data = DataFrame(test_data).to_dict("l")

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert v2 == recons_data[k][int(k2) - 1]

recons_data = DataFrame(test_data).to_dict("s")

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert v2 == recons_data[k][k2]

recons_data = DataFrame(test_data).to_dict("sp")
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
tm.assert_dict_equal(recons_data, expected_split)

recons_data = DataFrame(test_data).to_dict("r")
expected_records = [{'A': 1.0, 'B': '1'},
{'A': 2.0, 'B': '2'},
{'A': np.nan, 'B': '3'}]
assert isinstance(recons_data, list)
assert len(recons_data) == 3
for l, r in zip(recons_data, expected_records):
tm.assert_dict_equal(l, r)

# GH10844
recons_data = DataFrame(test_data).to_dict("i")

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert v2 == recons_data[k2][k]

def test_to_dict_timestamp(self):

# GH11247
Expand Down Expand Up @@ -190,17 +147,85 @@ def test_to_records_with_unicode_column_names(self):
)
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
collections.OrderedDict])
def test_to_dict(self, mapping):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be tested elsewhere, but can you add a test with a dataframe that has duplicate columns? Make sure to catch the warning.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done - I added one at the end. Let me know if that is what you were getting at.

test_data = {
'A': {'1': 1, '2': 2},
'B': {'1': '1', '2': '2', '3': '3'},
}

# GH16122
recons_data = DataFrame(test_data).to_dict(into=mapping)

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])

recons_data = DataFrame(test_data).to_dict("l", mapping)

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][int(k2) - 1])

recons_data = DataFrame(test_data).to_dict("s", mapping)

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])

recons_data = DataFrame(test_data).to_dict("sp", mapping)
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
tm.assert_dict_equal(recons_data, expected_split)

recons_data = DataFrame(test_data).to_dict("r", mapping)
expected_records = [{'A': 1.0, 'B': '1'},
{'A': 2.0, 'B': '2'},
{'A': np.nan, 'B': '3'}]
assert isinstance(recons_data, list)
assert (len(recons_data) == 3)
for l, r in zip(recons_data, expected_records):
tm.assert_dict_equal(l, r)

# GH10844
recons_data = DataFrame(test_data).to_dict("i")

for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])

df = DataFrame(test_data)
df['duped'] = df[df.columns[0]]
recons_data = df.to_dict("i")
comp_data = test_data.copy()
comp_data['duped'] = comp_data[df.columns[0]]
for k, v in compat.iteritems(comp_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also add a test that hits some of the errors people might encounter (you do check these in the testing of standardize_mapping), but this is an integration test. you can put a test right after this, maybe test_to_dict_errors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this test - made sure the TypeErrors were caught. Let me know if you think anything else should go in there.


@pytest.mark.parametrize('mapping', [
list,
collections.defaultdict,
[]])
def test_to_dict_errors(self, mapping):
# GH16122: each parametrized `into` value -- a non-Mapping class (list),
# the uninitialized defaultdict class, and a non-Mapping instance ([]) --
# is expected to make DataFrame.to_dict raise TypeError.
df = DataFrame(np.random.randn(3, 3))
with pytest.raises(TypeError):
df.to_dict(into=mapping)

@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
def test_to_records_datetimeindex_with_tz(tz):
# GH13937
dr = date_range('2016-01-01', periods=10,
freq='S', tz=tz)
@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
def test_to_records_datetimeindex_with_tz(self, tz):
# GH13937
dr = date_range('2016-01-01', periods=10,
freq='S', tz=tz)

df = DataFrame({'datetime': dr}, index=dr)
df = DataFrame({'datetime': dr}, index=dr)

expected = df.to_records()
result = df.tz_convert("UTC").to_records()
expected = df.to_records()
result = df.tz_convert("UTC").to_records()

# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)
# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)
Loading