ENH: Provide dict object for to_dict() #16122 (#16220)

dwkenefick · TomAugspurger · commit f040ed227883 · 2017-05-16T11:06:18.000-05:00
* ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122 * ENH: Provide dict object for to_dict() #16122
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -26,6 +26,7 @@ New features
 
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^
+- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned.  The default is ``dict``, which is backwards compatible. (:issue:`16122`)
 - ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`)
 
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -6,6 +6,8 @@
 import warnings
 from datetime import datetime, timedelta
 from functools import partial
+import inspect
+import collections
 
 import numpy as np
 from pandas._libs import lib, tslib
@@ -479,6 +481,42 @@ def _dict_compat(d):
                 for key, value in iteritems(d))
 
 
+def standardize_mapping(into):
+    """
+    Helper function to standardize a supplied mapping.
+
+    .. versionadded:: 0.21.0
+
+    Parameters
+    ----------
+    into : instance or subclass of collections.Mapping
+        Must be a class, an initialized collections.defaultdict,
+        or an instance of a collections.Mapping subclass.
+
+    Returns
+    -------
+    mapping : a collections.Mapping subclass or other constructor
+        a callable object that can accept an iterator to create
+        the desired Mapping.
+
+    See Also
+    --------
+    DataFrame.to_dict
+    Series.to_dict
+    """
+    if not inspect.isclass(into):
+        if isinstance(into, collections.defaultdict):
+            return partial(
+                collections.defaultdict, into.default_factory)
+        into = type(into)
+    if not issubclass(into, collections.Mapping):
+        raise TypeError('unsupported type: {}'.format(into))
+    elif into == collections.defaultdict:
+        raise TypeError(
+            'to_dict() only accepts initialized defaultdicts')
+    return into
+
+
 def sentinel_factory():
     class Sentinel(object):
         pass
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -63,7 +63,8 @@
                                 _default_index,
                                 _values_from_object,
                                 _maybe_box_datetimelike,
-                                _dict_compat)
+                                _dict_compat,
+                                standardize_mapping)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
@@ -860,7 +861,7 @@ def from_dict(cls, data, orient='columns', dtype=None):
 
         return cls(data, index=index, columns=columns, dtype=dtype)
 
-    def to_dict(self, orient='dict'):
+    def to_dict(self, orient='dict', into=dict):
         """Convert DataFrame to dictionary.
 
         Parameters
@@ -882,32 +883,85 @@ def to_dict(self, orient='dict'):
             Abbreviations are allowed. `s` indicates `series` and `sp`
             indicates `split`.
 
+        into : class, default dict
+            The collections.Mapping subclass used for all Mappings
+            in the return value.  Can be the actual class or an empty
+            instance of the mapping type you want.  If you want a
+            collections.defaultdict, you must pass it initialized.
+
+            .. versionadded:: 0.21.0
+
         Returns
         -------
-        result : dict like {column -> {index -> value}}
+        result : collections.Mapping like {column -> {index -> value}}
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+                {'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['a', 'b'])
+        >>> df
+           col1  col2
+        a     1   0.1
+        b     2   0.2
+        >>> df.to_dict()
+        {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}
+
+        You can specify the return orientation.
+
+        >>> df.to_dict('series')
+        {'col1': a    1
+        b    2
+        Name: col1, dtype: int64, 'col2': a    0.50
+        b    0.75
+        Name: col2, dtype: float64}
+        >>> df.to_dict('split')
+        {'columns': ['col1', 'col2'],
+        'data': [[1.0, 0.5], [2.0, 0.75]],
+        'index': ['a', 'b']}
+        >>> df.to_dict('records')
+        [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]
+        >>> df.to_dict('index')
+        {'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}
+
+        You can also specify the mapping type.
+
+        >>> from collections import OrderedDict, defaultdict
+        >>> df.to_dict(into=OrderedDict)
+        OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])),
+                   ('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))])
+
+        If you want a `defaultdict`, you need to initialize it:
+
+        >>> dd = defaultdict(list)
+        >>> df.to_dict('records', into=dd)
+        [defaultdict(<type 'list'>, {'col2': 0.5, 'col1': 1.0}),
+        defaultdict(<type 'list'>, {'col2': 0.75, 'col1': 2.0})]
         """
         if not self.columns.is_unique:
             warnings.warn("DataFrame columns are not unique, some "
                           "columns will be omitted.", UserWarning)
+        # GH16122
+        into_c = standardize_mapping(into)
         if orient.lower().startswith('d'):
-            return dict((k, v.to_dict()) for k, v in compat.iteritems(self))
+            return into_c(
+                (k, v.to_dict(into)) for k, v in compat.iteritems(self))
         elif orient.lower().startswith('l'):
-            return dict((k, v.tolist()) for k, v in compat.iteritems(self))
+            return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
         elif orient.lower().startswith('sp'):
-            return {'index': self.index.tolist(),
-                    'columns': self.columns.tolist(),
-                    'data': lib.map_infer(self.values.ravel(),
-                                          _maybe_box_datetimelike)
-                    .reshape(self.values.shape).tolist()}
+            return into_c((('index', self.index.tolist()),
+                           ('columns', self.columns.tolist()),
+                           ('data', lib.map_infer(self.values.ravel(),
+                                                  _maybe_box_datetimelike)
+                            .reshape(self.values.shape).tolist())))
         elif orient.lower().startswith('s'):
-            return dict((k, _maybe_box_datetimelike(v))
-                        for k, v in compat.iteritems(self))
+            return into_c((k, _maybe_box_datetimelike(v))
+                          for k, v in compat.iteritems(self))
         elif orient.lower().startswith('r'):
-            return [dict((k, _maybe_box_datetimelike(v))
-                         for k, v in zip(self.columns, row))
+            return [into_c((k, _maybe_box_datetimelike(v))
+                           for k, v in zip(self.columns, row))
                     for row in self.values]
         elif orient.lower().startswith('i'):
-            return dict((k, v.to_dict()) for k, v in self.iterrows())
+            return into_c((k, v.to_dict(into)) for k, v in self.iterrows())
         else:
             raise ValueError("orient '%s' not understood" % orient)
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -46,7 +46,8 @@
                                 _maybe_match_name,
                                 SettingWithCopyError,
                                 _maybe_box_datetimelike,
-                                _dict_compat)
+                                _dict_compat,
+                                standardize_mapping)
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
                                Float64Index, _ensure_index)
 from pandas.core.indexing import check_bool_indexer, maybe_convert_indices
@@ -1074,15 +1075,39 @@ def tolist(self):
         """ Convert Series to a nested list """
         return list(self.asobject)
 
-    def to_dict(self):
+    def to_dict(self, into=dict):
         """
-        Convert Series to {label -> value} dict
+        Convert Series to {label -> value} dict or dict-like object.
+
+        Parameters
+        ----------
+        into : class, default dict
+            The collections.Mapping subclass to use as the return
+            object. Can be the actual class or an empty
+            instance of the mapping type you want.  If you want a
+            collections.defaultdict, you must pass it initialized.
+
+            .. versionadded:: 0.21.0
 
         Returns
         -------
-        value_dict : dict
-        """
-        return dict(compat.iteritems(self))
+        value_dict : collections.Mapping
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s.to_dict()
+        {0: 1, 1: 2, 2: 3, 3: 4}
+        >>> from collections import OrderedDict, defaultdict
+        >>> s.to_dict(OrderedDict)
+        OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+        >>> dd = defaultdict(list)
+        >>> s.to_dict(dd)
+        defaultdict(<type 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+        """
+        # GH16122
+        into_c = standardize_mapping(into)
+        return into_c(compat.iteritems(self))
 
     def to_frame(self, name=None):
         """
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import pytest
+import collections
 import numpy as np
 
 from pandas import compat
@@ -13,50 +14,6 @@
 
 class TestDataFrameConvertTo(TestData):
 
-    def test_to_dict(self):
-        test_data = {
-            'A': {'1': 1, '2': 2},
-            'B': {'1': '1', '2': '2', '3': '3'},
-        }
-        recons_data = DataFrame(test_data).to_dict()
-
-        for k, v in compat.iteritems(test_data):
-            for k2, v2 in compat.iteritems(v):
-                assert v2 == recons_data[k][k2]
-
-        recons_data = DataFrame(test_data).to_dict("l")
-
-        for k, v in compat.iteritems(test_data):
-            for k2, v2 in compat.iteritems(v):
-                assert v2 == recons_data[k][int(k2) - 1]
-
-        recons_data = DataFrame(test_data).to_dict("s")
-
-        for k, v in compat.iteritems(test_data):
-            for k2, v2 in compat.iteritems(v):
-                assert v2 == recons_data[k][k2]
-
-        recons_data = DataFrame(test_data).to_dict("sp")
-        expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
-                          'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
-        tm.assert_dict_equal(recons_data, expected_split)
-
-        recons_data = DataFrame(test_data).to_dict("r")
-        expected_records = [{'A': 1.0, 'B': '1'},
-                            {'A': 2.0, 'B': '2'},
-                            {'A': np.nan, 'B': '3'}]
-        assert isinstance(recons_data, list)
-        assert len(recons_data) == 3
-        for l, r in zip(recons_data, expected_records):
-            tm.assert_dict_equal(l, r)
-
-        # GH10844
-        recons_data = DataFrame(test_data).to_dict("i")
-
-        for k, v in compat.iteritems(test_data):
-            for k2, v2 in compat.iteritems(v):
-                assert v2 == recons_data[k2][k]
-
     def test_to_dict_timestamp(self):
 
         # GH11247
@@ -190,17 +147,85 @@ def test_to_records_with_unicode_column_names(self):
         )
         tm.assert_almost_equal(result, expected)
 
+    @pytest.mark.parametrize('mapping', [
+        dict,
+        collections.defaultdict(list),
+        collections.OrderedDict])
+    def test_to_dict(self, mapping):
+        test_data = {
+            'A': {'1': 1, '2': 2},
+            'B': {'1': '1', '2': '2', '3': '3'},
+        }
+
+        # GH16122
+        recons_data = DataFrame(test_data).to_dict(into=mapping)
+
+        for k, v in compat.iteritems(test_data):
+            for k2, v2 in compat.iteritems(v):
+                assert (v2 == recons_data[k][k2])
+
+        recons_data = DataFrame(test_data).to_dict("l", mapping)
+
+        for k, v in compat.iteritems(test_data):
+            for k2, v2 in compat.iteritems(v):
+                assert (v2 == recons_data[k][int(k2) - 1])
+
+        recons_data = DataFrame(test_data).to_dict("s", mapping)
+
+        for k, v in compat.iteritems(test_data):
+            for k2, v2 in compat.iteritems(v):
+                assert (v2 == recons_data[k][k2])
+
+        recons_data = DataFrame(test_data).to_dict("sp", mapping)
+        expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
+                          'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
+        tm.assert_dict_equal(recons_data, expected_split)
+
+        recons_data = DataFrame(test_data).to_dict("r", mapping)
+        expected_records = [{'A': 1.0, 'B': '1'},
+                            {'A': 2.0, 'B': '2'},
+                            {'A': np.nan, 'B': '3'}]
+        assert isinstance(recons_data, list)
+        assert (len(recons_data) == 3)
+        for l, r in zip(recons_data, expected_records):
+            tm.assert_dict_equal(l, r)
+
+        # GH10844
+        recons_data = DataFrame(test_data).to_dict("i")
+
+        for k, v in compat.iteritems(test_data):
+            for k2, v2 in compat.iteritems(v):
+                assert (v2 == recons_data[k2][k])
+
+        df = DataFrame(test_data)
+        df['duped'] = df[df.columns[0]]
+        recons_data = df.to_dict("i")
+        comp_data = test_data.copy()
+        comp_data['duped'] = comp_data[df.columns[0]]
+        for k, v in compat.iteritems(comp_data):
+            for k2, v2 in compat.iteritems(v):
+                assert (v2 == recons_data[k2][k])
+
+    @pytest.mark.parametrize('mapping', [
+        list,
+        collections.defaultdict,
+        []])
+    def test_to_dict_errors(self, mapping):
+        # GH16122
+        df = DataFrame(np.random.randn(3, 3))
+        with pytest.raises(TypeError):
+            df.to_dict(into=mapping)
 
-@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
-def test_to_records_datetimeindex_with_tz(tz):
-    # GH13937
-    dr = date_range('2016-01-01', periods=10,
-                    freq='S', tz=tz)
+    @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
+    def test_to_records_datetimeindex_with_tz(self, tz):
+        # GH13937
+        dr = date_range('2016-01-01', periods=10,
+                        freq='S', tz=tz)
 
-    df = DataFrame({'datetime': dr}, index=dr)
+        df = DataFrame({'datetime': dr}, index=dr)
 
-    expected = df.to_records()
-    result = df.tz_convert("UTC").to_records()
+        expected = df.to_records()
+        result = df.tz_convert("UTC").to_records()
 
-    # both converted to UTC, so they are equal
-    tm.assert_numpy_array_equal(result, expected)
+        # both converted to UTC, so they are equal
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py