Skip to content

Commit 2683b71

Browse files
RogerThomasRoger ThomasRogerThomas
authored
PERF: Slow performance of to_dict (#46487)
Co-authored-by: Roger Thomas <[email protected]> Co-authored-by: RogerThomas <[email protected]>
1 parent 5d8d323 commit 2683b71

File tree

4 files changed

+121
-23
lines changed

4 files changed

+121
-23
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,7 @@ Performance improvements
608608
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`)
609609
- Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`)
610610
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
611+
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
611612

612613
.. ---------------------------------------------------------------------------
613614
.. _whatsnew_200.bug_fixes:

pandas/core/frame.py

+103-22
Original file line numberDiff line numberDiff line change
@@ -1811,6 +1811,28 @@ def to_numpy(
18111811

18121812
return result
18131813

1814+
def _create_data_for_split_and_tight_to_dict(
1815+
self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
1816+
) -> list:
1817+
"""
1818+
Simple helper method to create the main output data for
1819+
``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
1820+
"""
1821+
if are_all_object_dtype_cols:
1822+
data = [
1823+
list(map(maybe_box_native, t))
1824+
for t in self.itertuples(index=False, name=None)
1825+
]
1826+
else:
1827+
data = [list(t) for t in self.itertuples(index=False, name=None)]
1828+
if object_dtype_indices:
1829+
# If we have object_dtype_cols, apply maybe_box_native after list
1830+
# comprehension for perf
1831+
for row in data:
1832+
for i in object_dtype_indices:
1833+
row[i] = maybe_box_native(row[i])
1834+
return data
1835+
18141836
@overload
18151837
def to_dict(
18161838
self,
@@ -1950,30 +1972,50 @@ def to_dict(
19501972
"'index=False' is only valid when 'orient' is 'split' or 'tight'"
19511973
)
19521974

1975+
if orient == "series":
1976+
# GH46470 Return quickly if orient series to avoid creating dtype objects
1977+
return into_c((k, v) for k, v in self.items())
1978+
1979+
object_dtype_indices = [
1980+
i
1981+
for i, col_dtype in enumerate(self.dtypes.values)
1982+
if is_object_dtype(col_dtype)
1983+
]
1984+
are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes)
1985+
19531986
if orient == "dict":
19541987
return into_c((k, v.to_dict(into)) for k, v in self.items())
19551988

19561989
elif orient == "list":
1990+
object_dtype_indices_as_set = set(object_dtype_indices)
19571991
return into_c(
1958-
(k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
1992+
(
1993+
k,
1994+
list(map(maybe_box_native, v.tolist()))
1995+
if i in object_dtype_indices_as_set
1996+
else v.tolist(),
1997+
)
1998+
for i, (k, v) in enumerate(self.items())
19591999
)
19602000

19612001
elif orient == "split":
2002+
data = self._create_data_for_split_and_tight_to_dict(
2003+
are_all_object_dtype_cols, object_dtype_indices
2004+
)
2005+
19622006
return into_c(
19632007
((("index", self.index.tolist()),) if index else ())
19642008
+ (
19652009
("columns", self.columns.tolist()),
1966-
(
1967-
"data",
1968-
[
1969-
list(map(maybe_box_native, t))
1970-
for t in self.itertuples(index=False, name=None)
1971-
],
1972-
),
2010+
("data", data),
19732011
)
19742012
)
19752013

19762014
elif orient == "tight":
2015+
data = self._create_data_for_split_and_tight_to_dict(
2016+
are_all_object_dtype_cols, object_dtype_indices
2017+
)
2018+
19772019
return into_c(
19782020
((("index", self.index.tolist()),) if index else ())
19792021
+ (
@@ -1990,26 +2032,65 @@ def to_dict(
19902032
+ (("column_names", list(self.columns.names)),)
19912033
)
19922034

1993-
elif orient == "series":
1994-
return into_c((k, v) for k, v in self.items())
1995-
19962035
elif orient == "records":
19972036
columns = self.columns.tolist()
1998-
rows = (
1999-
dict(zip(columns, row))
2000-
for row in self.itertuples(index=False, name=None)
2001-
)
2002-
return [
2003-
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
2004-
]
2037+
if are_all_object_dtype_cols:
2038+
rows = (
2039+
dict(zip(columns, row))
2040+
for row in self.itertuples(index=False, name=None)
2041+
)
2042+
return [
2043+
into_c((k, maybe_box_native(v)) for k, v in row.items())
2044+
for row in rows
2045+
]
2046+
else:
2047+
data = [
2048+
into_c(zip(columns, t))
2049+
for t in self.itertuples(index=False, name=None)
2050+
]
2051+
if object_dtype_indices:
2052+
object_dtype_indices_as_set = set(object_dtype_indices)
2053+
object_dtype_cols = {
2054+
col
2055+
for i, col in enumerate(self.columns)
2056+
if i in object_dtype_indices_as_set
2057+
}
2058+
for row in data:
2059+
for col in object_dtype_cols:
2060+
row[col] = maybe_box_native(row[col])
2061+
return data
20052062

20062063
elif orient == "index":
20072064
if not self.index.is_unique:
20082065
raise ValueError("DataFrame index must be unique for orient='index'.")
2009-
return into_c(
2010-
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
2011-
for t in self.itertuples(name=None)
2012-
)
2066+
columns = self.columns.tolist()
2067+
if are_all_object_dtype_cols:
2068+
return into_c(
2069+
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
2070+
for t in self.itertuples(name=None)
2071+
)
2072+
elif object_dtype_indices:
2073+
object_dtype_indices_as_set = set(object_dtype_indices)
2074+
is_object_dtype_by_index = [
2075+
i in object_dtype_indices_as_set for i in range(len(self.columns))
2076+
]
2077+
return into_c(
2078+
(
2079+
t[0],
2080+
{
2081+
columns[i]: maybe_box_native(v)
2082+
if is_object_dtype_by_index[i]
2083+
else v
2084+
for i, v in enumerate(t[1:])
2085+
},
2086+
)
2087+
for t in self.itertuples(name=None)
2088+
)
2089+
else:
2090+
return into_c(
2091+
(t[0], dict(zip(self.columns, t[1:])))
2092+
for t in self.itertuples(name=None)
2093+
)
20132094

20142095
else:
20152096
raise ValueError(f"orient '{orient}' not understood")

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1812,7 +1812,13 @@ def to_dict(self, into: type[dict] = dict) -> dict:
18121812
"""
18131813
# GH16122
18141814
into_c = com.standardize_mapping(into)
1815-
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1815+
1816+
if is_object_dtype(self):
1817+
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1818+
else:
1819+
# Not an object dtype => all types will be the same so let the default
1820+
# indexer return native python type
1821+
return into_c((k, v) for k, v in self.items())
18161822

18171823
def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
18181824
"""

pandas/tests/frame/methods/test_to_dict.py

+10
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,16 @@ def test_to_dict_orient_tight(self, index, columns):
379379
"b": [float, float, float],
380380
},
381381
),
382+
( # Make sure we have one df which is all object type cols
383+
{
384+
"a": [1, "hello", 3],
385+
"b": [1.1, "world", 3.3],
386+
},
387+
{
388+
"a": [int, str, int],
389+
"b": [float, str, float],
390+
},
391+
),
382392
),
383393
)
384394
def test_to_dict_returns_native_types(self, orient, data, expected_types):

0 commit comments

Comments
 (0)