PERF: Slow performance of to_dict (#46487)

RogerThomas · Roger Thomas · RogerThomas · web-flow · commit 2683b71ae4bc · 2022-11-22T16:59:14.000-05:00
Co-authored-by: Roger Thomas &lt;roger.thomas@oliverwyman.com&gt;
Co-authored-by: RogerThomas &lt;roger.thomas@gmail.com&gt;
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -608,6 +608,7 @@ Performance improvements
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`)
 - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`)
 - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1811,6 +1811,28 @@ def to_numpy(
 
         return result
 
+    def _create_data_for_split_and_tight_to_dict(
+        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
+    ) -> list:
+        """
+        Simple helper method to create data for to ``to_dict(orient="split")`` and
+        ``to_dict(orient="tight")`` to create the main output data
+        """
+        if are_all_object_dtype_cols:
+            data = [
+                list(map(maybe_box_native, t))
+                for t in self.itertuples(index=False, name=None)
+            ]
+        else:
+            data = [list(t) for t in self.itertuples(index=False, name=None)]
+            if object_dtype_indices:
+                # If we have object_dtype_cols, apply maybe_box_naive after list
+                # comprehension for perf
+                for row in data:
+                    for i in object_dtype_indices:
+                        row[i] = maybe_box_native(row[i])
+        return data
+
     @overload
     def to_dict(
         self,
@@ -1950,30 +1972,50 @@ def to_dict(
                 "'index=False' is only valid when 'orient' is 'split' or 'tight'"
             )
 
+        if orient == "series":
+            # GH46470 Return quickly if orient series to avoid creating dtype objects
+            return into_c((k, v) for k, v in self.items())
+
+        object_dtype_indices = [
+            i
+            for i, col_dtype in enumerate(self.dtypes.values)
+            if is_object_dtype(col_dtype)
+        ]
+        are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes)
+
         if orient == "dict":
             return into_c((k, v.to_dict(into)) for k, v in self.items())
 
         elif orient == "list":
+            object_dtype_indices_as_set = set(object_dtype_indices)
             return into_c(
-                (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
+                (
+                    k,
+                    list(map(maybe_box_native, v.tolist()))
+                    if i in object_dtype_indices_as_set
+                    else v.tolist(),
+                )
+                for i, (k, v) in enumerate(self.items())
             )
 
         elif orient == "split":
+            data = self._create_data_for_split_and_tight_to_dict(
+                are_all_object_dtype_cols, object_dtype_indices
+            )
+
             return into_c(
                 ((("index", self.index.tolist()),) if index else ())
                 + (
                     ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
+                    ("data", data),
                 )
             )
 
         elif orient == "tight":
+            data = self._create_data_for_split_and_tight_to_dict(
+                are_all_object_dtype_cols, object_dtype_indices
+            )
+
             return into_c(
                 ((("index", self.index.tolist()),) if index else ())
                 + (
@@ -1990,26 +2032,65 @@ def to_dict(
                 + (("column_names", list(self.columns.names)),)
             )
 
-        elif orient == "series":
-            return into_c((k, v) for k, v in self.items())
-
         elif orient == "records":
             columns = self.columns.tolist()
-            rows = (
-                dict(zip(columns, row))
-                for row in self.itertuples(index=False, name=None)
-            )
-            return [
-                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
-            ]
+            if are_all_object_dtype_cols:
+                rows = (
+                    dict(zip(columns, row))
+                    for row in self.itertuples(index=False, name=None)
+                )
+                return [
+                    into_c((k, maybe_box_native(v)) for k, v in row.items())
+                    for row in rows
+                ]
+            else:
+                data = [
+                    into_c(zip(columns, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+                if object_dtype_indices:
+                    object_dtype_indices_as_set = set(object_dtype_indices)
+                    object_dtype_cols = {
+                        col
+                        for i, col in enumerate(self.columns)
+                        if i in object_dtype_indices_as_set
+                    }
+                    for row in data:
+                        for col in object_dtype_cols:
+                            row[col] = maybe_box_native(row[col])
+                return data
 
         elif orient == "index":
             if not self.index.is_unique:
                 raise ValueError("DataFrame index must be unique for orient='index'.")
-            return into_c(
-                (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
-                for t in self.itertuples(name=None)
-            )
+            columns = self.columns.tolist()
+            if are_all_object_dtype_cols:
+                return into_c(
+                    (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
+                    for t in self.itertuples(name=None)
+                )
+            elif object_dtype_indices:
+                object_dtype_indices_as_set = set(object_dtype_indices)
+                is_object_dtype_by_index = [
+                    i in object_dtype_indices_as_set for i in range(len(self.columns))
+                ]
+                return into_c(
+                    (
+                        t[0],
+                        {
+                            columns[i]: maybe_box_native(v)
+                            if is_object_dtype_by_index[i]
+                            else v
+                            for i, v in enumerate(t[1:])
+                        },
+                    )
+                    for t in self.itertuples(name=None)
+                )
+            else:
+                return into_c(
+                    (t[0], dict(zip(self.columns, t[1:])))
+                    for t in self.itertuples(name=None)
+                )
 
         else:
             raise ValueError(f"orient '{orient}' not understood")
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1812,7 +1812,13 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         """
         # GH16122
         into_c = com.standardize_mapping(into)
-        return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+        if is_object_dtype(self):
+            return into_c((k, maybe_box_native(v)) for k, v in self.items())
+        else:
+            # Not an object dtype => all types will be the same so let the default
+            # indexer return native python type
+            return into_c((k, v) for k, v in self.items())
 
     def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
         """
diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
@@ -379,6 +379,16 @@ def test_to_dict_orient_tight(self, index, columns):
                     "b": [float, float, float],
                 },
             ),
+            (  # Make sure we have one df which is all object type cols
+                {
+                    "a": [1, "hello", 3],
+                    "b": [1.1, "world", 3.3],
+                },
+                {
+                    "a": [int, str, int],
+                    "b": [float, str, float],
+                },
+            ),
         ),
     )
     def test_to_dict_returns_native_types(self, orient, data, expected_types):