Skip to content

Commit c811353

Browse files
authored
CLN: to_dict (#57159)
* Make to_dict lazier * Remove some extra looping and indexing * Add erroneous ignore
1 parent b41ea09 commit c811353

File tree

2 files changed

+45
-51
lines changed

2 files changed

+45
-51
lines changed

pandas/core/frame.py

-23
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@
8989
find_common_type,
9090
infer_dtype_from_scalar,
9191
invalidate_string_dtypes,
92-
maybe_box_native,
9392
maybe_downcast_to_dtype,
9493
)
9594
from pandas.core.dtypes.common import (
@@ -1983,28 +1982,6 @@ def to_numpy(
19831982

19841983
return result
19851984

1986-
def _create_data_for_split_and_tight_to_dict(
1987-
self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
1988-
) -> list:
1989-
"""
1990-
Simple helper method to create data for ``to_dict(orient="split")`` and
1991-
``to_dict(orient="tight")`` to create the main output data
1992-
"""
1993-
if are_all_object_dtype_cols:
1994-
data = [
1995-
list(map(maybe_box_native, t))
1996-
for t in self.itertuples(index=False, name=None)
1997-
]
1998-
else:
1999-
data = [list(t) for t in self.itertuples(index=False, name=None)]
2000-
if object_dtype_indices:
2001-
# If we have object_dtype_cols, apply maybe_box_native after list
2002-
# comprehension for perf
2003-
for row in data:
2004-
for i in object_dtype_indices:
2005-
row[i] = maybe_box_native(row[i])
2006-
return data
2007-
20081985
@overload
20091986
def to_dict(
20101987
self,

pandas/core/methods/to_dict.py

+45-28
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,34 @@
2424
from pandas.core import common as com
2525

2626
if TYPE_CHECKING:
27+
from collections.abc import Generator
28+
2729
from pandas._typing import MutableMappingT
2830

2931
from pandas import DataFrame
3032

3133

34+
def create_data_for_split(
    df: DataFrame, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
) -> Generator[list, None, None]:
    """
    Lazily build the row data for ``to_dict(orient="split")``.

    Parameters
    ----------
    df : DataFrame
        Frame whose rows are converted.
    are_all_object_dtype_cols : bool
        If True, every value of every row is passed through
        ``maybe_box_native``.
    object_dtype_indices : list[int]
        Positions of the columns whose values are passed through
        ``maybe_box_native`` when ``are_all_object_dtype_cols`` is False.

    Yields
    ------
    list
        One new list per row of ``df``, in row order, with the index excluded.
    """
    rows = df.itertuples(index=False, name=None)
    if are_all_object_dtype_cols:
        for tup in rows:
            yield list(map(maybe_box_native, tup))
    elif object_dtype_indices:
        # Only the listed positions are boxed, after the row has been turned
        # into a list, instead of boxing every value (for perf).
        for tup in rows:
            data = list(tup)
            for i in object_dtype_indices:
                data[i] = maybe_box_native(data[i])
            yield data
    else:
        # Nothing to box: the emptiness check is decided once here rather
        # than once per row.
        yield from map(list, rows)
53+
54+
3255
@overload
3356
def to_dict(
3457
df: DataFrame,
@@ -152,35 +175,38 @@ def to_dict(
152175
# GH46470 Return quickly if orient series to avoid creating dtype objects
153176
return into_c((k, v) for k, v in df.items())
154177

178+
if orient == "dict":
179+
return into_c((k, v.to_dict(into=into)) for k, v in df.items())
180+
155181
box_native_indices = [
156182
i
157183
for i, col_dtype in enumerate(df.dtypes.values)
158184
if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
159185
]
160-
box_na_values = [
161-
lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
162-
for i, col_dtype in enumerate(df.dtypes.values)
163-
]
164-
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
165186

166-
if orient == "dict":
167-
return into_c((k, v.to_dict(into=into)) for k, v in df.items())
187+
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
168188

169-
elif orient == "list":
189+
if orient == "list":
170190
object_dtype_indices_as_set: set[int] = set(box_native_indices)
191+
box_na_values = (
192+
lib.no_default
193+
if not isinstance(col_dtype, BaseMaskedDtype)
194+
else libmissing.NA
195+
for col_dtype in df.dtypes.values
196+
)
171197
return into_c(
172198
(
173199
k,
174-
list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i])))
200+
list(map(maybe_box_native, v.to_numpy(na_value=box_na_value)))
175201
if i in object_dtype_indices_as_set
176202
else list(map(maybe_box_native, v.to_numpy())),
177203
)
178-
for i, (k, v) in enumerate(df.items())
204+
for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items()))
179205
)
180206

181207
elif orient == "split":
182-
data = df._create_data_for_split_and_tight_to_dict(
183-
are_all_object_dtype_cols, box_native_indices
208+
data = list(
209+
create_data_for_split(df, are_all_object_dtype_cols, box_native_indices)
184210
)
185211

186212
return into_c(
@@ -192,10 +218,6 @@ def to_dict(
192218
)
193219

194220
elif orient == "tight":
195-
data = df._create_data_for_split_and_tight_to_dict(
196-
are_all_object_dtype_cols, box_native_indices
197-
)
198-
199221
return into_c(
200222
((("index", df.index.tolist()),) if index else ())
201223
+ (
@@ -215,11 +237,9 @@ def to_dict(
215237
elif orient == "records":
216238
columns = df.columns.tolist()
217239
if are_all_object_dtype_cols:
218-
rows = (
219-
dict(zip(columns, row)) for row in df.itertuples(index=False, name=None)
220-
)
221240
return [
222-
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
241+
into_c(zip(columns, map(maybe_box_native, row)))
242+
for row in df.itertuples(index=False, name=None)
223243
]
224244
else:
225245
data = [
@@ -235,7 +255,7 @@ def to_dict(
235255
for row in data:
236256
for col in object_dtype_cols:
237257
row[col] = maybe_box_native(row[col])
238-
return data
258+
return data # type: ignore[return-value]
239259

240260
elif orient == "index":
241261
if not df.index.is_unique:
@@ -248,24 +268,21 @@ def to_dict(
248268
)
249269
elif box_native_indices:
250270
object_dtype_indices_as_set = set(box_native_indices)
251-
is_object_dtype_by_index = [
252-
i in object_dtype_indices_as_set for i in range(len(df.columns))
253-
]
254271
return into_c(
255272
(
256273
t[0],
257274
{
258-
columns[i]: maybe_box_native(v)
259-
if is_object_dtype_by_index[i]
275+
column: maybe_box_native(v)
276+
if i in object_dtype_indices_as_set
260277
else v
261-
for i, v in enumerate(t[1:])
278+
for i, (column, v) in enumerate(zip(columns, t[1:]))
262279
},
263280
)
264281
for t in df.itertuples(name=None)
265282
)
266283
else:
267284
return into_c(
268-
(t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None)
285+
(t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None)
269286
)
270287

271288
else:

0 commit comments

Comments
 (0)