Skip to content

Commit 23ed54f

Browse files
Backport PR #43213: BUG: groupby agg fails silently with mixed dtypes (#43808)
Co-authored-by: Shoham Debnath <[email protected]>
1 parent 7b6a5c4 commit 23ed54f

File tree

3 files changed: +55 -1 lines changed

doc/source/whatsnew/v1.3.4.rst

+1
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ including other versions of pandas.
14 14

15 15
Fixed regressions
16 16
~~~~~~~~~~~~~~~~~
17+
- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`)
17 18
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
18 19
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
19 20
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)

pandas/core/groupby/groupby.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -1129,7 +1129,10 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
1129 1129
numeric_only = True
1130 1130
# GH#42395 GH#43108 GH#43154
1131 1131
# Regression from 1.2.5 to 1.3 caused object columns to be dropped
1132-
obj = self._obj_with_exclusions
1132+
if self.axis:
1133+
obj = self._obj_with_exclusions.T
1134+
else:
1135+
obj = self._obj_with_exclusions
1133 1136
check = obj._get_numeric_data()
1134 1137
if len(obj.columns) and not len(check.columns) and not obj.empty:
1135 1138
numeric_only = False

pandas/tests/groupby/aggregate/test_aggregate.py

+50
Original file line number | Diff line number | Diff line change
@@ -225,6 +225,56 @@ def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
225 225
gb.agg(reduction_func, axis=1)
226 226

227 227

228+
@pytest.mark.parametrize(
229+
"func, expected, dtype, result_dtype_dict",
230+
[
231+
("sum", [5, 7, 9], "int64", {}),
232+
("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}),
233+
("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
234+
("sum", [5, 7, 9], "Int64", {"j": "int64"}),
235+
("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
236+
("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
237+
],
238+
)
239+
def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
240+
# GH#43209
241+
df = DataFrame(
242+
[[1, 2, 3, 4, 5, 6]] * 3,
243+
columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
244+
).astype({("a", "j"): dtype, ("b", "j"): dtype})
245+
result = df.groupby(level=1, axis=1).agg(func)
246+
expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
247+
result_dtype_dict
248+
)
249+
tm.assert_frame_equal(result, expected)
250+
251+
252+
@pytest.mark.parametrize(
253+
"func, expected_data, result_dtype_dict",
254+
[
255+
("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
256+
# std should ideally return Int64 / Float64 #43330
257+
("std", [[2 ** 0.5] * 2] * 3, "float64"),
258+
("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
259+
],
260+
)
261+
def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
262+
# GH#43209
263+
df = DataFrame(
264+
np.arange(12).reshape(3, 4),
265+
index=Index([0, 1, 0], name="y"),
266+
columns=Index([10, 20, 10, 20], name="x"),
267+
dtype="int64",
268+
).astype({10: "Int64"})
269+
result = df.groupby("x", axis=1).agg(func)
270+
expected = DataFrame(
271+
data=expected_data,
272+
index=Index([0, 1, 0], name="y"),
273+
columns=Index([10, 20], name="x"),
274+
).astype(result_dtype_dict)
275+
tm.assert_frame_equal(result, expected)
276+
277+
228 278
def test_aggregate_item_by_item(df):
229 279
grouped = df.groupby("A")
230 280

0 commit comments

Comments
 (0)