Skip to content

Commit ac09649

Browse files
Backport PR pandas-dev#43154: Updating _resolve_numeric_only function of GroupBy (pandas-dev#43481)
Co-authored-by: Prerana Chakraborty <[email protected]>
1 parent 04328a7 commit ac09649

File tree

4 files changed

+73
-1
lines changed

4 files changed

+73
-1
lines changed

doc/source/whatsnew/v1.3.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Fixed regressions
2626
- Fixed regression in :meth:`DataFrame.__getitem__` raising error for slice of :class:`DatetimeIndex` when index is non monotonic (:issue:`43223`)
2727
- Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`)
2828
- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`)
29+
- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`)
2930
- Fixed regression in :meth:`Series.fillna` raising ``TypeError`` when filling ``float`` ``Series`` with list-like fill value having a dtype which couldn't cast losslessly (like ``float32`` filled with ``float64``) (:issue:`43424`)
3031
-
3132

pandas/core/groupby/groupby.py

+8
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,14 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
11261126
if self.obj.ndim == 2:
11271127
# i.e. DataFrameGroupBy
11281128
numeric_only = True
1129+
# GH#42395 GH#43108 GH#43154
1130+
# Regression from 1.2.5 to 1.3 caused object columns to be dropped
1131+
obj = self._obj_with_exclusions
1132+
check = obj._get_numeric_data()
1133+
if len(obj.columns) and not len(check.columns) and not obj.empty:
1134+
numeric_only = False
1135+
# TODO: v1.4+ Add FutureWarning
1136+
11291137
else:
11301138
numeric_only = False
11311139

pandas/tests/groupby/aggregate/test_cython.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ def test_cython_agg_nothing_to_agg():
9797

9898
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
9999

100-
result = frame[["b"]].groupby(frame["a"]).mean()
100+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
101+
result = frame[["b"]].groupby(frame["a"]).mean()
101102
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
102103
tm.assert_frame_equal(result, expected)
103104

pandas/tests/groupby/test_groupby.py

+62
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
MultiIndex,
1818
RangeIndex,
1919
Series,
20+
Timedelta,
2021
Timestamp,
2122
date_range,
2223
read_csv,
@@ -2377,6 +2378,67 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
23772378
tm.assert_frame_equal(result, expected)
23782379

23792380

2381+
def test_groupby_aggregation_non_numeric_dtype():
2382+
# GH #43108
2383+
df = DataFrame(
2384+
[["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"]
2385+
)
2386+
2387+
expected = DataFrame(
2388+
{
2389+
"v": [[1, 1], [10, 20]],
2390+
},
2391+
index=Index(["M", "W"], dtype="object", name="MW"),
2392+
)
2393+
2394+
gb = df.groupby(by=["MW"])
2395+
result = gb.sum()
2396+
tm.assert_frame_equal(result, expected)
2397+
2398+
2399+
def test_groupby_aggregation_multi_non_numeric_dtype():
2400+
# GH #42395
2401+
df = DataFrame(
2402+
{
2403+
"x": [1, 0, 1, 1, 0],
2404+
"y": [Timedelta(i, "days") for i in range(1, 6)],
2405+
"z": [Timedelta(i * 10, "days") for i in range(1, 6)],
2406+
}
2407+
)
2408+
2409+
expected = DataFrame(
2410+
{
2411+
"y": [Timedelta(i, "days") for i in range(7, 9)],
2412+
"z": [Timedelta(i * 10, "days") for i in range(7, 9)],
2413+
},
2414+
index=Index([0, 1], dtype="int64", name="x"),
2415+
)
2416+
2417+
gb = df.groupby(by=["x"])
2418+
result = gb.sum()
2419+
tm.assert_frame_equal(result, expected)
2420+
2421+
2422+
def test_groupby_aggregation_numeric_with_non_numeric_dtype():
2423+
# GH #43108
2424+
df = DataFrame(
2425+
{
2426+
"x": [1, 0, 1, 1, 0],
2427+
"y": [Timedelta(i, "days") for i in range(1, 6)],
2428+
"z": list(range(1, 6)),
2429+
}
2430+
)
2431+
2432+
expected = DataFrame(
2433+
{"z": [7, 8]},
2434+
index=Index([0, 1], dtype="int64", name="x"),
2435+
)
2436+
2437+
gb = df.groupby(by=["x"])
2438+
result = gb.sum()
2439+
tm.assert_frame_equal(result, expected)
2440+
2441+
23802442
def test_groupby_filtered_df_std():
23812443
# GH 16174
23822444
dicts = [

0 commit comments

Comments
 (0)