Skip to content

Commit 5d3fafa

Browse files
rhshadrach authored and mliu08 committed
BUG: groupby.describe with as_index=False incorrect (pandas-dev#49643)
* BUG: groupby.describe with as_index=False incorrect
* Add test for two groupings
* Simplify logic
1 parent 1974fd9 commit 5d3fafa

File tree

4 files changed

+70
-31
lines changed

4 files changed

+70
-31
lines changed

doc/source/whatsnew/v2.0.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,9 @@ Groupby/resample/rolling
718718
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
719719
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
720720
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`)
721-
-
721+
- Bug in :meth:`.DataFrameGroupBy.apply` and :meth:`.SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
722+
- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
723+
- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
722724

723725
Reshaping
724726
^^^^^^^^^

pandas/core/groupby/groupby.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -1061,8 +1061,7 @@ def _set_group_selection(self) -> None:
10611061
# This is a no-op for SeriesGroupBy
10621062
grp = self.grouper
10631063
if not (
1064-
self.as_index
1065-
and grp.groupings is not None
1064+
grp.groupings is not None
10661065
and self.obj.ndim > 1
10671066
and self._group_selection is None
10681067
):
@@ -2640,7 +2639,14 @@ def describe(self, **kwargs):
26402639
)
26412640
if self.axis == 1:
26422641
return result.T
2643-
return result.unstack()
2642+
2643+
# GH#49256 - properly handle the grouping column(s)
2644+
if self._selected_obj.ndim != 1 or self.as_index:
2645+
result = result.unstack()
2646+
if not self.as_index:
2647+
self._insert_inaxis_grouper_inplace(result)
2648+
2649+
return result
26442650

26452651
@final
26462652
def resample(self, rule, *args, **kwargs):

pandas/tests/groupby/test_apply.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -974,15 +974,21 @@ def test_apply_function_index_return(function):
974974

975975

976976
def test_apply_function_with_indexing_return_column():
977-
# GH#7002, GH#41480
977+
# GH#7002, GH#41480, GH#49256
978978
df = DataFrame(
979979
{
980980
"foo1": ["one", "two", "two", "three", "one", "two"],
981981
"foo2": [1, 2, 4, 4, 5, 6],
982982
}
983983
)
984-
with pytest.raises(TypeError, match="Could not convert"):
985-
df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
984+
result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
985+
expected = DataFrame(
986+
{
987+
"foo1": ["one", "three", "two"],
988+
"foo2": [3.0, 4.0, 4.0],
989+
}
990+
)
991+
tm.assert_frame_equal(result, expected)
986992

987993

988994
@pytest.mark.parametrize(

pandas/tests/groupby/test_function.py

+49-24
Original file line numberDiff line numberDiff line change
@@ -336,13 +336,7 @@ def test_describe(self, df, gb, gni):
336336
result = gb.describe()
337337
tm.assert_frame_equal(result, expected)
338338

339-
expected = pd.concat(
340-
[
341-
df[df.A == 1].describe().unstack().to_frame().T,
342-
df[df.A == 3].describe().unstack().to_frame().T,
343-
]
344-
)
345-
expected.index = Index([0, 1])
339+
expected = expected.reset_index()
346340
result = gni.describe()
347341
tm.assert_frame_equal(result, expected)
348342

@@ -1093,6 +1087,38 @@ def test_series_describe_single():
10931087
tm.assert_series_equal(result, expected)
10941088

10951089

1090+
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
1091+
def test_series_describe_as_index(as_index, keys):
1092+
# GH#49256
1093+
df = DataFrame(
1094+
{
1095+
"key1": ["one", "two", "two", "three", "two"],
1096+
"key2": ["one", "two", "two", "three", "two"],
1097+
"foo2": [1, 2, 4, 4, 6],
1098+
}
1099+
)
1100+
gb = df.groupby(keys, as_index=as_index)["foo2"]
1101+
result = gb.describe()
1102+
expected = DataFrame(
1103+
{
1104+
"key1": ["one", "three", "two"],
1105+
"count": [1.0, 1.0, 3.0],
1106+
"mean": [1.0, 4.0, 4.0],
1107+
"std": [np.nan, np.nan, 2.0],
1108+
"min": [1.0, 4.0, 2.0],
1109+
"25%": [1.0, 4.0, 3.0],
1110+
"50%": [1.0, 4.0, 4.0],
1111+
"75%": [1.0, 4.0, 5.0],
1112+
"max": [1.0, 4.0, 6.0],
1113+
}
1114+
)
1115+
if len(keys) == 2:
1116+
expected.insert(1, "key2", expected["key1"])
1117+
if as_index:
1118+
expected = expected.set_index(keys)
1119+
tm.assert_frame_equal(result, expected)
1120+
1121+
10961122
def test_series_index_name(df):
10971123
grouped = df.loc[:, ["C"]].groupby(df["A"])
10981124
result = grouped.agg(lambda x: x.mean())
@@ -1177,29 +1203,25 @@ def test_frame_describe_unstacked_format():
11771203
"pandas.errors.PerformanceWarning"
11781204
)
11791205
@pytest.mark.parametrize("as_index", [True, False])
1180-
def test_describe_with_duplicate_output_column_names(as_index):
1206+
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
1207+
def test_describe_with_duplicate_output_column_names(as_index, keys):
11811208
# GH 35314
11821209
df = DataFrame(
11831210
{
1184-
"a": [99, 99, 99, 88, 88, 88],
1211+
"a1": [99, 99, 99, 88, 88, 88],
1212+
"a2": [99, 99, 99, 88, 88, 88],
11851213
"b": [1, 2, 3, 4, 5, 6],
11861214
"c": [10, 20, 30, 40, 50, 60],
11871215
},
1188-
columns=["a", "b", "b"],
1216+
columns=["a1", "a2", "b", "b"],
11891217
copy=False,
11901218
)
1219+
if keys == ["a1"]:
1220+
df = df.drop(columns="a2")
11911221

11921222
expected = (
11931223
DataFrame.from_records(
11941224
[
1195-
("a", "count", 3.0, 3.0),
1196-
("a", "mean", 88.0, 99.0),
1197-
("a", "std", 0.0, 0.0),
1198-
("a", "min", 88.0, 99.0),
1199-
("a", "25%", 88.0, 99.0),
1200-
("a", "50%", 88.0, 99.0),
1201-
("a", "75%", 88.0, 99.0),
1202-
("a", "max", 88.0, 99.0),
12031225
("b", "count", 3.0, 3.0),
12041226
("b", "mean", 5.0, 2.0),
12051227
("b", "std", 1.0, 1.0),
@@ -1222,14 +1244,17 @@ def test_describe_with_duplicate_output_column_names(as_index):
12221244
.T
12231245
)
12241246
expected.columns.names = [None, None]
1225-
expected.index = Index([88, 99], name="a")
1226-
1227-
if as_index:
1228-
expected = expected.drop(columns=["a"], level=0)
1247+
if len(keys) == 2:
1248+
expected.index = MultiIndex(
1249+
levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
1250+
)
12291251
else:
1230-
expected = expected.reset_index(drop=True)
1252+
expected.index = Index([88, 99], name="a1")
1253+
1254+
if not as_index:
1255+
expected = expected.reset_index()
12311256

1232-
result = df.groupby("a", as_index=as_index).describe()
1257+
result = df.groupby(keys, as_index=as_index).describe()
12331258

12341259
tm.assert_frame_equal(result, expected)
12351260

0 commit comments

Comments
 (0)