From 70748fdf881c4ce65982b18aaa892197bd9fdb85 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 22 Apr 2024 21:55:41 +0800 Subject: [PATCH 1/9] in apply, if as_index is false, return single index --- pandas/core/groupby/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79d9f49a3b355..f44ef8c4dbbfa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1202,10 +1202,7 @@ def _concat_objects( sort=False, ) else: - # GH5610, returns a MI, with the first level being a - # range index - keys = RangeIndex(len(values)) - result = concat(values, axis=0, keys=keys) + result = concat(values, axis=0) elif not not_indexed_same: result = concat(values, axis=0) From 292e4fdaa9e76f9a673e23146da1808e18d0e25c Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 22 Apr 2024 21:57:29 +0800 Subject: [PATCH 2/9] Update tests to return single index --- pandas/tests/groupby/methods/test_value_counts.py | 9 +++------ pandas/tests/groupby/test_groupby.py | 9 ++++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index be52b4a591c26..de8d4b1f01410 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -329,13 +329,10 @@ def test_against_frame_and_seriesgroupby( else: name = "proportion" if normalize else "count" expected = expected.reset_index().rename({0: name}, axis=1) - if groupby == "column": - expected = expected.rename({"level_0": "country"}, axis=1) - expected["country"] = np.where(expected["country"], "US", "FR") - elif groupby == "function": - expected["level_0"] = expected["level_0"] == 1 + if "level_0" in result: + expected.insert(loc=0, column="level_0", value=result["level_0"]) else: - expected["level_0"] = np.where(expected["level_0"], "US", "FR") + expected.insert(loc=0, 
column="country", value=result["country"]) tm.assert_frame_equal(result, expected) else: # compare against SeriesGroupBy value_counts diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 54d7895691f3f..d50fea459552a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -113,8 +113,9 @@ def f(x, q=None, axis=0): expected_seq = df_grouped.quantile([0.4, 0.8]) if not as_index: # apply treats the op as a transform; .quantile knows it's a reduction - apply_result = apply_result.reset_index() - apply_result["level_0"] = [1, 1, 2, 2] + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) tm.assert_frame_equal(apply_result, expected_seq, check_names=False) agg_result = df_grouped.agg(f, q=80) @@ -519,9 +520,7 @@ def test_as_index_select_column(): result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( lambda x: x.cumsum() ) - expected = Series( - [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) - ) + expected = Series([2, 6, 6], name="B", index=range(3)) tm.assert_series_equal(result, expected) From 59746e9b03b9a1b951f182a058fe62850be321c3 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 22 Apr 2024 23:11:38 +0800 Subject: [PATCH 3/9] Add tests --- pandas/tests/groupby/test_apply.py | 107 +++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a2589fe94ea5..76dbb18a62b4d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1583,3 +1583,110 @@ def f_4(grp): e.loc["Pony"] = np.nan e.name = None tm.assert_series_equal(result, e) + + +# GH58291 +def test_apply_frame_not_as_index_returns_single_index(): + df = DataFrame( + [ + ["group_a", 0], + ["group_a", 2], + ["group_b", 1], + ["group_b", 3], + 
["group_b", 5], + ], + columns=["group", "value"], + ) + gb = df.groupby("group", as_index=False)[["group", "value"]] + + def up_to_two_rows(df: DataFrame) -> DataFrame: + return df.head(2) + + result = gb.apply(up_to_two_rows) + + expected = DataFrame( + [["group_a", 0], ["group_a", 2], ["group_b", 1], ["group_b", 3]], + columns=["group", "value"], + index=range(4), + ) + tm.assert_frame_equal(result, expected) + + +# GH58291 +def test_apply_column_groupby_frame_not_as_index_returns_single_index(df): + gb = df.groupby("A", as_index=False)[["A", "C", "D"]] + result = gb.apply(lambda x: x.min()) + print(str(result)) + + expected = DataFrame( + [["bar", -2.441467, -2.441467], ["foo", -0.413064, -0.413064]], + columns=["A", "C", "D"], + index=range(2), + ) + tm.assert_frame_equal(result, expected) + + +# GH58291 +def test_apply_non_column_groupby_frame_not_as_index_returns_single_index(tsframe): + gb = tsframe.groupby(lambda x: x.month, as_index=False) + result = gb.apply(lambda x: x.mean()) + print(str(result)) + + expected = DataFrame( + [ + [1, 0.115464, 0.260960, 0.187824, -0.411523], + [2, 0.047104, -0.183591, 0.330640, -0.207427], + ], + columns=["index", "A", "B", "C", "D"], + index=range(2), + ) + tm.assert_frame_equal(result, expected) + + result = gb.apply(lambda x: x.mean() + x.max()) + print(str(result)) + + expected = DataFrame( + [ + [1, 2.490351, 2.062381, 2.244527, 0.912824], + [2, 1.918738, 2.273746, 1.759184, 0.700975], + ], + columns=["index", "A", "B", "C", "D"], + index=range(2), + ) + tm.assert_frame_equal(result, expected) + + result = gb.apply(lambda x: 1) + print(str(result)) + + expected = DataFrame([[1, 1], [2, 1]], columns=["index", None], index=range(2)) + tm.assert_frame_equal(result, expected) + + result = gb.apply(lambda x: x.quantile([0.2, 0.38])) + print(str(result)) + + expected = DataFrame( + [ + [-0.742155, -0.607186, -0.325423, -1.254187], + [-0.212477, -0.034416, -0.117452, -0.561536], + [-0.842537, -0.920719, -0.404496, 
-0.729036], + [-0.411161, -0.830757, 0.321981, -0.444410], + ], + columns=["A", "B", "C", "D"], + index=[0.20, 0.38] * 2, + ) + tm.assert_frame_equal(result, expected) + + result = gb.apply(DataFrame.quantile, [0.2, 0.38]) + print(str(result)) + + expected = DataFrame( + [ + [-0.742155, -0.607186, -0.325423, -1.254187], + [-0.212477, -0.034416, -0.117452, -0.561536], + [-0.842537, -0.920719, -0.404496, -0.729036], + [-0.411161, -0.830757, 0.321981, -0.444410], + ], + columns=["A", "B", "C", "D"], + index=[0.20, 0.38] * 2, + ) + tm.assert_frame_equal(result, expected) From cc357878f2aef8dcf50ba9f14acb0777199649c8 Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 23 Apr 2024 19:11:15 +0800 Subject: [PATCH 4/9] Update tests --- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_apply_mutate.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 76dbb18a62b4d..5a034f0ebccbe 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -315,7 +315,7 @@ def test_groupby_as_index_apply(): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) + exp_not_as_apply = Index([0, 1, 2, 4]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index e5028884e992b..fa20efad4da77 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -90,9 +90,7 @@ def fn(x): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], - index=pd.MultiIndex.from_tuples( - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] - ), + index=range(6), name="col2", ) tm.assert_series_equal(result, expected) 
From 03a7833e537cb497e72bf3f6ca6076c03a78dafd Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 23 Apr 2024 19:52:58 +0800 Subject: [PATCH 5/9] Update tests --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a034f0ebccbe..479cf1220bdbe 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -315,7 +315,7 @@ def test_groupby_as_index_apply(): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = Index([0, 1, 2, 4]) + exp_not_as_apply = Index([0, 2, 1, 4]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) From 52fb291f7eea60c9f2e05ca335e9a84771456849 Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 24 Apr 2024 01:33:23 +0800 Subject: [PATCH 6/9] Retrigger pipeline From 5a3021638ddfda304fc3a80811198664daa10d48 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 20:21:11 +0800 Subject: [PATCH 7/9] Update rst doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c817e09b3b360..5f5ceb8693c44 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -432,6 +432,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) +- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) Reshaping From 062031e2eede20537be653637df6a33f2b37f519 Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 28 Apr 2024 23:04:10 +0800 Subject: [PATCH 8/9] Remove unnecessary tests --- pandas/tests/groupby/test_apply.py | 107 ----------------------------- 1 file changed, 107 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 479cf1220bdbe..e27c782c1bdcf 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1583,110 +1583,3 @@ def f_4(grp): e.loc["Pony"] = np.nan e.name = None tm.assert_series_equal(result, e) - - -# GH58291 -def test_apply_frame_not_as_index_returns_single_index(): - df = DataFrame( - [ - ["group_a", 0], - ["group_a", 2], - ["group_b", 1], - ["group_b", 3], - ["group_b", 5], - ], - columns=["group", "value"], - ) - gb = df.groupby("group", as_index=False)[["group", "value"]] - - def up_to_two_rows(df: DataFrame) -> DataFrame: - return df.head(2) - - result = gb.apply(up_to_two_rows) - - expected = DataFrame( - [["group_a", 0], ["group_a", 2], ["group_b", 1], ["group_b", 3]], - columns=["group", "value"], - index=range(4), - ) - tm.assert_frame_equal(result, expected) - - -# GH58291 -def test_apply_column_groupby_frame_not_as_index_returns_single_index(df): - gb = df.groupby("A", as_index=False)[["A", "C", "D"]] - result = gb.apply(lambda x: x.min()) - print(str(result)) - - expected = DataFrame( - [["bar", -2.441467, -2.441467], ["foo", -0.413064, -0.413064]], - columns=["A", "C", "D"], - index=range(2), - ) - tm.assert_frame_equal(result, expected) - - -# GH58291 -def test_apply_non_column_groupby_frame_not_as_index_returns_single_index(tsframe): - gb = tsframe.groupby(lambda x: x.month, as_index=False) - result = gb.apply(lambda x: x.mean()) - print(str(result)) - - expected = DataFrame( - 
[ - [1, 0.115464, 0.260960, 0.187824, -0.411523], - [2, 0.047104, -0.183591, 0.330640, -0.207427], - ], - columns=["index", "A", "B", "C", "D"], - index=range(2), - ) - tm.assert_frame_equal(result, expected) - - result = gb.apply(lambda x: x.mean() + x.max()) - print(str(result)) - - expected = DataFrame( - [ - [1, 2.490351, 2.062381, 2.244527, 0.912824], - [2, 1.918738, 2.273746, 1.759184, 0.700975], - ], - columns=["index", "A", "B", "C", "D"], - index=range(2), - ) - tm.assert_frame_equal(result, expected) - - result = gb.apply(lambda x: 1) - print(str(result)) - - expected = DataFrame([[1, 1], [2, 1]], columns=["index", None], index=range(2)) - tm.assert_frame_equal(result, expected) - - result = gb.apply(lambda x: x.quantile([0.2, 0.38])) - print(str(result)) - - expected = DataFrame( - [ - [-0.742155, -0.607186, -0.325423, -1.254187], - [-0.212477, -0.034416, -0.117452, -0.561536], - [-0.842537, -0.920719, -0.404496, -0.729036], - [-0.411161, -0.830757, 0.321981, -0.444410], - ], - columns=["A", "B", "C", "D"], - index=[0.20, 0.38] * 2, - ) - tm.assert_frame_equal(result, expected) - - result = gb.apply(DataFrame.quantile, [0.2, 0.38]) - print(str(result)) - - expected = DataFrame( - [ - [-0.742155, -0.607186, -0.325423, -1.254187], - [-0.212477, -0.034416, -0.117452, -0.561536], - [-0.842537, -0.920719, -0.404496, -0.729036], - [-0.411161, -0.830757, 0.321981, -0.444410], - ], - columns=["A", "B", "C", "D"], - index=[0.20, 0.38] * 2, - ) - tm.assert_frame_equal(result, expected) From 845714784d0741982e320fb8b0e6bddb6ec1d90e Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 28 Apr 2024 23:33:15 +0800 Subject: [PATCH 9/9] Update expected result based on test params instead of result obj --- pandas/tests/groupby/methods/test_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index de8d4b1f01410..0f136b06c782a 100644 --- 
a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -329,7 +329,7 @@ def test_against_frame_and_seriesgroupby( else: name = "proportion" if normalize else "count" expected = expected.reset_index().rename({0: name}, axis=1) - if "level_0" in result: + if groupby in ["array", "function"] and (not as_index and frame): expected.insert(loc=0, column="level_0", value=result["level_0"]) else: expected.insert(loc=0, column="country", value=result["country"])