From e5f470cef900bf8de6010aedbbe6ad723d307205 Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 24 Jul 2022 21:53:39 +0100 Subject: [PATCH 01/11] BUG Fixing columns dropped from multi index in group by transform GH47787 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/groupby/groupby.py | 5 +++++ pandas/tests/groupby/test_groupby.py | 15 +++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 252bea3ba774a..d88badffdbae3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1007,6 +1007,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`) - Bug in :meth:`DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list which misses the resample key (:issue:`47362`) +- Bug in :meth:`DataFrame.groupby` would lose index columns when the dataframe is empty for transforms, like fillna (:issue:`47787`) - Reshaping diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9b4991d32692b..59ca5579afbcc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1020,6 +1020,11 @@ def curried(x): return self.apply(curried) is_transform = name in base.transformation_kernels + + # Transform needs to keep the same schema, including when empty + if is_transform and self._obj_with_exclusions.empty: + return self._obj_with_exclusions + result = self._python_apply_general( curried, self._obj_with_exclusions, is_transform=is_transform ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 920b869ef799b..d78261caa3227 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2348,6 +2348,21 @@ def test_groupby_duplicate_index(): tm.assert_series_equal(result, expected) +def test_group_on_empty_multiindex(transformation_func): + # GH 47787 + # With one row, those are transforms so the schema should be the same + df = pd.DataFrame(data=[[1, 2, 3, 4]], columns=['col_1', 'col_2', 'col_3', 'col_4']) + df = df.set_index(['col_1', 'col_2']) + result = df.groupby(['col_1']).fillna('') + assert df.index.names == result.index.names + + # When empty, expect the same schema as well + df = pd.DataFrame(data=[], columns=['col_1', 'col_2', 'col_3', 'col_4']) + df = df.set_index(['col_1', 'col_2']) + result = df.groupby(['col_1']).fillna('') + assert df.index.names == result.index.names + + @pytest.mark.parametrize( "idx", [ From 9cf9a7ca92be335ac16c3f3a5a9fdef615cc7fab Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 24 Jul 2022 22:20:38 +0100 Subject: [PATCH 02/11] fixing pep8 issues --- pandas/tests/groupby/test_groupby.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d78261caa3227..7ea1a2d2e19a6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2351,15 +2351,14 @@ def test_groupby_duplicate_index(): def test_group_on_empty_multiindex(transformation_func): # GH 47787 # With one row, those are transforms so the schema should be the same - df = pd.DataFrame(data=[[1, 2, 3, 4]], columns=['col_1', 'col_2', 'col_3', 'col_4']) - df = df.set_index(['col_1', 'col_2']) - result = df.groupby(['col_1']).fillna('') + df = DataFrame(data=[[1, 2, 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) + df = df.set_index(["col_1", "col_2"]) + result = df.groupby(["col_1"]).fillna("") assert df.index.names == result.index.names - # When empty, expect the same schema as well - df = pd.DataFrame(data=[], columns=['col_1', 'col_2', 'col_3', 'col_4']) - df = df.set_index(['col_1', 'col_2']) - result = df.groupby(['col_1']).fillna('') + df = DataFrame(data=[], columns=["col_1", "col_2", "col_3", "col_4"]) + df = df.set_index(["col_1", "col_2"]) + result = df.groupby(["col_1"]).fillna("") assert df.index.names == result.index.names From 57fc24b004f1de151917663f0e39bae77f01818e Mon Sep 17 00:00:00 2001 From: matt Date: Sat, 30 Jul 2022 18:04:40 +0100 Subject: [PATCH 03/11] testing series as well as dataframe --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/groupby/test_groupby.py | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d88badffdbae3..0c1f794a727da 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1007,7 +1007,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`) - Bug in :meth:`DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list which misses the resample key (:issue:`47362`) -- Bug in :meth:`DataFrame.groupby` would lose index columns when the dataframe is empty for transforms, like fillna (:issue:`47787`) +- Bug in :meth:`DataFrame.groupby` would lose index columns when the DataFrame is empty for transforms, like fillna (:issue:`47787`) - Reshaping diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7ea1a2d2e19a6..bc8b9767dea9a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2348,19 +2348,35 @@ def test_groupby_duplicate_index(): tm.assert_series_equal(result, expected) -def test_group_on_empty_multiindex(transformation_func): +def test_group_on_empty_multiindex(transformation_func, request): # GH 47787 # With one row, those are transforms so the schema should be the same + if transformation_func == "tshift": + mark = pytest.mark.xfail(raises=NotImplemented) + request.node.add_marker(mark) df = DataFrame(data=[[1, 2, 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) df = df.set_index(["col_1", "col_2"]) - result = df.groupby(["col_1"]).fillna("") + if transformation_func == "fillna": + args = ("ffill",) + else: + args = () + result = df.groupby(["col_1"]).transform(transformation_func, *args) assert df.index.names == result.index.names + + col_3 = df["col_3"] + result = col_3.groupby(["col_1"]).transform(transformation_func, *args) + assert col_3.index.names == result.index.names + # When empty, expect the same schema as well - df = DataFrame(data=[], columns=["col_1", "col_2", "col_3", "col_4"]) + df = DataFrame(data=[], columns=["col_1", "col_2", "col_3", "col_4"], dtype=int) df = df.set_index(["col_1", "col_2"]) - result = df.groupby(["col_1"]).fillna("") + result = df.groupby(["col_1"]).transform(transformation_func, *args) assert df.index.names == result.index.names + col_3 = df["col_3"] + result = col_3.groupby(["col_1"]).transform(transformation_func, *args) + assert col_3.index.names == result.index.names + @pytest.mark.parametrize( "idx", From c27f799f33edb814392dcbea620a7b67abaa19df Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 31 Jul 2022 11:42:19 +0100 Subject: [PATCH 04/11] fixing typo --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bc8b9767dea9a..19cbff95a4945 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2352,7 +2352,7 @@ def test_group_on_empty_multiindex(transformation_func, request): # GH 47787 # With one row, those are transforms so the schema should be the same if transformation_func == "tshift": - mark = pytest.mark.xfail(raises=NotImplemented) + mark = pytest.mark.xfail(raises=NotImplementedError) request.node.add_marker(mark) df = DataFrame(data=[[1, 2, 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) df = df.set_index(["col_1", "col_2"]) From 42f4803455e6ebe4fb4b7035b5cd162632073370 Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 31 Jul 2022 15:54:49 +0100 Subject: [PATCH 05/11] adding a timestamp in the index so tshift fails with the right error --- pandas/tests/groupby/test_groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 19cbff95a4945..54829bf9731df 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2354,10 +2354,12 @@ def test_group_on_empty_multiindex(transformation_func, request): if transformation_func == "tshift": mark = pytest.mark.xfail(raises=NotImplementedError) request.node.add_marker(mark) - df = DataFrame(data=[[1, 2, 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) + df = DataFrame(data=[[1, Timestamp("today"), 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) df = df.set_index(["col_1", "col_2"]) if transformation_func == "fillna": args = ("ffill",) + elif transformation_func == "tshift": + args = (1, "D") else: args = () result = df.groupby(["col_1"]).transform(transformation_func, *args) From 43a25710a224cb820cb42c4070cea8593e4370bf Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 31 Jul 2022 16:13:28 +0100 Subject: [PATCH 06/11] fixing formatting --- pandas/tests/groupby/test_groupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 54829bf9731df..9d46cb262b8b3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2354,7 +2354,10 @@ def test_group_on_empty_multiindex(transformation_func, request): if transformation_func == "tshift": mark = pytest.mark.xfail(raises=NotImplementedError) request.node.add_marker(mark) - df = DataFrame(data=[[1, Timestamp("today"), 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"]) + df = DataFrame( + data=[[1, Timestamp("today"), 3, 4]], + columns=["col_1", "col_2", "col_3", "col_4"], + ) df = df.set_index(["col_1", "col_2"]) if transformation_func == "fillna": args = ("ffill",) From c67128a01037cc760e6ab4334c54831fdc8a720f Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 2 Aug 2022 19:32:41 +0100 Subject: [PATCH 07/11] using the module assert --- pandas/tests/groupby/test_groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d46cb262b8b3..4aa83ac48fe75 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2366,11 +2366,11 @@ def test_group_on_empty_multiindex(transformation_func, request): else: args = () result = df.groupby(["col_1"]).transform(transformation_func, *args) - assert df.index.names == result.index.names + tm.assert_index_equal(df.index, result.index) col_3 = df["col_3"] result = col_3.groupby(["col_1"]).transform(transformation_func, *args) - assert col_3.index.names == result.index.names + tm.assert_index_equal(col_3.index, result.index) # When empty, expect the same schema as well df = DataFrame(data=[], columns=["col_1", "col_2", "col_3", "col_4"], dtype=int) From 3d5e5a3bca89aacf5b60da7918c47cc23fd72e59 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 11 Aug 2022 21:54:33 +0100 Subject: [PATCH 08/11] adding a test on the dataframe --- pandas/tests/groupby/test_groupby.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4aa83ac48fe75..b9f7ea90f442c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2367,6 +2367,9 @@ def test_group_on_empty_multiindex(transformation_func, request): args = () result = df.groupby(["col_1"]).transform(transformation_func, *args) tm.assert_index_equal(df.index, result.index) + has_same_columns = ( + type(result) == DataFrame and (df.columns == result.columns).all() + ) col_3 = df["col_3"] result = col_3.groupby(["col_1"]).transform(transformation_func, *args) @@ -2377,10 +2380,18 @@ def test_group_on_empty_multiindex(transformation_func, request): df = df.set_index(["col_1", "col_2"]) result = df.groupby(["col_1"]).transform(transformation_func, *args) assert df.index.names == result.index.names + type_changes = {"pct_change", "rank"} + # pct change will return a float, so different from the original type + if has_same_columns and transformation_func not in type_changes: + tm.assert_frame_equal(df, result) col_3 = df["col_3"] - result = col_3.groupby(["col_1"]).transform(transformation_func, *args) - assert col_3.index.names == result.index.names + series_result = col_3.groupby(["col_1"]).transform(transformation_func, *args) + if transformation_func not in type_changes: + if has_same_columns and transformation_func: + tm.assert_series_equal(col_3, series_result) + else: + tm.assert_series_equal(series_result, result) @pytest.mark.parametrize( From 93a95e02df9743cafccc9a856ebcaeb2b41cf477 Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 14 Aug 2022 08:25:17 +0100 Subject: [PATCH 09/11] improve test post review --- pandas/tests/groupby/test_groupby.py | 40 ++++++++++------------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b9f7ea90f442c..7cc4343c9d0c2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2365,33 +2365,21 @@ def test_group_on_empty_multiindex(transformation_func, request): args = (1, "D") else: args = () - result = df.groupby(["col_1"]).transform(transformation_func, *args) - tm.assert_index_equal(df.index, result.index) - has_same_columns = ( - type(result) == DataFrame and (df.columns == result.columns).all() - ) - - col_3 = df["col_3"] - result = col_3.groupby(["col_1"]).transform(transformation_func, *args) - tm.assert_index_equal(col_3.index, result.index) + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) + expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + if transformation_func in {"diff", "shift"}: + expected = expected.astype(int) + tm.assert_equal(result, expected) - # When empty, expect the same schema as well - df = DataFrame(data=[], columns=["col_1", "col_2", "col_3", "col_4"], dtype=int) - df = df.set_index(["col_1", "col_2"]) - result = df.groupby(["col_1"]).transform(transformation_func, *args) - assert df.index.names == result.index.names - type_changes = {"pct_change", "rank"} - # pct change will return a float, so different from the original type - if has_same_columns and transformation_func not in type_changes: - tm.assert_frame_equal(df, result) - - col_3 = df["col_3"] - series_result = col_3.groupby(["col_1"]).transform(transformation_func, *args) - if transformation_func not in type_changes: - if has_same_columns and transformation_func: - tm.assert_series_equal(col_3, series_result) - else: - tm.assert_series_equal(series_result, result) + result = ( + df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) + ) + expected = ( + df["col_3"].groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + ) + if transformation_func in ("diff", "shift"): + expected = expected.astype(int) + tm.assert_equal(result, expected) @pytest.mark.parametrize( From ee398530e00d7ee2f64662bc02f120882b1bd3ef Mon Sep 17 00:00:00 2001 From: matt Date: Sun, 14 Aug 2022 08:26:13 +0100 Subject: [PATCH 10/11] typo fix --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7cc4343c9d0c2..de0c077f9620d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2367,7 +2367,7 @@ def test_group_on_empty_multiindex(transformation_func, request): args = () result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] - if transformation_func in {"diff", "shift"}: + if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) From 94addbfa00c035386c4fc7a6dffca6aafe8d425c Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 16 Aug 2022 18:05:08 +0100 Subject: [PATCH 11/11] explicitly casting to int --- pandas/tests/groupby/test_groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de0c077f9620d..df49db9ec9dad 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2358,6 +2358,8 @@ def test_group_on_empty_multiindex(transformation_func, request): data=[[1, Timestamp("today"), 3, 4]], columns=["col_1", "col_2", "col_3", "col_4"], ) + df["col_3"] = df["col_3"].astype(int) + df["col_4"] = df["col_4"].astype(int) df = df.set_index(["col_1", "col_2"]) if transformation_func == "fillna": args = ("ffill",)