From 8bc63ba84246d8c94c994b0425fe24bddd56989f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 14 Feb 2022 12:17:57 -0500 Subject: [PATCH 1/3] BUG: groupby.size and groupby.transform('size') incorrect for axis=1 --- doc/source/whatsnew/v1.5.0.rst | 3 ++- pandas/conftest.py | 8 +++++++ pandas/core/groupby/groupby.py | 11 +++++++-- pandas/tests/groupby/conftest.py | 5 ++++ pandas/tests/groupby/test_size.py | 23 +++++++++++++++++++ .../tests/groupby/transform/test_transform.py | 23 +++++++++++++++++++ 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ea5258cf1537d..3cd2b0334ca15 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -371,7 +371,8 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`DataFrame.resample` ignoring ``closed="right"`` on :class:`TimedeltaIndex` (:issue:`45414`) -- Bug in :meth:`.DataFrameGroupBy.transform` fails when the input DataFrame has multiple columns (:issue:`27469`) +- Bug in :meth:`.DataFrameGroupBy.transform` failing when ``func="size"`` and the input DataFrame has multiple columns (:issue:`27469`) +- Bug in :meth:`.DataFrameGroupBy.size` and :meth:`.DataFrameGroupBy.transform` with ``func="size"`` producing incorrect results when ``axis=1`` (:issue:`45715`) Reshaping ^^^^^^^^^ diff --git a/pandas/conftest.py b/pandas/conftest.py index 963cf2de0428c..8d5913ce0a9ae 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -229,6 +229,14 @@ def axis(request): axis_frame = axis +@pytest.fixture(params=[1, "columns"], ids=lambda x: f"axis={repr(x)}") +def axis_1(request): + """ + Fixture for returning aliases of axis 1 of a DataFrame. 
+ """ + return request.param + + @pytest.fixture(params=[True, False, None]) def observed(request): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4eb907e06adf1..f65d85bf1d8fc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1665,14 +1665,14 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: # for each col, reshape to size of original frame by take operation ids, _, _ = self.grouper.group_info - result = result.reindex(self.grouper.result_index, copy=False) + result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: # i.e. SeriesGroupBy out = algorithms.take_nd(result._values, ids) output = obj._constructor(out, index=obj.index, name=obj.name) else: - output = result.take(ids, axis=0) + output = result.take(ids, axis=self.axis) output.index = obj.index return output @@ -2150,6 +2150,13 @@ def size(self) -> DataFrame | Series: """ result = self.grouper.size() + if self.axis == 1: + return DataFrame( + data=np.tile(result.values, (self.obj.shape[0], 1)), + columns=result.index, + index=self.obj.index, + ) + # GH28330 preserve subclassed Series/DataFrames through calls if issubclass(self.obj._constructor, Series): result = self._obj_1d_constructor(result, name=self.obj.name) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 403e3edcc34d0..58d9e500554dd 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -19,6 +19,11 @@ def as_index(request): return request.param +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + @pytest.fixture def mframe(multiindex_dataframe_random_data): return multiindex_dataframe_random_data diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 06f79ef609db1..a614cf7abd684 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -18,6 
+18,29 @@ def test_size(df, by): assert result[key] == len(group) +@pytest.mark.parametrize( + "by", + [ + [0, 0, 0, 0], + [0, 1, 1, 1], + [1, 0, 1, 1], + [0, None, None, None], + pytest.param([None, None, None, None], marks=pytest.mark.xfail), + ], +) +def test_size_axis_1(df, axis_1, by, sort, dropna): + # GH#45715 + counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} + if dropna: + counts = {key: value for key, value in counts.items() if key is not None} + expected = DataFrame(counts, index=df.index) + if sort: + expected = expected.sort_index(axis=1) + grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) + result = grouped.size() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) @pytest.mark.parametrize("sort", [True, False]) def test_size_sort(sort, by): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6e9b8c35b3698..1bd92984a66d7 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -191,6 +191,29 @@ def test_transform_axis_1(request, transformation_func): tm.assert_equal(result, expected) +def test_transform_axis_1_reducer(request, reduction_func): + # GH 45984 + if reduction_func in ( + "corrwith", + "first", + "idxmax", + "idxmin", + "last", + "ngroup", + "nth", + ): + marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#") + request.node.add_marker(marker) + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) + result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) + if reduction_func == "size": + # size doesn't behave in the same manner; hardcode expected result + expected = DataFrame([[2, 2, 1], [2, 2, 1]], index=df.index, columns=[0, 0, 1]) + else: + expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T + tm.assert_equal(result, expected) + + def 
test_transform_axis_ts(tsframe): # make sure that we are setting the axes From fba7abfed7ba70270252b8730eb0dc1091326977 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 14 Feb 2022 12:22:11 -0500 Subject: [PATCH 2/3] GH# fixups --- pandas/tests/groupby/transform/test_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 1bd92984a66d7..6379e747aa332 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -192,7 +192,7 @@ def test_transform_axis_1(request, transformation_func): def test_transform_axis_1_reducer(request, reduction_func): - # GH 45984 + # GH#45715 if reduction_func in ( "corrwith", "first", @@ -202,7 +202,7 @@ def test_transform_axis_1_reducer(request, reduction_func): "ngroup", "nth", ): - marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#") + marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") request.node.add_marker(marker) df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) From 2ed18bba45c70b342228d6c4acf26009fd066b72 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 14 Feb 2022 13:33:40 -0500 Subject: [PATCH 3/3] Fixup --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/transform/test_transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f65d85bf1d8fc..dbff541e9568b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1673,7 +1673,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: output = obj._constructor(out, index=obj.index, name=obj.name) else: output = result.take(ids, axis=self.axis) - output.index = obj.index + output = 
output.set_axis(obj._get_axis(self.axis), axis=self.axis) return output # ----------------------------------------------------------------- diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6379e747aa332..4b707d255b18f 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -208,7 +208,7 @@ def test_transform_axis_1_reducer(request, reduction_func): result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) if reduction_func == "size": # size doesn't behave in the same manner; hardcode expected result - expected = DataFrame([[2, 2, 1], [2, 2, 1]], index=df.index, columns=[0, 0, 1]) + expected = DataFrame(2 * [[2, 2, 1]], index=df.index, columns=df.columns) else: expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T tm.assert_equal(result, expected)