Skip to content

BUG: groupby.size and groupby.transform('size') incorrect for axis=1 #45987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,8 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`DataFrame.resample` ignoring ``closed="right"`` on :class:`TimedeltaIndex` (:issue:`45414`)
- Bug in :meth:`.DataFrameGroupBy.transform` fails when the input DataFrame has multiple columns (:issue:`27469`)
- Bug in :meth:`.DataFrameGroupBy.transform` failing when ``func="size"`` and the input DataFrame has multiple columns (:issue:`27469`)
- Bug in :meth:`.DataFrameGroupBy.size` and :meth:`.DataFrameGroupBy.transform` with ``func="size"`` produced incorrect results when ``axis=1`` (:issue:`45715`)

Reshaping
^^^^^^^^^
Expand Down
8 changes: 8 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ def axis(request):
axis_frame = axis


@pytest.fixture(
    params=[1, "columns"],
    ids=lambda x: f"axis={repr(x)}",
)
def axis_1(request):
    """
    Fixture yielding each spelling of a DataFrame's axis 1.

    Parametrized over the integer ``1`` and the string ``"columns"``; the
    test id is rendered as ``axis=<repr of the value>``.
    """
    return request.param


@pytest.fixture(params=[True, False, None])
def observed(request):
"""
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1665,15 +1665,15 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:

# for each col, reshape to size of original frame by take operation
ids, _, _ = self.grouper.group_info
result = result.reindex(self.grouper.result_index, copy=False)
result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

if self.obj.ndim == 1:
# i.e. SeriesGroupBy
out = algorithms.take_nd(result._values, ids)
output = obj._constructor(out, index=obj.index, name=obj.name)
else:
output = result.take(ids, axis=0)
output.index = obj.index
output = result.take(ids, axis=self.axis)
output = output.set_axis(obj._get_axis(self.axis), axis=self.axis)
return output

# -----------------------------------------------------------------
Expand Down Expand Up @@ -2150,6 +2150,13 @@ def size(self) -> DataFrame | Series:
"""
result = self.grouper.size()

if self.axis == 1:
return DataFrame(
data=np.tile(result.values, (self.obj.shape[0], 1)),
columns=result.index,
index=self.obj.index,
)

# GH28330 preserve subclassed Series/DataFrames through calls
if issubclass(self.obj._constructor, Series):
result = self._obj_1d_constructor(result, name=self.obj.name)
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/groupby/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ def as_index(request):
return request.param


@pytest.fixture(params=[True, False])
def dropna(request):
    """
    Fixture supplying ``True`` and then ``False`` for a ``dropna`` argument.
    """
    return request.param


@pytest.fixture
def mframe(multiindex_dataframe_random_data):
return multiindex_dataframe_random_data
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/groupby/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,29 @@ def test_size(df, by):
assert result[key] == len(group)


@pytest.mark.parametrize(
"by",
[
[0, 0, 0, 0],
[0, 1, 1, 1],
[1, 0, 1, 1],
[0, None, None, None],
pytest.param([None, None, None, None], marks=pytest.mark.xfail),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhshadrach I'm trying to chase down xfails and I'm not even sure what this means. What do you expect df.groupby(by=[None]*4, ...) to do?

Copy link
Member Author

@rhshadrach rhshadrach Jan 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently the [0, 1, 1, 1] case is returning

0    1
1    3
dtype: int64

When dropna=True, I'd expect 4 * [None] to return Series([], dtype='int64') and when dropna=False, I'd expect

np.nan   4
dtype: int64

],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
    """
    Compare ``groupby(..., axis=axis_1).size()`` with hand-counted group sizes.

    The expected frame has one column per distinct key in ``by`` and one row
    per row of ``df``, each cell holding the number of times that key occurs.
    """
    # GH#45715
    # One entry per distinct key, in order of first appearance in ``by``.
    counts = {key: by.count(key) for key in dict.fromkeys(by)}
    if dropna:
        # With dropna set, the ``None`` key is left out of the expected result.
        counts.pop(None, None)

    expected = DataFrame(counts, index=df.index)
    if sort:
        expected = expected.sort_index(axis=1)

    result = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna).size()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,29 @@ def test_transform_axis_1(request, transformation_func):
tm.assert_equal(result, expected)


def test_transform_axis_1_reducer(request, reduction_func):
    """
    Check ``transform`` with a reducer name when grouping along ``axis=1``.

    For every reducer except ``"size"`` the expected value is obtained by
    transposing, transforming with the same grouping keys, and transposing
    back. ``"size"`` is compared against a hardcoded frame instead.
    """
    # GH#45715
    known_failures = {
        "corrwith",
        "first",
        "idxmax",
        "idxmin",
        "last",
        "ngroup",
        "nth",
    }
    if reduction_func in known_failures:
        request.node.add_marker(
            pytest.mark.xfail(reason="transform incorrectly fails - GH#45986")
        )

    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
    keys = [0, 0, 1]
    result = df.groupby(keys, axis=1).transform(reduction_func)

    if reduction_func != "size":
        expected = df.T.groupby(keys).transform(reduction_func).T
    else:
        # size doesn't behave in the same manner; hardcode expected result
        expected = DataFrame(2 * [[2, 2, 1]], index=df.index, columns=df.columns)
    tm.assert_equal(result, expected)


def test_transform_axis_ts(tsframe):

# make sure that we are setting the axes
Expand Down