diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..74c683638c654 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -132,6 +132,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..2cb4674b2e33a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -211,7 +211,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes): + if not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) @@ -897,13 +897,13 @@ def agg_series( return grouper.get_result() -def _is_indexed_like(obj, axes) -> bool: +def _is_indexed_like(obj, axes, axis: int) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) elif isinstance(obj, DataFrame): - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) return False diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 525a6fe2637c3..665cd12225ad7 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -63,15 +63,8 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object." -) def test_apply_trivial_fail(): # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. 
df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], @@ -1044,3 +1037,23 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: assert type(val) is date + + +def test_apply_by_cols_equals_apply_by_rows_transposed(): + # GH 16646 + # Operating on the columns, or transposing and operating on the rows + # should give the same result. There was previously a bug where the + # by_rows operation would work fine, but by_cols would throw a ValueError + + df = pd.DataFrame( + np.random.random([6, 4]), + columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]), + ) + + by_rows = df.T.groupby(axis=0, level=0).apply( + lambda x: x.droplevel(axis=0, level=0) + ) + by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0)) + + tm.assert_frame_equal(by_cols, by_rows.T) + tm.assert_frame_equal(by_cols, df) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cbfba16223f74..42945be923fa0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -940,10 +940,6 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) tm.assert_frame_equal(result, expected)