From da5693b7e8c76e05d1e1c19bce48adbc74d1ddc3 Mon Sep 17 00:00:00 2001 From: fjetter Date: Mon, 3 Feb 2020 14:13:17 +0100 Subject: [PATCH 1/4] groupy apply: Ensure same index is returned for slow and fast path --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/reduction.pyx | 2 +- pandas/tests/groupby/test_apply.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bfe2dcee40d5e..88a1da7db8ffa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -827,6 +827,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) - Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the result shape was incorrect when the output index was not identical to the input index (:issue:`31612`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0988cd7ff0dde..18422c2f86129 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -502,7 +502,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is not chunk.index: + if not piece.index.equals(chunk.index): mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e2b5118922a5a..cc1a74fd8cbaf 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -921,3 +921,22 @@ def fn(x): name="col2", ) tm.assert_series_equal(result, expected) + + +def test_apply_fast_slow_identical(): + # GH 31613 + + df = DataFrame({"A": [0, 0, 1], "b": range(3)}) + + # For simple index structures we check for fast/slow apply using + # an identity check on in/output + def slow(group): + return group + + def fast(group): + return group.copy() + + fast_df = df.groupby("A").apply(fast) + slow_df = df.groupby("A").apply(slow) + + tm.assert_frame_equal(fast_df, slow_df) From 95ac998088f09a9b7e5422da675205d775d8ae85 Mon Sep 17 00:00:00 2001 From: fjetter Date: Tue, 4 Feb 2020 09:33:48 +0100 Subject: [PATCH 2/4] Add test case for GH14927 --- pandas/tests/groupby/test_apply.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index cc1a74fd8cbaf..d3298493b0dc7 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -940,3 +940,20 @@ def fast(group): slow_df = df.groupby("A").apply(slow) tm.assert_frame_equal(fast_df, slow_df) + + +def test_gh14927(): + # GH 14927 + df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + + df1 = df.groupby("g").apply(lambda x: x) + + df2 = df.groupby("g").apply(lambda x: x[:]) + + df3 = df.groupby("g").apply(lambda x: x.copy(deep=False)) + + df4 = df.groupby("g").apply(lambda x: x.copy(deep=True)) + + tm.assert_frame_equal(df1, df2) + tm.assert_frame_equal(df2, df3) + tm.assert_frame_equal(df3, df4) From 8503c04591d322946a438a326d2bce9d23fe9ffb Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 15 Apr 2020 11:14:20 +0200 Subject: [PATCH 3/4] Move tests --- pandas/tests/groupby/test_apply.py | 76 ++++++++++++++++-------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d3298493b0dc7..bc8067212d60e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -190,6 +190,46 @@ def f_constant_df(group): assert names == group_names +def test_apply_fast_slow_identical(): + # GH 31613 + + df = DataFrame({"A": [0, 0, 1], "b": range(3)}) + + # For simple index structures we check for fast/slow apply using + # an identity check on in/output + def slow(group): + return group + + def fast(group): + return group.copy() + + fast_df = df.groupby("A").apply(fast) + slow_df = df.groupby("A").apply(slow) + + tm.assert_frame_equal(fast_df, slow_df) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x, + lambda x: x[:], + lambda x: x.copy(deep=False), + lambda x: x.copy(deep=True), + ], +) +def test_groupby_apply_identity_maybecopy_index_identical(func): + # GH 14927 + # Whether the function returns a copy of the input data or not should not + # have an impact on the index structure of the result since this is not + # transparent to the user + + df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + + result = df.groupby("g").apply(func) + tm.assert_frame_equal(result, df) + + def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = DataFrame( @@ -921,39 +961,3 @@ def fn(x): name="col2", ) tm.assert_series_equal(result, expected) - - -def test_apply_fast_slow_identical(): - # GH 31613 - - df = DataFrame({"A": [0, 0, 1], "b": range(3)}) - - # For simple index structures we check for fast/slow apply using - # an identity check on in/output - def slow(group): - return group - - def fast(group): - return group.copy() - - fast_df = df.groupby("A").apply(fast) - slow_df = df.groupby("A").apply(slow) - - tm.assert_frame_equal(fast_df, slow_df) - - -def test_gh14927(): - # GH 14927 - df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - df1 = df.groupby("g").apply(lambda x: x) - - df2 = df.groupby("g").apply(lambda x: x[:]) - - df3 = df.groupby("g").apply(lambda x: x.copy(deep=False)) - - df4 = df.groupby("g").apply(lambda x: x.copy(deep=True)) - - tm.assert_frame_equal(df1, df2) - tm.assert_frame_equal(df2, df3) - tm.assert_frame_equal(df3, df4) From bdec31737cbb2e09606cdff99eb613553678f435 Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 27 May 2020 14:26:10 +0200 Subject: [PATCH 4/4] More descriptive bug report in whatsnew --- doc/source/whatsnew/v1.1.0.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 88a1da7db8ffa..0495b21d2cb93 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -827,7 +827,10 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) - Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the result shape was incorrect when the output index was not identical to the input index (:issue:`31612`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed + to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical + indices. In particular, the result index shape might change if a copy of the input would be returned. + The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) Reshaping ^^^^^^^^^