From cc0029273c22fc41844c83c40b855dd090db9351 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 17 Jan 2020 20:04:36 +0800 Subject: [PATCH 1/9] BUG: groupby transform fillna produces wrong result (GH30918) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 6 ++++++ pandas/tests/groupby/test_transform.py | 13 +++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa562838c8f7c..b941f53d764e4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1112,6 +1112,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) +- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with ``func='fillna'`` (:issue:`30918`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c49677fa27a31..2c95e5f590d56 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1428,6 +1428,12 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) + # GH 30918 + if len(result) != self.ngroups: + # if func does not aggregate each group, + # we don't want to broadcast the result + return self._transform_general(func, *args, **kwargs) + return self._transform_fast(result, func) def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6c05c4038a829..6fff1b6372d8b 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -317,6 +317,19 @@ def test_dispatch_transform(tsframe): tm.assert_frame_equal(filled, expected) +def test_transform_fillna(): + # GH 30918 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"], + "B": [1, 2, np.nan, 3, 3, np.nan, 4], + } + ) + result = df.groupby("A").transform("fillna", value=9) + expected = df[["B"]].fillna(value=9) + tm.assert_frame_equal(result, expected) + + def test_transform_select_columns(df): f = lambda x: x.mean() result = df.groupby("A")[["C", "D"]].transform(f) From bfb6ff3f9e564109f541f6a0fbedbaa95397f12f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 12:48:33 +0800 Subject: [PATCH 2/9] BUG: corrected an earlier logic and cleaned up code (GH30918) --- pandas/core/groupby/generic.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2c95e5f590d56..827ae875c4b05 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1418,23 +1418,16 @@ def transform(self, func, *args, **kwargs): # and deal with possible broadcasting below. result = getattr(self, func)(*args, **kwargs) - # a reduction transform - if not isinstance(result, DataFrame): - return self._transform_general(func, *args, **kwargs) - - obj = self._obj_with_exclusions - - # nuisance columns - if not result.columns.equals(obj.columns): - return self._transform_general(func, *args, **kwargs) - - # GH 30918 - if len(result) != self.ngroups: - # if func does not aggregate each group, - # we don't want to broadcast the result - return self._transform_general(func, *args, **kwargs) - - return self._transform_fast(result, func) + if ( + isinstance(result, DataFrame) + and result.columns.equals(self._obj_with_exclusions.columns) + and result.index.equals(self.grouper.result_index) + ): + # GH 30918 + # Call _transform_fast only when we know func is an aggregation + return self._transform_fast(result, func) + + return self._transform_general(func, *args, **kwargs) def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: """ From a7923d7ee2bc90a37458a9b1ace2a82616c8da24 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 23:50:30 +0800 Subject: [PATCH 3/9] DOC: moved whatsnew to V1.1.0 --- doc/source/whatsnew/v1.0.0.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 29fc97aaaed07..3bd86bb02155f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1131,7 +1131,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) -- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with ``func='fillna'`` (:issue:`30918`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8133e54c934ad..dd590709dfaed 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -133,9 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- - +- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with ``func='fillna'`` (:issue:`30918`) Reshaping ^^^^^^^^^ From 5b222ad755039eb844c690812f4f8c380685d7b2 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 22 Jan 2020 11:26:17 +0800 Subject: [PATCH 4/9] BUG: generalized fix to other non-reduction functions and added tests (GH30918) --- pandas/core/groupby/generic.py | 25 ++++++++++++------------- pandas/tests/groupby/test_transform.py | 22 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ac93fc5ea6b5a..27dd6e953c219 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1416,19 +1416,18 @@ def transform(self, func, *args, **kwargs): # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) - - if ( - isinstance(result, DataFrame) - and result.columns.equals(self._obj_with_exclusions.columns) - and result.index.equals(self.grouper.result_index) - ): - # GH 30918 - # Call _transform_fast only when we know func is an aggregation - return self._transform_fast(result, func) + # GH 30918 + # Use _transform_fast only when we know func is an aggregation + if func in base.reduction_kernels: + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + + if isinstance(result, DataFrame) and result.columns.equals( + self._obj_with_exclusions.columns + ): + return self._transform_fast(result, func) return self._transform_general(func, *args, **kwargs) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6fff1b6372d8b..2e3cc33d5e7bf 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -317,16 +317,30 @@ def test_dispatch_transform(tsframe): tm.assert_frame_equal(filled, expected) -def test_transform_fillna(): +@pytest.mark.parametrize("func", ["fillna", "ffill", "bfill", "shift", "cumsum"]) +def test_transform_non_reduction_func(func): # GH 30918 - df = pd.DataFrame( + df = DataFrame( { "A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"], "B": [1, 2, np.nan, 3, 3, np.nan, 4], } ) - result = df.groupby("A").transform("fillna", value=9) - expected = df[["B"]].fillna(value=9) + expected = { + "fillna": [1.0, 2, 0, 3, 3, 0, 4], + "ffill": [1.0, 2, 2, 3, 3, 3, 4], + "bfill": [1, 2, 3, 3, 3, np.nan, 4], + "shift": [np.nan, 1, 2, np.nan, np.nan, 3, np.nan], + "cumsum": [1, 3, np.nan, 6, 3, np.nan, 4], + } + expected = DataFrame(expected[func], columns=["B"]) + grouped = df.groupby("A") + + if func == "fillna": + result = grouped.transform(func, value=0) + else: + result = grouped.transform(func) + tm.assert_frame_equal(result, expected) From 942512897d8cf13ae269e1aa5abdf59134509ccb Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 22 Jan 2020 21:57:30 +0800 Subject: [PATCH 5/9] TST: added more tests (GH30918) --- pandas/tests/groupby/test_transform.py | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 2e3cc33d5e7bf..64dbd9c091697 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -317,8 +317,7 @@ def test_dispatch_transform(tsframe): tm.assert_frame_equal(filled, expected) -@pytest.mark.parametrize("func", ["fillna", "ffill", "bfill", "shift", "cumsum"]) -def test_transform_non_reduction_func(func): +def test_transform_transformation_func(transformation_func): # GH 30918 df = DataFrame( { @@ -326,20 +325,20 @@ def test_transform_non_reduction_func(func): "B": [1, 2, np.nan, 3, 3, np.nan, 4], } ) - expected = { - "fillna": [1.0, 2, 0, 3, 3, 0, 4], - "ffill": [1.0, 2, 2, 3, 3, 3, 4], - "bfill": [1, 2, 3, 3, 3, np.nan, 4], - "shift": [np.nan, 1, 2, np.nan, np.nan, 3, np.nan], - "cumsum": [1, 3, np.nan, 6, 3, np.nan, 4], - } - expected = DataFrame(expected[func], columns=["B"]) - grouped = df.groupby("A") - if func == "fillna": - result = grouped.transform(func, value=0) + if transformation_func in ["pad", "backfill", "tshift", "corrwith", "cumcount"]: + # These transformation functions are not yet covered in this test + return + elif transformation_func == "fillna": + test_op = lambda x: x.transform("fillna", value=0) + mock_op = lambda x: x.fillna(value=0) else: - result = grouped.transform(func) + test_op = lambda x: x.transform(transformation_func) + mock_op = lambda x: getattr(x, transformation_func)() + + result = test_op(df.groupby("A")) + groups = [df[["B"]].iloc[:4], df[["B"]].iloc[4:6], df[["B"]].iloc[6:]] + expected = concat([mock_op(g) for g in groups]) tm.assert_frame_equal(result, expected) From ab5f11d2d824d91cda5f7958778b60ca93537747 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Fri, 24 Jan 2020 11:32:47 +0800 Subject: [PATCH 6/9] Update pandas/tests/groupby/test_transform.py Use xfail for cases that are not applicable yet Co-Authored-By: William Ayd --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 64dbd9c091697..207223145c81a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -328,7 +328,7 @@ def test_transform_transformation_func(transformation_func): if transformation_func in ["pad", "backfill", "tshift", "corrwith", "cumcount"]: # These transformation functions are not yet covered in this test - return + pytest.xfail() elif transformation_func == "fillna": test_op = lambda x: x.transform("fillna", value=0) mock_op = lambda x: x.fillna(value=0) From 8a17c1e7dd88a6dedc2adce06ca1071e5b5d1e56 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 24 Jan 2020 12:00:11 +0800 Subject: [PATCH 7/9] DOC: updated whatsnew (GH30918) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/tests/tslibs/test_timestamps.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/tslibs/test_timestamps.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b640f53564c40..ad58232c81b23 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -169,7 +169,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) -- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with ``func='fillna'`` (:issue:`30918`) +- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/tslibs/test_timestamps.py b/pandas/tests/tslibs/test_timestamps.py new file mode 100644 index 0000000000000..9d29732edf09a --- /dev/null +++ b/pandas/tests/tslibs/test_timestamps.py @@ -0,0 +1,10 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds + +from pandas import Timestamp + + +def test(): + Timestamp(year=2020) \ No newline at end of file From 56c70234e94172d7cbdd5fe1dc935f43d2d4f067 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 24 Jan 2020 12:04:04 +0800 Subject: [PATCH 8/9] CLN: removed accidentally added file (GH30918) --- pandas/tests/tslibs/test_timestamps.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 pandas/tests/tslibs/test_timestamps.py diff --git a/pandas/tests/tslibs/test_timestamps.py b/pandas/tests/tslibs/test_timestamps.py deleted file mode 100644 index 9d29732edf09a..0000000000000 --- a/pandas/tests/tslibs/test_timestamps.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np -import pytest - -from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds - -from pandas import Timestamp - - -def test(): - Timestamp(year=2020) \ No newline at end of file From 1bd45d4f5deb79f03a4d17287f84ee2763c32831 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 24 Jan 2020 13:12:11 +0800 Subject: [PATCH 9/9] DOC: added reference to new issues in xfail message (GH30918) --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 207223145c81a..8967ef06f50fb 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -328,7 +328,7 @@ def test_transform_transformation_func(transformation_func): if transformation_func in ["pad", "backfill", "tshift", "corrwith", "cumcount"]: # These transformation functions are not yet covered in this test - pytest.xfail() + pytest.xfail("See GH 31269 and GH 31270") elif transformation_func == "fillna": test_op = lambda x: x.transform("fillna", value=0) mock_op = lambda x: x.fillna(value=0)