From ea52d263e7c1f8567dada8b58dadfc5afb6e574b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 2 Mar 2022 00:39:01 -0500 Subject: [PATCH 1/8] BUG: Fix some cases of groupby(...).transform with dropna=True --- pandas/core/generic.py | 20 +- pandas/core/groupby/groupby.py | 11 +- pandas/core/internals/managers.py | 11 +- .../tests/groupby/transform/test_transform.py | 178 ++++++++++++++++++ 4 files changed, 214 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 83d0a95b8adb2..aaad2555e0186 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3693,10 +3693,26 @@ class max_speed nv.validate_take((), kwargs) + return self._take(indices, axis) + + def _take( + self: NDFrameT, + indices, + axis=0, + convert_indices: bool_t = True, + ) -> NDFrameT: + """ + Internal version of the `take` allowing specification of additional args. + + See the docstring of `take` for full explanation of the parameters. + """ self._consolidate_inplace() new_data = self._mgr.take( - indices, axis=self._get_block_manager_axis(axis), verify=True + indices, + axis=self._get_block_manager_axis(axis), + verify=True, + convert_indices=convert_indices, ) return self._constructor(new_data).__finalize__(self, method="take") @@ -3708,7 +3724,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT: See the docstring of `take` for full explanation of the parameters. """ - result = self.take(indices=indices, axis=axis) + result = self._take(indices=indices, axis=axis) # Maybe set copy if we didn't actually change the index. if not result._get_axis(axis).equals(self._get_axis(axis)): result._set_is_copy(self) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 65e8b238cd476..d07e1bfb6ad09 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1646,7 +1646,9 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: out = algorithms.take_nd(result._values, ids) output = obj._constructor(out, index=obj.index, name=obj.name) else: - output = result.take(ids, axis=self.axis) + # Don't convert indices: negative indices need to give rise + # to null values in the result + output = result._take(ids, axis=self.axis, convert_indices=False) output = output.set_axis(obj._get_axis(self.axis), axis=self.axis) return output @@ -1699,9 +1701,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: else: out = np.repeat(out[np.r_[run[1:], True]], rep) - out + if self.grouper.has_dropped_na: + out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) + else: + out = out.astype(np.int64, copy=False) + rev = np.empty(count, dtype=np.intp) rev[sorter] = np.arange(count, dtype=np.intp) - return out[rev].astype(np.int64, copy=False) + return out[rev] # ----------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4b8f1aae75b1b..4b5b979551917 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -829,7 +829,13 @@ def _make_na_block( block_values.fill(fill_value) return new_block_2d(block_values, placement=placement) - def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: + def take( + self: T, + indexer, + axis: int = 1, + verify: bool = True, + convert_indices: bool = True, + ) -> T: """ Take items along any axis. @@ -851,7 +857,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: ) n = self.shape[axis] - indexer = maybe_convert_indices(indexer, n, verify=verify) + if convert_indices: + indexer = maybe_convert_indices(indexer, n, verify=verify) new_labels = self.axes[axis].take(indexer) return self.reindex_indexer( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5cc9ac9b09ae9..9645c4952a46c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1308,3 +1308,181 @@ def test_null_group_lambda_self(sort, dropna): gb = df.groupby("A", dropna=dropna, sort=sort) result = gb.transform(lambda x: x) tm.assert_frame_equal(result, expected) + + +def test_null_group_str_reducer(request, dropna, reduction_func): + # GH 17093 + if reduction_func == "ngroup": + msg = "ngroup fails" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + index = [1, 2, 3, 4] # test transform preserves non-standard index + df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index) + gb = df.groupby("A", dropna=dropna) + + if reduction_func == "corrwith": + args = (df["B"],) + elif reduction_func == "nth": + args = (0,) + else: + args = () + + # Manually handle reducers that don't fit the generic pattern + # Set expected with dropna=False, then replace if necessary + if reduction_func == "first": + expected = DataFrame({"B": [1, 1, 2, 2]}, index=index) + elif reduction_func == "last": + expected = DataFrame({"B": [2, 2, 3, 3]}, index=index) + elif reduction_func == "nth": + expected = DataFrame({"B": [1, 1, 2, 2]}, index=index) + elif reduction_func == "size": + expected = Series([2, 2, 2, 2], index=index) + elif reduction_func == "corrwith": + expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index) + else: + expected_gb = df.groupby("A", dropna=False) + buffer = [] + for idx, group in expected_gb: + res = getattr(group["B"], reduction_func)() + buffer.append(Series(res, index=group.index)) + expected = concat(buffer).to_frame("B") + if dropna: + dtype = object if reduction_func in ("any", "all") else float + expected = expected.astype(dtype) + if expected.ndim == 2: + expected.iloc[[2, 3], 0] = np.nan + else: + expected.iloc[[2, 3]] = np.nan + + result = gb.transform(reduction_func, *args) + tm.assert_equal(result, expected) + + +def test_null_group_str_transformer(request, dropna, transformation_func): + # GH 17093 + if transformation_func == "tshift": + msg = "tshift requires timeseries" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif dropna and transformation_func in ( + "backfill", + "bfill", + "cummax", + "cummin", + "cumsum", + "ffill", + "fillna", + "pad", + "rank", + ): + msg = "produces incorrect results when nans are present" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + args = (0,) if transformation_func == "fillna" else () + df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3]) + gb = df.groupby("A", dropna=dropna) + + buffer = [] + for k, (idx, group) in enumerate(gb): + if transformation_func == "cumcount": + # DataFrame has no cumcount method + res = DataFrame({"B": range(len(group))}, index=group.index) + elif transformation_func == "ngroup": + res = DataFrame(len(group) * [k], index=group.index, columns=["B"]) + else: + res = getattr(group[["B"]], transformation_func)(*args) + buffer.append(res) + if dropna: + dtype = object if transformation_func in ("any", "all") else None + buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"])) + expected = concat(buffer) + + if transformation_func in ("cumcount", "ngroup"): + # ngroup/cumcount always returns a Series as it counts the groups, not values + expected = expected["B"].rename(None) + + result = gb.transform(transformation_func, *args) + tm.assert_equal(result, expected) + + +def test_null_group_str_reducer_series(request, dropna, reduction_func): + if reduction_func == "corrwith": + msg = "corrwith not implemented for SeriesGroupBy" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + + if reduction_func == "ngroup": + msg = "ngroup fails" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + + # GH 17093 + index = [1, 2, 3, 4] # test transform preserves non-standard index + ser = Series([1, 2, 2, 3], index=index) + gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna) + + if reduction_func == "corrwith": + args = (ser,) + elif reduction_func == "nth": + args = (0,) + else: + args = () + + # Manually handle reducers that don't fit the generic pattern + # Set expected with dropna=False, then replace if necessary + if reduction_func == "first": + expected = Series([1, 1, 2, 2], index=index) + elif reduction_func == "last": + expected = Series([2, 2, 3, 3], index=index) + elif reduction_func == "nth": + expected = Series([1, 1, 2, 2], index=index) + elif reduction_func == "size": + expected = Series([2, 2, 2, 2], index=index) + elif reduction_func == "corrwith": + expected = Series([1, 1, 2, 2], index=index) + else: + expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False) + buffer = [] + for idx, group in expected_gb: + res = getattr(group, reduction_func)() + buffer.append(Series(res, index=group.index)) + expected = concat(buffer) + if dropna: + dtype = object if reduction_func in ("any", "all") else float + expected = expected.astype(dtype) + expected.iloc[[2, 3]] = np.nan + + result = gb.transform(reduction_func, *args) + tm.assert_series_equal(result, expected) + + +def test_null_group_str_transformer_series(request, dropna, transformation_func): + # GH 17093 + if transformation_func == "tshift": + msg = "tshift requires timeseries" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif dropna and transformation_func in ( + "cummax", + "cummin", + "cumsum", + "fillna", + "rank", + ): + msg = "produces incorrect results when nans are present" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + args = (0,) if transformation_func == "fillna" else () + ser = Series([1, 2, 2], index=[1, 2, 3]) + gb = ser.groupby([1, 1, np.nan], dropna=dropna) + + buffer = [] + for k, (idx, group) in enumerate(gb): + if transformation_func == "cumcount": + # Series has no cumcount method + res = Series(range(len(group)), index=group.index) + elif transformation_func == "ngroup": + res = Series(k, index=group.index) + else: + res = getattr(group, transformation_func)(*args) + buffer.append(res) + if dropna: + dtype = object if transformation_func in ("any", "all") else None + buffer.append(Series([np.nan], index=[3], dtype=dtype)) + expected = concat(buffer) + + result = gb.transform(transformation_func, *args) + tm.assert_equal(result, expected) From 7a62db4274d92f68596103915e80ecfc57fa3c73 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 2 Mar 2022 20:39:09 -0500 Subject: [PATCH 2/8] GH # in test --- pandas/tests/groupby/transform/test_transform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 9645c4952a46c..4e0781d54da73 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1403,6 +1403,7 @@ def test_null_group_str_transformer(request, dropna, transformation_func): def test_null_group_str_reducer_series(request, dropna, reduction_func): + # GH 17093 if reduction_func == "corrwith": msg = "corrwith not implemented for SeriesGroupBy" request.node.add_marker(pytest.mark.xfail(reason=msg)) From cf8e4565a95bb0f0955c4ab2028f334c35491803 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 2 Mar 2022 20:45:40 -0500 Subject: [PATCH 3/8] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 3bc9a1ef1ca48..e9ab38cfe3a58 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -82,17 +82,32 @@ did not have the same index as the input. .. code-block:: ipython + In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum()) + Out[3]: + b + 0 5 + 1 5 + In [3]: df.groupby('a', dropna=True).transform(lambda x: x) Out[3]: b 0 2 1 3 + In [3]: df.groupby('a', dropna=True).transform('sum') + Out[3]: + b + 0 5 + 1 5 + 2 5 + *New behavior*: .. ipython:: python + df.groupby('a', dropna=True).transform(lambda x: x.sum()) df.groupby('a', dropna=True).transform(lambda x: x) + df.groupby('a', dropna=True).transform('sum') .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: From c6812366feeb980e5b47ffd9159ce6918e9d923c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 2 Mar 2022 20:51:17 -0500 Subject: [PATCH 4/8] docstring --- pandas/core/internals/managers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4b5b979551917..985b1115f2ced 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -844,6 +844,8 @@ def take( verify : bool, default True Check that all entries are between 0 and len(self) - 1, inclusive. Pass verify=False if this check has been done by the caller. + convert_indices : bool, default True + Whether to attempt to convert indices to positive values. Returns ------- From d06acd50594a67bda07fccb2a40e730af7f633c9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 4 Mar 2022 21:10:38 -0500 Subject: [PATCH 5/8] fixups --- pandas/core/internals/array_manager.py | 11 ++++++-- .../tests/groupby/transform/test_transform.py | 25 ++++++++++++------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 3e499c99ac144..eeb250e2a0022 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -640,7 +640,13 @@ def _reindex_indexer( return type(self)(new_arrays, new_axes, verify_integrity=False) - def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: + def take( + self: T, + indexer, + axis: int = 1, + verify: bool = True, + convert_indices: bool = True, + ) -> T: """ Take items along any axis. """ @@ -656,7 +662,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: raise ValueError("indexer should be 1-dimensional") n = self.shape_proper[axis] - indexer = maybe_convert_indices(indexer, n, verify=verify) + if convert_indices: + indexer = maybe_convert_indices(indexer, n, verify=verify) new_labels = self._axes[axis].take(indexer) return self._reindex_indexer( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4e0781d54da73..f0bfb037dcd3d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1357,21 +1357,28 @@ def test_null_group_str_reducer(request, dropna, reduction_func): tm.assert_equal(result, expected) -def test_null_group_str_transformer(request, dropna, transformation_func): +def test_null_group_str_transformer( + request, using_array_manager, dropna, transformation_func +): # GH 17093 - if transformation_func == "tshift": - msg = "tshift requires timeseries" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif dropna and transformation_func in ( - "backfill", - "bfill", + xfails_block = ( "cummax", "cummin", "cumsum", - "ffill", "fillna", - "pad", "rank", + "backfill", + "ffill", + "bfill", + "pad", + ) + xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank") + if transformation_func == "tshift": + msg = "tshift requires timeseries" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif dropna and ( + (not using_array_manager and transformation_func in xfails_block) + or (using_array_manager and transformation_func in xfails_array) ): msg = "produces incorrect results when nans are present" request.node.add_marker(pytest.mark.xfail(reason=msg)) From 6367db9d7f8b9c9101495ab0d805236edee45299 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 5 Mar 2022 05:45:01 -0500 Subject: [PATCH 6/8] xfail corrwith --- pandas/tests/groupby/transform/test_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index f0bfb037dcd3d..0dc3cadc449e6 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1312,8 +1312,8 @@ def test_null_group_lambda_self(sort, dropna): def test_null_group_str_reducer(request, dropna, reduction_func): # GH 17093 - if reduction_func == "ngroup": - msg = "ngroup fails" + if reduction_func in ("corrwith", "ngroup"): + msg = "incorrectly raises" request.node.add_marker(pytest.mark.xfail(reason=msg)) index = [1, 2, 3, 4] # test transform preserves non-standard index df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index) From e81dc16a7dd8acbe59974648dffbb19df3d85dc8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 5 Mar 2022 07:44:53 -0500 Subject: [PATCH 7/8] Suppress warning --- pandas/tests/groupby/transform/test_transform.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 0dc3cadc449e6..5415c99f5239a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1405,7 +1405,11 @@ def test_null_group_str_transformer( # ngroup/cumcount always returns a Series as it counts the groups, not values expected = expected["B"].rename(None) - result = gb.transform(transformation_func, *args) + warn = FutureWarning if transformation_func in ("backfill", "pad") else None + msg = f"{transformation_func} is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + tm.assert_equal(result, expected) @@ -1492,5 +1496,8 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func) buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - result = gb.transform(transformation_func, *args) + warn = FutureWarning if transformation_func in ("backfill", "pad") else None + msg = f"{transformation_func} is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) From f0511932c288b778914f1bc630599256faac0d2d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 7 Mar 2022 16:50:18 -0500 Subject: [PATCH 8/8] GH# --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d07e1bfb6ad09..a4dcf6e010c2b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1646,6 +1646,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: out = algorithms.take_nd(result._values, ids) output = obj._constructor(out, index=obj.index, name=obj.name) else: + # GH#46209 # Don't convert indices: negative indices need to give rise # to null values in the result output = result._take(ids, axis=self.axis, convert_indices=False)