diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 14e9fda1b910c..42cd34c94b069 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -83,32 +83,44 @@ did not have the same index as the input.
 
 .. code-block:: ipython
 
-    In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    In [3]: # Value in the last row should be np.nan
+            df.groupby('a', dropna=True).transform('sum')
     Out[3]:
        b
     0  5
     1  5
+    2  5
 
-    In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
+    In [3]: # Should have one additional row with the value np.nan
+            df.groupby('a', dropna=True).transform(lambda x: x.sum())
     Out[3]:
        b
-    0  2
-    1  3
+    0  5
+    1  5
+
+    In [3]: # The value in the last row is np.nan interpreted as an integer
+            df.groupby('a', dropna=True).transform('ffill')
+    Out[3]:
+       b
+    0  2
+    1  3
+    2 -9223372036854775808
 
-    In [3]: df.groupby('a', dropna=True).transform('sum')
+    In [3]: # Should have one additional row with the value np.nan
+            df.groupby('a', dropna=True).transform(lambda x: x)
     Out[3]:
        b
-    0  5
-    1  5
-    2  5
+    0  2
+    1  3
 
 *New behavior*:
 
 .. ipython:: python
 
+    df.groupby('a', dropna=True).transform('sum')
     df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    df.groupby('a', dropna=True).transform('ffill')
     df.groupby('a', dropna=True).transform(lambda x: x)
-    df.groupby('a', dropna=True).transform('sum')
 
 
 .. _whatsnew_150.notable_bug_fixes.visualization:
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
index 0c8474c9badc7..ec9a2e4a4b5c0 100644
--- a/pandas/core/groupby/base.py
+++ b/pandas/core/groupby/base.py
@@ -70,7 +70,6 @@ class OutputKey:
         "mean",
         "median",
         "min",
-        "ngroup",
         "nth",
         "nunique",
         "prod",
@@ -113,6 +112,7 @@ def maybe_normalize_deprecated_kernels(kernel):
         "diff",
         "ffill",
         "fillna",
+        "ngroup",
         "pad",
         "pct_change",
         "rank",
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 9f80c8f91ac6f..4a617d1b63639 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -62,6 +62,7 @@ class providing the base-class of operations.
 )
 from pandas.util._exceptions import find_stack_level
 
+from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_datetime64_dtype,
@@ -950,7 +951,13 @@ def curried(x):
             if name in base.plotting_methods:
                 return self.apply(curried)
 
-            return self._python_apply_general(curried, self._obj_with_exclusions)
+            result = self._python_apply_general(curried, self._obj_with_exclusions)
+
+            if self.grouper.has_dropped_na and name in base.transformation_kernels:
+                # result will have dropped rows due to nans, fill with null
+                # and ensure index is ordered same as the input
+                result = self._set_result_index_ordered(result)
+            return result
 
         wrapper.__name__ = name
         return wrapper
@@ -2608,7 +2615,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
                 #  then there will be no -1s in indexer, so we can use
                 #  the original dtype (no need to ensure_dtype_can_hold_na)
                 if isinstance(values, np.ndarray):
-                    out = np.empty(values.shape, dtype=values.dtype)
+                    dtype = values.dtype
+                    if self.grouper.has_dropped_na:
+                        # dropped null groups give rise to nan in the result
+                        dtype = ensure_dtype_can_hold_na(values.dtype)
+                    out = np.empty(values.shape, dtype=dtype)
                 else:
                     out = type(values)._empty(values.shape, dtype=values.dtype)
 
@@ -3114,9 +3125,16 @@ def ngroup(self, ascending: bool = True):
         """
         with self._group_selection_context():
             index = self._selected_obj.index
-            result = self._obj_1d_constructor(
-                self.grouper.group_info[0], index, dtype=np.int64
-            )
+            comp_ids = self.grouper.group_info[0]
+
+            dtype: type
+            if self.grouper.has_dropped_na:
+                comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
+                dtype = np.float64
+            else:
+                dtype = np.int64
+
+            result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
             if not ascending:
                 result = self.ngroups - 1 - result
             return result
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 15bc262997075..5f15e11c4740c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -98,15 +98,25 @@ class WrappedCythonOp:
     """
     Dispatch logic for functions defined in _libs.groupby
+
+    Parameters
+    ----------
+    kind: str
+        Whether the operation is an aggregate or transform.
+    how: str
+        Operation name, e.g. "mean".
+    has_dropped_na: bool
+        True precisely when dropna=True and the grouper contains a null value.
     """
 
     # Functions for which we do _not_ attempt to cast the cython result
     #  back to the original dtype.
     cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])
 
-    def __init__(self, kind: str, how: str) -> None:
+    def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         self.kind = kind
         self.how = how
+        self.has_dropped_na = has_dropped_na
 
     _CYTHON_FUNCTIONS = {
         "aggregate": {
@@ -194,7 +204,9 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
             values = ensure_float64(values)
 
         elif values.dtype.kind in ["i", "u"]:
-            if how in ["add", "var", "prod", "mean", "ohlc"]:
+            if how in ["add", "var", "prod", "mean", "ohlc"] or (
+                self.kind == "transform" and self.has_dropped_na
+            ):
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
@@ -582,6 +594,10 @@ def _call_cython_op(
 
         result = result.T
 
+        if self.how == "rank" and self.has_dropped_na:
+            # TODO: Wouldn't need this if group_rank supported mask
+            result = np.where(comp_ids < 0, np.nan, result)
+
         if self.how not in self.cast_blocklist:
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             #  "rank" is the only member of cast_blocklist we get here
@@ -959,7 +975,7 @@ def _cython_operation(
         """
         assert kind in ["transform", "aggregate"]
 
-        cy_op = WrappedCythonOp(kind=kind, how=how)
+        cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na)
 
         ids, _, _ = self.group_info
         ngroups = self.ngroups
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
index 3384f9d46029a..2d4deff8ebc4a 100644
--- a/pandas/tests/apply/test_frame_transform.py
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -139,6 +139,10 @@ def test_transform_bad_dtype(op, frame_or_series, request):
                 raises=ValueError, reason="GH 40418: rank does not raise a TypeError"
             )
         )
+    elif op == "ngroup":
+        request.node.add_marker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
 
     obj = DataFrame({"A": 3 * [object]})  # DataFrame that will fail on most transforms
     obj = tm.get_obj(obj, frame_or_series)
@@ -157,9 +161,14 @@ def test_transform_bad_dtype(op, frame_or_series, request):
 
 
 @pytest.mark.parametrize("op", frame_kernels_raise)
-def test_transform_partial_failure_typeerror(op):
+def test_transform_partial_failure_typeerror(request, op):
     # GH 35964
 
+    if op == "ngroup":
+        request.node.add_marker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+
     # Using object makes most transform kernels fail
     df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]})
 
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index 554fd4174c7a4..2af063dee8b55 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -243,8 +243,12 @@ def test_agg_cython_table_transform_frame(df, func, expected, axis):
 
 
 @pytest.mark.parametrize("op", series_transform_kernels)
-def test_transform_groupby_kernel_series(string_series, op):
+def test_transform_groupby_kernel_series(request, string_series, op):
     # GH 35964
+    if op == "ngroup":
+        request.node.add_marker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
     # TODO(2.0) Remove after pad/backfill deprecation enforced
     op = maybe_normalize_deprecated_kernels(op)
     args = [0.0] if op == "fillna" else []
@@ -255,9 +259,15 @@ def test_transform_groupby_kernel_series(string_series, op):
 
 
 @pytest.mark.parametrize("op", frame_transform_kernels)
-def test_transform_groupby_kernel_frame(axis, float_frame, op):
+def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
     # TODO(2.0) Remove after pad/backfill deprecation enforced
     op = maybe_normalize_deprecated_kernels(op)
+
+    if op == "ngroup":
+        request.node.add_marker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+
     # GH 35964
     args = [0.0] if op == "fillna" else []
 
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index 83b8d5c29bbf0..7830c229ece2f 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -658,6 +658,8 @@ def test_non_unique_index():
     )
     result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
     expected = Series(
-        [1.0] * 4, index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, name="value"
+        [1.0, 1.0, 1.0, np.nan],
+        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
+        name="value",
     )
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 0f450adf67b23..3042e38d9014c 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -168,6 +168,10 @@ def test_transform_axis_1(request, transformation_func):
 
     # TODO(2.0) Remove after pad/backfill deprecation enforced
     transformation_func = maybe_normalize_deprecated_kernels(transformation_func)
+    if transformation_func == "ngroup":
+        msg = "ngroup fails with axis=1: #45986"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
     warn = None
     if transformation_func == "tshift":
         warn = FutureWarning
@@ -383,6 +387,15 @@ def test_transform_transformation_func(request, transformation_func):
     elif transformation_func == "fillna":
         test_op = lambda x: x.transform("fillna", value=0)
         mock_op = lambda x: x.fillna(value=0)
+    elif transformation_func == "ngroup":
+        test_op = lambda x: x.transform("ngroup")
+        counter = -1
+
+        def mock_op(x):
+            nonlocal counter
+            counter += 1
+            return Series(counter, index=x.index)
+
     elif transformation_func == "tshift":
         msg = (
             "Current behavior of groupby.tshift is inconsistent with other "
@@ -394,10 +407,14 @@ def test_transform_transformation_func(request, transformation_func):
         mock_op = lambda x: getattr(x, transformation_func)()
 
     result = test_op(df.groupby("A"))
-    groups = [df[["B"]].iloc[:4], df[["B"]].iloc[4:6], df[["B"]].iloc[6:]]
-    expected = concat([mock_op(g) for g in groups])
+    # pass the group in same order as iterating `for ... in df.groupby(...)`
+    # but reorder to match df's index since this is a transform
+    groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]]
+    expected = concat([mock_op(g) for g in groups]).sort_index()
+    # sort_index does not preserve the freq
+    expected = expected.set_axis(df.index)
 
-    if transformation_func == "cumcount":
+    if transformation_func in ("cumcount", "ngroup"):
         tm.assert_series_equal(result, expected)
     else:
         tm.assert_frame_equal(result, expected)
@@ -1122,10 +1139,6 @@ def test_transform_agg_by_name(request, reduction_func, obj):
     func = reduction_func
     g = obj.groupby(np.repeat([0, 1], 3))
-    if func == "ngroup":  # GH#27468
-        request.node.add_marker(
-            pytest.mark.xfail(reason="TODO: g.transform('ngroup') doesn't work")
-        )
     if func == "corrwith" and isinstance(obj, Series):  # GH#32293
         request.node.add_marker(
             pytest.mark.xfail(reason="TODO: implement SeriesGroupBy.corrwith")
         )
@@ -1137,8 +1150,8 @@ def test_transform_agg_by_name(request, reduction_func, obj):
     # this is the *definition* of a transformation
     tm.assert_index_equal(result.index, obj.index)
 
-    if func != "size" and obj.ndim == 2:
-        # size returns a Series, unlike other transforms
+    if func not in ("ngroup", "size") and obj.ndim == 2:
+        # size/ngroup return a Series, unlike other transforms
         tm.assert_index_equal(result.columns, obj.columns)
 
     # verify that values were broadcasted across each group
@@ -1312,7 +1325,7 @@ def test_null_group_lambda_self(sort, dropna):
 
 def test_null_group_str_reducer(request, dropna, reduction_func):
     # GH 17093
-    if reduction_func in ("corrwith", "ngroup"):
+    if reduction_func == "corrwith":
         msg = "incorrectly raises"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
     index = [1, 2, 3, 4]  # test transform preserves non-standard index
@@ -1358,31 +1371,11 @@ def test_null_group_str_reducer(request, dropna, reduction_func):
 
 
 @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
-def test_null_group_str_transformer(
-    request, using_array_manager, dropna, transformation_func
-):
+def test_null_group_str_transformer(request, dropna, transformation_func):
     # GH 17093
-    xfails_block = (
-        "cummax",
-        "cummin",
-        "cumsum",
-        "fillna",
-        "rank",
-        "backfill",
-        "ffill",
-        "bfill",
-        "pad",
-    )
-    xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank")
     if transformation_func == "tshift":
         msg = "tshift requires timeseries"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
-    elif dropna and (
-        (not using_array_manager and transformation_func in xfails_block)
-        or (using_array_manager and transformation_func in xfails_array)
-    ):
-        msg = "produces incorrect results when nans are present"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
     args = (0,) if transformation_func == "fillna" else ()
     df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
     gb = df.groupby("A", dropna=dropna)
@@ -1420,10 +1413,6 @@ def test_null_group_str_reducer_series(request, dropna, reduction_func):
         msg = "corrwith not implemented for SeriesGroupBy"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
 
-    if reduction_func == "ngroup":
-        msg = "ngroup fails"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-
     # GH 17093
     index = [1, 2, 3, 4]  # test transform preserves non-standard index
     ser = Series([1, 2, 2, 3], index=index)
@@ -1470,15 +1459,6 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func)
    if transformation_func == "tshift":
        msg = "tshift requires timeseries"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
-    elif dropna and transformation_func in (
-        "cummax",
-        "cummin",
-        "cumsum",
-        "fillna",
-        "rank",
-    ):
-        msg = "produces incorrect results when nans are present"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
     args = (0,) if transformation_func == "fillna" else ()
     ser = Series([1, 2, 2], index=[1, 2, 3])
     gb = ser.groupby([1, 1, np.nan], dropna=dropna)
@@ -1502,4 +1482,10 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func)
         msg = f"{transformation_func} is deprecated"
     with tm.assert_produces_warning(warn, match=msg):
         result = gb.transform(transformation_func, *args)
-    tm.assert_equal(result, expected)
+    if dropna and transformation_func == "fillna":
+        # GH#46369 - result name is the group; remove this block when fixed.
+        tm.assert_equal(result, expected, check_names=False)
+        # This should be None
+        assert result.name == 1.0
+    else:
+        tm.assert_equal(result, expected)
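
For reference, a minimal sketch of the transform behavior change described in the whatsnew hunk above. The construction of df is an assumption inferred from the outputs shown there (its real definition sits above the changed lines in v1.5.0.rst); all that matters is that the last group key is null and dropna=True drops that group.

    import numpy as np
    import pandas as pd

    # assumed frame: column 'a' holds the group keys, the last key is NaN
    df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

    # After this change the null-group row is kept and filled with NaN instead of
    # being dropped (lambda case) or cast to a nonsense integer (ffill case).
    df.groupby("a", dropna=True).transform("sum")
    df.groupby("a", dropna=True).transform(lambda x: x.sum())
    df.groupby("a", dropna=True).transform("ffill")
    df.groupby("a", dropna=True).transform(lambda x: x)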
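
A similar sketch for the ngroup change in groupby.py: when dropna=True drops a null key, the -1 codes are mapped to NaN and the result dtype becomes float64, while dropna=False keeps the int64 result. Illustrative only, not taken from the tests in this patch.

    import numpy as np
    import pandas as pd

    ser = pd.Series([10, 20, 30])

    # null key dropped -> its row becomes NaN, so the result is float64
    ser.groupby([1, 1, np.nan], dropna=True).ngroup()
    # null key kept as its own group -> result stays int64
    ser.groupby([1, 1, np.nan], dropna=False).ngroup()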
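
Finally, a sketch of what the updated expectation in test_non_unique_index exercises. The frame construction below is an assumption (the real one is defined outside the hunk); it is chosen so that each non-null key forms a single-row group while the last row's key contains NaN, which now ranks as NaN under the default dropna=True.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
    )
    # expected under this patch: [1.0, 1.0, 1.0, NaN]
    df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)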