diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index ac39546603266..2bcdf0d0a00d9 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -50,10 +50,61 @@ Notable bug fixes
 
 These are bug fixes that might have notable behavior changes.
 
-.. _whatsnew_150.notable_bug_fixes.notable_bug_fix1:
+.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:
 
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+Using ``dropna=True`` with ``groupby`` transforms
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A transform is an operation whose result has the same size as its input. When the
+result is a :class:`DataFrame` or :class:`Series`, it is also required that the
+index of the result is equal to that of the input. In pandas 1.4, using
+:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
+values in the groups and ``dropna=True`` gave incorrect results. As demonstrated by
+the examples below, the results either contained incorrect values or did not have
+the same index as the input.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})
+
+*Old behavior*:
+
+.. code-block:: ipython
+
+    In [3]: df.groupby('a', dropna=True).transform('sum')
+    Out[3]:
+       b
+    0  5
+    1  5
+    2  5
+
+    In [4]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    Out[4]:
+       b
+    0  5
+    1  5
+
+    In [5]: df.groupby('a', dropna=True).transform('ffill')
+    Out[5]:
+                          b
+    0                     2
+    1                     3
+    2  -9223372036854775808
+
+    In [6]: df.groupby('a', dropna=True).transform(lambda x: x)
+    Out[6]:
+       b
+    0  2
+    1  3
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.groupby('a', dropna=True).transform('sum')
+    df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    df.groupby('a', dropna=True).transform('ffill')
+    df.groupby('a', dropna=True).transform(lambda x: x)
 
 .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index fcc7e915e2b57..22c5b08596977 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3704,10 +3704,26 @@ class max_speed
         nv.validate_take((), kwargs)
 
+        return self._take(indices, axis)
+
+    def _take(
+        self: NDFrameT,
+        indices,
+        axis=0,
+        convert_indices: bool_t = True,
+    ) -> NDFrameT:
+        """
+        Internal version of `take` allowing specification of additional args.
+
+        See the docstring of `take` for full explanation of the parameters.
+        """
         self._consolidate_inplace()
 
         new_data = self._mgr.take(
-            indices, axis=self._get_block_manager_axis(axis), verify=True
+            indices,
+            axis=self._get_block_manager_axis(axis),
+            verify=True,
+            convert_indices=convert_indices,
         )
         return self._constructor(new_data).__finalize__(self, method="take")
 
@@ -3719,7 +3735,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:
 
         See the docstring of `take` for full explanation of the parameters.
         """
-        result = self.take(indices=indices, axis=axis)
+        result = self._take(indices=indices, axis=axis)
         # Maybe set copy if we didn't actually change the index.
         if not result._get_axis(axis).equals(self._get_axis(axis)):
             result._set_is_copy(self)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 71cef46950e12..140f12b1e9cb7 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1185,7 +1185,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
         )
 
     def _can_use_transform_fast(self, func: str, result) -> bool:
-        return func == "size" or (
+        # ngroup can only use transform_fast when there are no dropped null keys;
+        # the caller is responsible for checking this case
+        return func in ("ngroup", "size") or (
             isinstance(result, DataFrame)
             and result.columns.equals(self._obj_with_exclusions.columns)
         )
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4eb907e06adf1..2b880437b3d3f 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -62,6 +62,7 @@ class providing the base-class of operations.
 )
 from pandas.util._exceptions import find_stack_level
 
+from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_datetime64_dtype,
@@ -108,6 +109,7 @@ class providing the base-class of operations.
     CategoricalIndex,
     Index,
     MultiIndex,
+    RangeIndex,
 )
 from pandas.core.internals.blocks import ensure_block_shape
 import pandas.core.sample as sample
@@ -614,6 +616,14 @@ def indices(self):
        """
         return self.grouper.indices
 
+    @final
+    def _get_indices_by_codes(self, codes):
+        """
+        Safe get the row indices for each of the given group codes;
+        codes with no corresponding group map to empty lists.
+        """
+        return [self.grouper.code_indices.get(code, []) for code in codes]
+
     @final
     def _get_indices(self, names):
         """
@@ -947,7 +957,12 @@ def curried(x):
             if name in base.plotting_methods:
                 return self.apply(curried)
 
-            return self._python_apply_general(curried, self._obj_with_exclusions)
+            result = self._python_apply_general(curried, self._obj_with_exclusions)
+            if self.grouper.has_dropped_na and name in base.transformation_kernels:
+                # result will have dropped rows due to NaNs; fill with null values
+                # and ensure the index is ordered the same as the input
+                result = self._set_result_index_ordered(result)
+            return result
 
         wrapper.__name__ = name
         return wrapper
@@ -1086,28 +1101,24 @@ def _set_result_index_ordered(
     ) -> OutputFrameOrSeries:
         # set the result index on the passed values object and
         # return the new object, xref 8046
-
         if self.grouper.is_monotonic:
             # shortcut if we have an already ordered grouper
             result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
             return result
 
         # row order is scrambled => sort the rows by position in original index
+        _, obs_group_ids, _ = self.grouper.group_info
         original_positions = Index(
-            np.concatenate(self._get_indices(self.grouper.result_index))
+            np.concatenate(self._get_indices_by_codes(obs_group_ids))
         )
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
-
-        dropped_rows = len(result.index) < len(self.obj.index)
-
-        if dropped_rows:
-            # get index by slicing original index according to original positions
-            # slice drops attrs => use set_axis when no rows were dropped
-            sorted_indexer = result.index
-            result.index = self._selected_obj.index[sorted_indexer]
-        else:
-            result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
+        obj_axis = self.obj._get_axis(self.axis)
+        if self.grouper.has_dropped_na:
+            # Fill in any missing values due to dropna: the index here is integer
+            # positions into the rows of the input, so we can use a RangeIndex
+            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
+        result.set_axis(obj_axis, axis=self.axis, inplace=True)
 
         return result
 
@@ -1650,6 +1661,11 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             with com.temp_setattr(self, "observed", True):
                 result = getattr(self, func)(*args, **kwargs)
 
+            if func == "ngroup" and not self.grouper.has_dropped_na:
+                # ngroup handles its own wrapping, as long as there aren't
+                # dropped null keys
+                return result
+
             if self._can_use_transform_fast(func, result):
                 return self._wrap_transform_fast_result(result)
 
@@ -1672,7 +1688,9 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
             out = algorithms.take_nd(result._values, ids)
             output = obj._constructor(out, index=obj.index, name=obj.name)
         else:
-            output = result.take(ids, axis=0)
+            # Don't convert indices: negative indices need to give rise
+            # to null values in the result
+            output = result._take(ids, axis=0, convert_indices=False)
             output.index = obj.index
         return output
 
@@ -1725,9 +1743,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         else:
             out = np.repeat(out[np.r_[run[1:], True]], rep) - out
 
+        if self.grouper.has_dropped_na:
+            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
+        else:
+            out = out.astype(np.int64, copy=False)
+
         rev = np.empty(count, dtype=np.intp)
         rev[sorter] = np.arange(count, dtype=np.intp)
-        return out[rev].astype(np.int64, copy=False)
+        return out[rev]
 
     # -----------------------------------------------------------------
 
@@ -2556,7 +2579,12 @@ def blk_func(values: ArrayLike) -> ArrayLike:
             # then there will be no -1s in indexer, so we can use
             # the original dtype (no need to ensure_dtype_can_hold_na)
             if isinstance(values, np.ndarray):
-                out = np.empty(values.shape, dtype=values.dtype)
+                dtype = values.dtype
+                if self.grouper.has_dropped_na:
+                    # ...unless there are any dropped null groups, as
+                    # these give rise to NaN values in the result
+                    dtype = ensure_dtype_can_hold_na(values.dtype)
+                out = np.empty(values.shape, dtype=dtype)
             else:
                 out = type(values)._empty(values.shape, dtype=values.dtype)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d4aa6ae9f4059..4702c8bae958a 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -38,6 +38,7 @@
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.cast import (
+    ensure_dtype_can_hold_na,
     maybe_cast_pointwise_result,
     maybe_downcast_to_dtype,
 )
@@ -112,7 +113,8 @@ class WrappedCythonOp:
     # back to the original dtype.
     cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])
 
-    def __init__(self, kind: str, how: str):
+    def __init__(self, grouper, kind: str, how: str):
+        self.grouper = grouper
         self.kind = kind
         self.how = how
 
@@ -199,7 +201,9 @@ def get_cython_func_and_vals(self, values: np.ndarray, is_numeric: bool):
         func = self._get_cython_function(kind, how, values.dtype, is_numeric)
 
         if values.dtype.kind in ["i", "u"]:
-            if how in ["add", "var", "prod", "mean", "ohlc"]:
+            if how in ["add", "var", "prod", "mean", "ohlc"] or (
+                kind == "transform" and self.grouper.has_dropped_na
+            ):
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
@@ -263,6 +267,9 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
     def get_out_dtype(self, dtype: np.dtype) -> np.dtype:
         how = self.how
 
+        if self.kind == "transform" and self.grouper.has_dropped_na:
+            dtype = ensure_dtype_can_hold_na(dtype)
+
         if how == "rank":
             out_dtype = "float64"
         else:
@@ -463,6 +470,11 @@ def _cython_op_ndim_compat(
                 # otherwise we have OHLC
                 return res.T
 
+        if self.kind == "transform" and self.grouper.has_dropped_na:
+            mask = comp_ids == -1
+            # make mask 2d
+            mask = mask[None, :]
+
         return self._call_cython_op(
             values,
             min_count=min_count,
@@ -571,6 +583,10 @@ def _call_cython_op(
 
             result = result.T
 
+        if self.how == "rank" and self.grouper.has_dropped_na:
+            # TODO: Wouldn't need this if group_rank supported mask
+            result = np.where(comp_ids < 0, np.nan, result)
+
         if self.how not in self.cast_blocklist:
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             # "rank" is the only member of cast_blocklist we get here
@@ -783,6 +799,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         keys = [ping.group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)
 
+    @cache_readonly
+    def code_indices(self) -> dict[int, npt.NDArray[np.intp]]:
+        group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
+        _, obs_group_ids, _ = self.group_info
+        result = {e: np.where(group_index == e)[0] for e in obs_group_ids}
+        return result
+
     @final
     @property
     def codes(self) -> list[np.ndarray]:
@@ -825,6 +848,14 @@ def is_monotonic(self) -> bool:
         # return if my group orderings are monotonic
         return Index(self.group_info[0]).is_monotonic_increasing
 
+    @final
+    @cache_readonly
+    def has_dropped_na(self) -> bool:
+        # TODO: Would like this to be named "contains_na" and be True whenever
+        # there is a null in the grouper (regardless of dropna), but right
+        # now will only be True when there is a null and dropna=True.
+        return bool((self.group_info[0] < 0).any())
+
     @cache_readonly
     def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         comp_ids, obs_group_ids = self._get_compressed_codes()
@@ -928,7 +959,7 @@ def _cython_operation(
         """
         assert kind in ["transform", "aggregate"]
 
-        cy_op = WrappedCythonOp(kind=kind, how=how)
+        cy_op = WrappedCythonOp(grouper=self, kind=kind, how=how)
 
         ids, _, _ = self.group_info
         ngroups = self.ngroups
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 6297a7578ccd4..7fd3e2e7f1554 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -829,7 +829,13 @@ def _make_na_block(
             block_values.fill(fill_value)
         return new_block_2d(block_values, placement=placement)
 
-    def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+    def take(
+        self: T,
+        indexer,
+        axis: int = 1,
+        verify: bool = True,
+        convert_indices: bool = True,
+    ) -> T:
         """
         Take items along any axis.
 
@@ -851,7 +857,8 @@ def take(
         )
 
         n = self.shape[axis]
-        indexer = maybe_convert_indices(indexer, n, verify=verify)
+        if convert_indices:
+            indexer = maybe_convert_indices(indexer, n, verify=verify)
 
         new_labels = self.axes[axis].take(indexer)
         return self.reindex_indexer(
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index 403e3edcc34d0..58d9e500554dd 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -19,6 +19,11 @@ def as_index(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def dropna(request):
+    return request.param
+
+
 @pytest.fixture
 def mframe(multiindex_dataframe_random_data):
     return multiindex_dataframe_random_data
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index ab568e24ff029..9fcc28d8f65a9 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):
 
 
 @pytest.mark.parametrize(
-    "dropna,input_index,expected_data,expected_index",
+    "index",
     [
-        (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
-        (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
-        (
-            True,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
-            ),
-        ),
-        (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
-        (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
-        (
-            False,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-        ),
+        pd.RangeIndex(0, 4),
+        list("abcd"),
+        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
     ],
 )
-def test_groupby_dataframe_slice_then_transform(
-    dropna, input_index, expected_data, expected_index
-):
+def test_groupby_dataframe_slice_then_transform(dropna, index):
     # GH35014 & GH35612
+    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
 
-    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
     gb = df.groupby("A", dropna=dropna)
 
     result = gb.transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb[["B"]].transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb["B"].transform(len)
-    expected = pd.Series(expected_data["B"], index=expected_index, name="B")
+    expected = pd.Series(expected_data["B"], index=index, name="B")
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index 83b8d5c29bbf0..7830c229ece2f 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -658,6 +658,8 @@ def test_non_unique_index():
     )
     result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
     expected = Series(
-        [1.0] * 4, index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, name="value"
+        [1.0, 1.0, 1.0, np.nan],
+        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
+        name="value",
     )
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 6e9b8c35b3698..97faef7d13cf7 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1113,10 +1113,6 @@ def test_transform_agg_by_name(request, reduction_func, obj):
     func = reduction_func
     g = obj.groupby(np.repeat([0, 1], 3))
 
-    if func == "ngroup":  # GH#27468
-        request.node.add_marker(
-            pytest.mark.xfail(reason="TODO: g.transform('ngroup') doesn't work")
-        )
     if func == "corrwith" and isinstance(obj, Series):  # GH#32293
         request.node.add_marker(
             pytest.mark.xfail(reason="TODO: implement SeriesGroupBy.corrwith")
@@ -1128,8 +1124,8 @@ def test_transform_agg_by_name(request, reduction_func, obj):
     # this is the *definition* of a transformation
     tm.assert_index_equal(result.index, obj.index)
 
-    if func != "size" and obj.ndim == 2:
-        # size returns a Series, unlike other transforms
+    if func not in ("ngroup", "size") and obj.ndim == 2:
+        # size/ngroup return a Series, unlike other transforms
         tm.assert_index_equal(result.columns, obj.columns)
 
     # verify that values were broadcasted across each group
@@ -1281,9 +1277,88 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self():
+# Test both monotonic increasing and not
+@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]])
+def test_null_group_lambda_self(dropna, keys):
     # GH 17093
-    df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
-    result = df.groupby("A").transform(lambda x: x)
-    expected = DataFrame([1], columns=["B"])
+    df = DataFrame({"A": keys, "B": [1, 2, 3]})
+    result = df.groupby("A", dropna=dropna).transform(lambda x: x)
+    value = np.nan if dropna else 3
+    expected = DataFrame([1, 2, value], columns=["B"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_null_group_str_reducer(request, dropna, reduction_func):
+    # GH 17093
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
+    gb = df.groupby("A", dropna=dropna)
+
+    if reduction_func == "corrwith":
+        args = (df["B"],)
+    elif reduction_func == "nth":
+        args = (0,)
+    else:
+        args = ()
+
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "last":
+        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
+    elif reduction_func == "nth":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "ngroup":
+        expected = Series([0, 0, 1, 1], index=index)
+    elif reduction_func == "corrwith":
+        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
+    else:
+        expected_gb = df.groupby("A", dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group["B"], reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer).to_frame("B")
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        if expected.ndim == 2:
+            expected.iloc[[2, 3], 0] = np.nan
+        else:
+            expected.iloc[[2, 3]] = np.nan
+
+    result = gb.transform(reduction_func, *args)
+    tm.assert_equal(result, expected)
+
+
+def test_null_group_str_transformer(request, dropna, transformation_func):
+    # GH 17093
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    args = (0,) if transformation_func == "fillna" else ()
+    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
+    gb = df.groupby("A", dropna=dropna)
+
+    buffer = []
+    for idx, group in gb:
+        if transformation_func == "cumcount":
+            # DataFrame has no cumcount method
+            res = DataFrame({"B": range(len(group))}, index=group.index)
+        else:
+            res = getattr(group[["B"]], transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
+    expected = concat(buffer)
+
+    if transformation_func == "cumcount":
+        # cumcount always returns a Series, as it counts row positions within
+        # each group rather than transforming column values
+        expected = expected["B"].rename(None)
+
+    result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)
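
As a minimal end-to-end illustration of the behavior this patch establishes (not part of the patch itself), the sketch below re-runs the calls from the whatsnew entry above. The noted outputs assume a pandas build that includes these changes.

```python
import numpy as np
import pandas as pd

# Same frame as the whatsnew entry: the last row has a null group key.
df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})
gb = df.groupby("a", dropna=True)

# With this patch, every transform result is aligned to df's index, and the
# null-key row (position 2) holds NaN instead of a garbage value or being
# dropped outright.
print(gb.transform("sum"))              # b: [5.0, 5.0, NaN]  (Cython path)
print(gb.transform(lambda x: x.sum()))  # b: [5.0, 5.0, NaN]  (generic apply path)
print(gb.transform("ffill"))            # b: [2.0, 3.0, NaN]  (no int64-min sentinel)
print(gb.transform(lambda x: x))        # b: [2.0, 3.0, NaN]  (index preserved)
```

Note the float results: integer arrays cannot hold NaN, which is why the patch routes output dtypes through ``ensure_dtype_can_hold_na`` (and casts int/uint values to float64 in the Cython transform path) whenever ``has_dropped_na`` is true.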