
Commit 4d7a03a

BUG: Fix some cases of groupby(...).transform with dropna=True (#46209)
1 parent bb9a985 commit 4d7a03a

File tree

6 files changed: +256 -8 lines changed

doc/source/whatsnew/v1.5.0.rst

+15
@@ -82,17 +82,32 @@ did not have the same index as the input.
 
 .. code-block:: ipython
 
+    In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    Out[3]:
+       b
+    0  5
+    1  5
+
     In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
     Out[3]:
        b
     0  2
     1  3
 
+    In [3]: df.groupby('a', dropna=True).transform('sum')
+    Out[3]:
+       b
+    0  5
+    1  5
+    2  5
+
 *New behavior*:
 
 .. ipython:: python
 
+    df.groupby('a', dropna=True).transform(lambda x: x.sum())
     df.groupby('a', dropna=True).transform(lambda x: x)
+    df.groupby('a', dropna=True).transform('sum')
 
 
 .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
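For reference, the whatsnew entry above can be reproduced with a small frame. The actual df is defined earlier in the whatsnew section; the values below are an assumption consistent with the outputs shown (group 'a' == 1 holds b = [2, 3], and the third row's key is NaN):

import numpy as np
import pandas as pd

# Assumed example frame, inferred from the outputs above (not taken from the commit)
df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

# Fixed behavior: the result keeps the input's index, and rows whose group
# key is NaN become NaN for lambdas and string aliases alike.
print(df.groupby("a", dropna=True).transform(lambda x: x.sum()))
print(df.groupby("a", dropna=True).transform("sum"))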

pandas/core/generic.py

+18 -2
@@ -3693,10 +3693,26 @@ class max_speed
 
         nv.validate_take((), kwargs)
 
+        return self._take(indices, axis)
+
+    def _take(
+        self: NDFrameT,
+        indices,
+        axis=0,
+        convert_indices: bool_t = True,
+    ) -> NDFrameT:
+        """
+        Internal version of the `take` allowing specification of additional args.
+
+        See the docstring of `take` for full explanation of the parameters.
+        """
         self._consolidate_inplace()
 
         new_data = self._mgr.take(
-            indices, axis=self._get_block_manager_axis(axis), verify=True
+            indices,
+            axis=self._get_block_manager_axis(axis),
+            verify=True,
+            convert_indices=convert_indices,
         )
         return self._constructor(new_data).__finalize__(self, method="take")
 
@@ -3708,7 +3724,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:
 
         See the docstring of `take` for full explanation of the parameters.
         """
-        result = self.take(indices=indices, axis=axis)
+        result = self._take(indices=indices, axis=axis)
         # Maybe set copy if we didn't actually change the index.
         if not result._get_axis(axis).equals(self._get_axis(axis)):
             result._set_is_copy(self)
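The public take keeps its behavior; the new internal _take is what lets groupby opt out of negative-index conversion. A quick illustration, using public API only and an illustrative frame, of why -1 needs a different meaning on the internal path:

import pandas as pd

df = pd.DataFrame({"b": [2, 3, 4]})

# Public DataFrame.take: negative positions wrap around, so -1 is the last row.
print(df.take([0, -1]))

# In the groupby transform path, an id of -1 marks a row whose group key was
# dropped (dropna=True); that row must come back as NaN rather than as a copy
# of the last row, which is what _take(..., convert_indices=False) enables.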

pandas/core/groupby/groupby.py

+10 -2
@@ -1646,7 +1646,10 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
             out = algorithms.take_nd(result._values, ids)
             output = obj._constructor(out, index=obj.index, name=obj.name)
         else:
-            output = result.take(ids, axis=self.axis)
+            # GH#46209
+            # Don't convert indices: negative indices need to give rise
+            # to null values in the result
+            output = result._take(ids, axis=self.axis, convert_indices=False)
             output = output.set_axis(obj._get_axis(self.axis), axis=self.axis)
         return output
 
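The fast transform path reduces once per group, then broadcasts the reduced values back over the original rows via the group codes ids, where -1 marks rows whose NaN key was dropped. A minimal sketch of that broadcast semantic using the public ExtensionArray.take with made-up values (not the internal call itself):

import numpy as np
import pandas as pd

per_group = pd.array([3, 5])       # one reduced value per kept group
ids = np.array([0, 0, 1, -1])      # group code for each original row; -1 = dropped key

# allow_fill=True gives -1 the meaning "missing" instead of "last element".
print(per_group.take(ids, allow_fill=True))   # [3, 3, 5, <NA>]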

@@ -1699,9 +1702,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         else:
             out = np.repeat(out[np.r_[run[1:], True]], rep) - out
 
+        if self.grouper.has_dropped_na:
+            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
+        else:
+            out = out.astype(np.int64, copy=False)
+
         rev = np.empty(count, dtype=np.intp)
         rev[sorter] = np.arange(count, dtype=np.intp)
-        return out[rev].astype(np.int64, copy=False)
+        return out[rev]
 
 
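When null group keys were dropped, _cumcount_array now upcasts to float64 and writes NaN wherever ids == -1; otherwise it keeps int64. The effect is visible through the public cumcount; a small sketch of the intended behavior (not a test from this commit):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]})

# dropna=True: the NaN-keyed row belongs to no group, so its cumcount is NaN
# and the result is float64.
print(df.groupby("A", dropna=True).cumcount())

# dropna=False: every row has a group, so the result stays int64.
print(df.groupby("A", dropna=False).cumcount())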

pandas/core/internals/array_manager.py

+9 -2
@@ -640,7 +640,13 @@ def _reindex_indexer(
 
         return type(self)(new_arrays, new_axes, verify_integrity=False)
 
-    def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+    def take(
+        self: T,
+        indexer,
+        axis: int = 1,
+        verify: bool = True,
+        convert_indices: bool = True,
+    ) -> T:
         """
         Take items along any axis.
         """
@@ -656,7 +662,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
             raise ValueError("indexer should be 1-dimensional")
 
         n = self.shape_proper[axis]
-        indexer = maybe_convert_indices(indexer, n, verify=verify)
+        if convert_indices:
+            indexer = maybe_convert_indices(indexer, n, verify=verify)
 
         new_labels = self._axes[axis].take(indexer)
         return self._reindex_indexer(

pandas/core/internals/managers.py

+11 -2
@@ -829,7 +829,13 @@ def _make_na_block(
         block_values.fill(fill_value)
         return new_block_2d(block_values, placement=placement)
 
-    def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+    def take(
+        self: T,
+        indexer,
+        axis: int = 1,
+        verify: bool = True,
+        convert_indices: bool = True,
+    ) -> T:
         """
         Take items along any axis.
 
@@ -838,6 +844,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
         verify : bool, default True
             Check that all entries are between 0 and len(self) - 1, inclusive.
             Pass verify=False if this check has been done by the caller.
+        convert_indices : bool, default True
+            Whether to attempt to convert indices to positive values.
 
         Returns
         -------
@@ -851,7 +859,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
         )
 
         n = self.shape[axis]
-        indexer = maybe_convert_indices(indexer, n, verify=verify)
+        if convert_indices:
+            indexer = maybe_convert_indices(indexer, n, verify=verify)
 
         new_labels = self.axes[axis].take(indexer)
         return self.reindex_indexer(
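Both BlockManager.take and ArrayManager.take now accept convert_indices so callers can skip the negative-index normalization. A rough standalone sketch of what that normalization step does; normalize_indices is a hypothetical stand-in for pandas' internal maybe_convert_indices, not its actual implementation:

import numpy as np

def normalize_indices(indices, n: int) -> np.ndarray:
    # Wrap negative positions to n + i and reject anything still out of bounds.
    idx = np.asarray(indices, dtype=np.intp)
    idx = np.where(idx < 0, idx + n, idx)
    if ((idx < 0) | (idx >= n)).any():
        raise IndexError("indices are out-of-bounds")
    return idx

print(normalize_indices([0, -1], 3))  # [0 2]: -1 becomes the last position

Skipping this step (convert_indices=False) leaves -1 in the indexer, so a later fill-aware take can turn those positions into missing values instead of the last row.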

pandas/tests/groupby/transform/test_transform.py

+193
@@ -1308,3 +1308,196 @@ def test_null_group_lambda_self(sort, dropna):
     gb = df.groupby("A", dropna=dropna, sort=sort)
     result = gb.transform(lambda x: x)
     tm.assert_frame_equal(result, expected)
+
+
+def test_null_group_str_reducer(request, dropna, reduction_func):
+    # GH 17093
+    if reduction_func in ("corrwith", "ngroup"):
+        msg = "incorrectly raises"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
+    gb = df.groupby("A", dropna=dropna)
+
+    if reduction_func == "corrwith":
+        args = (df["B"],)
+    elif reduction_func == "nth":
+        args = (0,)
+    else:
+        args = ()
+
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "last":
+        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
+    elif reduction_func == "nth":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
+    else:
+        expected_gb = df.groupby("A", dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group["B"], reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer).to_frame("B")
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        if expected.ndim == 2:
+            expected.iloc[[2, 3], 0] = np.nan
+        else:
+            expected.iloc[[2, 3]] = np.nan
+
+    result = gb.transform(reduction_func, *args)
+    tm.assert_equal(result, expected)
+
+
+def test_null_group_str_transformer(
+    request, using_array_manager, dropna, transformation_func
+):
+    # GH 17093
+    xfails_block = (
+        "cummax",
+        "cummin",
+        "cumsum",
+        "fillna",
+        "rank",
+        "backfill",
+        "ffill",
+        "bfill",
+        "pad",
+    )
+    xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank")
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    elif dropna and (
+        (not using_array_manager and transformation_func in xfails_block)
+        or (using_array_manager and transformation_func in xfails_array)
+    ):
+        msg = "produces incorrect results when nans are present"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    args = (0,) if transformation_func == "fillna" else ()
+    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
+    gb = df.groupby("A", dropna=dropna)
+
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # DataFrame has no cumcount method
+            res = DataFrame({"B": range(len(group))}, index=group.index)
+        elif transformation_func == "ngroup":
+            res = DataFrame(len(group) * [k], index=group.index, columns=["B"])
+        else:
+            res = getattr(group[["B"]], transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
+    expected = concat(buffer)
+
+    if transformation_func in ("cumcount", "ngroup"):
+        # ngroup/cumcount always returns a Series as it counts the groups, not values
+        expected = expected["B"].rename(None)
+
+    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
+    msg = f"{transformation_func} is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
+
+    tm.assert_equal(result, expected)
+
+
+def test_null_group_str_reducer_series(request, dropna, reduction_func):
+    # GH 17093
+    if reduction_func == "corrwith":
+        msg = "corrwith not implemented for SeriesGroupBy"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
+    if reduction_func == "ngroup":
+        msg = "ngroup fails"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
+    # GH 17093
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    ser = Series([1, 2, 2, 3], index=index)
+    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)
+
+    if reduction_func == "corrwith":
+        args = (ser,)
+    elif reduction_func == "nth":
+        args = (0,)
+    else:
+        args = ()
+
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "last":
+        expected = Series([2, 2, 3, 3], index=index)
+    elif reduction_func == "nth":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = Series([1, 1, 2, 2], index=index)
+    else:
+        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group, reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer)
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        expected.iloc[[2, 3]] = np.nan
+
+    result = gb.transform(reduction_func, *args)
+    tm.assert_series_equal(result, expected)
+
+
+def test_null_group_str_transformer_series(request, dropna, transformation_func):
+    # GH 17093
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    elif dropna and transformation_func in (
+        "cummax",
+        "cummin",
+        "cumsum",
+        "fillna",
+        "rank",
+    ):
+        msg = "produces incorrect results when nans are present"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    args = (0,) if transformation_func == "fillna" else ()
+    ser = Series([1, 2, 2], index=[1, 2, 3])
+    gb = ser.groupby([1, 1, np.nan], dropna=dropna)
+
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # Series has no cumcount method
+            res = Series(range(len(group)), index=group.index)
+        elif transformation_func == "ngroup":
+            res = Series(k, index=group.index)
+        else:
+            res = getattr(group, transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(Series([np.nan], index=[3], dtype=dtype))
+    expected = concat(buffer)
+
+    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
+    msg = f"{transformation_func} is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)
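The expected values in these tests follow one pattern: reduce or transform per group with dropna=False, stitch the pieces back together with concat, then null out the rows whose key is NaN when dropna=True. A standalone sketch of that pattern for the 'sum' reducer; under the fixed behavior the final assertion should hold:

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=[1, 2, 3, 4])

# Build expected: broadcast each dropna=False group sum over that group's rows ...
buffer = []
for _, group in df.groupby("A", dropna=False):
    buffer.append(pd.Series(group["B"].sum(), index=group.index))
expected = pd.concat(buffer).to_frame("B").astype(float)
# ... then null out the rows whose group key is NaN (positions 2 and 3).
expected.iloc[[2, 3], 0] = np.nan

result = df.groupby("A", dropna=True).transform("sum")
pd.testing.assert_frame_equal(result, expected)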
