From ea52d263e7c1f8567dada8b58dadfc5afb6e574b Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 2 Mar 2022 00:39:01 -0500
Subject: [PATCH 1/8] BUG: Fix some cases of groupby(...).transform with
 dropna=True

---
 pandas/core/generic.py                        |  20 +-
 pandas/core/groupby/groupby.py                |  11 +-
 pandas/core/internals/managers.py             |  11 +-
 .../tests/groupby/transform/test_transform.py | 178 ++++++++++++++++++
 4 files changed, 214 insertions(+), 6 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 83d0a95b8adb2..aaad2555e0186 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3693,10 +3693,26 @@ class  max_speed
 
         nv.validate_take((), kwargs)
 
+        return self._take(indices, axis)
+
+    def _take(
+        self: NDFrameT,
+        indices,
+        axis=0,
+        convert_indices: bool_t = True,
+    ) -> NDFrameT:
+        """
+        Internal version of the `take` allowing specification of additional args.
+
+        See the docstring of `take` for full explanation of the parameters.
+        """
         self._consolidate_inplace()
 
         new_data = self._mgr.take(
-            indices, axis=self._get_block_manager_axis(axis), verify=True
+            indices,
+            axis=self._get_block_manager_axis(axis),
+            verify=True,
+            convert_indices=convert_indices,
         )
         return self._constructor(new_data).__finalize__(self, method="take")
 
@@ -3708,7 +3724,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:
 
         See the docstring of `take` for full explanation of the parameters.
         """
-        result = self.take(indices=indices, axis=axis)
+        result = self._take(indices=indices, axis=axis)
         # Maybe set copy if we didn't actually change the index.
         if not result._get_axis(axis).equals(self._get_axis(axis)):
             result._set_is_copy(self)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 65e8b238cd476..d07e1bfb6ad09 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1646,7 +1646,9 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
             out = algorithms.take_nd(result._values, ids)
             output = obj._constructor(out, index=obj.index, name=obj.name)
         else:
-            output = result.take(ids, axis=self.axis)
+            # Don't convert indices: negative indices need to give rise
+            # to null values in the result
+            output = result._take(ids, axis=self.axis, convert_indices=False)
             output = output.set_axis(obj._get_axis(self.axis), axis=self.axis)
         return output
 
@@ -1699,9 +1701,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         else:
             out = np.repeat(out[np.r_[run[1:], True]], rep) - out
 
+        if self.grouper.has_dropped_na:
+            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
+        else:
+            out = out.astype(np.int64, copy=False)
+
         rev = np.empty(count, dtype=np.intp)
         rev[sorter] = np.arange(count, dtype=np.intp)
-        return out[rev].astype(np.int64, copy=False)
+        return out[rev]
 
     # -----------------------------------------------------------------
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 4b8f1aae75b1b..4b5b979551917 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -829,7 +829,13 @@ def _make_na_block(
         block_values.fill(fill_value)
         return new_block_2d(block_values, placement=placement)
 
-    def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+    def take(
+        self: T,
+        indexer,
+        axis: int = 1,
+        verify: bool = True,
+        convert_indices: bool = True,
+    ) -> T:
         """
         Take items along any axis.
 
@@ -851,7 +857,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
         )
 
         n = self.shape[axis]
-        indexer = maybe_convert_indices(indexer, n, verify=verify)
+        if convert_indices:
+            indexer = maybe_convert_indices(indexer, n, verify=verify)
 
         new_labels = self.axes[axis].take(indexer)
         return self.reindex_indexer(
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 5cc9ac9b09ae9..9645c4952a46c 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1308,3 +1308,181 @@ def test_null_group_lambda_self(sort, dropna):
     gb = df.groupby("A", dropna=dropna, sort=sort)
     result = gb.transform(lambda x: x)
     tm.assert_frame_equal(result, expected)
+
+
+def test_null_group_str_reducer(request, dropna, reduction_func):
+    # GH 17093
+    if reduction_func == "ngroup":
+        msg = "ngroup fails"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
+    gb = df.groupby("A", dropna=dropna)
+
+    if reduction_func == "corrwith":
+        args = (df["B"],)
+    elif reduction_func == "nth":
+        args = (0,)
+    else:
+        args = ()
+
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "last":
+        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
+    elif reduction_func == "nth":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
+    else:
+        expected_gb = df.groupby("A", dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group["B"], reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer).to_frame("B")
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        if expected.ndim == 2:
+            expected.iloc[[2, 3], 0] = np.nan
+        else:
+            expected.iloc[[2, 3]] = np.nan
+
+    result = gb.transform(reduction_func, *args)
+    tm.assert_equal(result, expected)
+
+
+def test_null_group_str_transformer(request, dropna, transformation_func):
+    # GH 17093
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    elif dropna and transformation_func in (
+        "backfill",
+        "bfill",
+        "cummax",
+        "cummin",
+        "cumsum",
+        "ffill",
+        "fillna",
+        "pad",
+        "rank",
+    ):
+        msg = "produces incorrect results when nans are present"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    args = (0,) if transformation_func == "fillna" else ()
+    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
+    gb = df.groupby("A", dropna=dropna)
+
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # DataFrame has no cumcount method
+            res = DataFrame({"B": range(len(group))}, index=group.index)
+        elif transformation_func == "ngroup":
+            res = DataFrame(len(group) * [k], index=group.index, columns=["B"])
+        else:
+            res = getattr(group[["B"]], transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
+    expected = concat(buffer)
+
+    if transformation_func in ("cumcount", "ngroup"):
+        # ngroup/cumcount always returns a Series as it counts the groups, not values
+        expected = expected["B"].rename(None)
+
+    result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)
+
+
+def test_null_group_str_reducer_series(request, dropna, reduction_func):
+    if reduction_func == "corrwith":
+        msg = "corrwith not implemented for SeriesGroupBy"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
+    if reduction_func == "ngroup":
+        msg = "ngroup fails"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
+    # GH 17093
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    ser = Series([1, 2, 2, 3], index=index)
+    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)
+
+    if reduction_func == "corrwith":
+        args = (ser,)
+    elif reduction_func == "nth":
+        args = (0,)
+    else:
+        args = ()
+
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "last":
+        expected = Series([2, 2, 3, 3], index=index)
+    elif reduction_func == "nth":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = Series([1, 1, 2, 2], index=index)
+    else:
+        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group, reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer)
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        expected.iloc[[2, 3]] = np.nan
+
+    result = gb.transform(reduction_func, *args)
+    tm.assert_series_equal(result, expected)
+
+
+def test_null_group_str_transformer_series(request, dropna, transformation_func):
+    # GH 17093
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    elif dropna and transformation_func in (
+        "cummax",
+        "cummin",
+        "cumsum",
+        "fillna",
+        "rank",
+    ):
+        msg = "produces incorrect results when nans are present"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    args = (0,) if transformation_func == "fillna" else ()
+    ser = Series([1, 2, 2], index=[1, 2, 3])
+    gb = ser.groupby([1, 1, np.nan], dropna=dropna)
+
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # Series has no cumcount method
+            res = Series(range(len(group)), index=group.index)
+        elif transformation_func == "ngroup":
+            res = Series(k, index=group.index)
+        else:
+            res = getattr(group, transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(Series([np.nan], index=[3], dtype=dtype))
+    expected = concat(buffer)
+
+    result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)

From 7a62db4274d92f68596103915e80ecfc57fa3c73 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 2 Mar 2022 20:39:09 -0500
Subject: [PATCH 2/8] GH # in test

---
 pandas/tests/groupby/transform/test_transform.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 9645c4952a46c..4e0781d54da73 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1403,6 +1403,7 @@ def test_null_group_str_transformer(request, dropna, transformation_func):
 
 
 def test_null_group_str_reducer_series(request, dropna, reduction_func):
+    # GH 17093
     if reduction_func == "corrwith":
         msg = "corrwith not implemented for SeriesGroupBy"
         request.node.add_marker(pytest.mark.xfail(reason=msg))

From cf8e4565a95bb0f0955c4ab2028f334c35491803 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 2 Mar 2022 20:45:40 -0500
Subject: [PATCH 3/8] whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 3bc9a1ef1ca48..e9ab38cfe3a58 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -82,17 +82,32 @@ did not have the same index as the input.
 
 .. code-block:: ipython
 
+    In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
+    Out[3]:
+       b
+    0  5
+    1  5
+
     In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
     Out[3]:
        b
     0  2
     1  3
 
+    In [3]: df.groupby('a', dropna=True).transform('sum')
+    Out[3]:
+       b
+    0  5
+    1  5
+    2  5
+
 *New behavior*:
 
 .. ipython:: python
 
+    df.groupby('a', dropna=True).transform(lambda x: x.sum())
     df.groupby('a', dropna=True).transform(lambda x: x)
+    df.groupby('a', dropna=True).transform('sum')
 
 .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
 

From c6812366feeb980e5b47ffd9159ce6918e9d923c Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 2 Mar 2022 20:51:17 -0500
Subject: [PATCH 4/8] docstring

---
 pandas/core/internals/managers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 4b5b979551917..985b1115f2ced 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -844,6 +844,8 @@ def take(
         verify : bool, default True
             Check that all entries are between 0 and len(self) - 1, inclusive.
             Pass verify=False if this check has been done by the caller.
+        convert_indices : bool, default True
+            Whether to attempt to convert indices to positive values.
 
         Returns
         -------

From d06acd50594a67bda07fccb2a40e730af7f633c9 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 4 Mar 2022 21:10:38 -0500
Subject: [PATCH 5/8] fixups

---
 pandas/core/internals/array_manager.py        | 11 ++++++--
 .../tests/groupby/transform/test_transform.py | 25 ++++++++++++-------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 3e499c99ac144..eeb250e2a0022 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -640,7 +640,13 @@ def _reindex_indexer(
 
         return type(self)(new_arrays, new_axes, verify_integrity=False)
 
-    def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+    def take(
+        self: T,
+        indexer,
+        axis: int = 1,
+        verify: bool = True,
+        convert_indices: bool = True,
+    ) -> T:
         """
         Take items along any axis.
         """
@@ -656,7 +662,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
             raise ValueError("indexer should be 1-dimensional")
 
         n = self.shape_proper[axis]
-        indexer = maybe_convert_indices(indexer, n, verify=verify)
+        if convert_indices:
+            indexer = maybe_convert_indices(indexer, n, verify=verify)
 
         new_labels = self._axes[axis].take(indexer)
         return self._reindex_indexer(
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 4e0781d54da73..f0bfb037dcd3d 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1357,21 +1357,28 @@ def test_null_group_str_reducer(request, dropna, reduction_func):
     tm.assert_equal(result, expected)
 
 
-def test_null_group_str_transformer(request, dropna, transformation_func):
+def test_null_group_str_transformer(
+    request, using_array_manager, dropna, transformation_func
+):
     # GH 17093
-    if transformation_func == "tshift":
-        msg = "tshift requires timeseries"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-    elif dropna and transformation_func in (
-        "backfill",
-        "bfill",
+    xfails_block = (
         "cummax",
         "cummin",
         "cumsum",
-        "ffill",
         "fillna",
-        "pad",
         "rank",
+        "backfill",
+        "ffill",
+        "bfill",
+        "pad",
+    )
+    xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank")
+    if transformation_func == "tshift":
+        msg = "tshift requires timeseries"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    elif dropna and (
+        (not using_array_manager and transformation_func in xfails_block)
+        or (using_array_manager and transformation_func in xfails_array)
     ):
         msg = "produces incorrect results when nans are present"
         request.node.add_marker(pytest.mark.xfail(reason=msg))

From 6367db9d7f8b9c9101495ab0d805236edee45299 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 5 Mar 2022 05:45:01 -0500
Subject: [PATCH 6/8] xfail corrwith

---
 pandas/tests/groupby/transform/test_transform.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index f0bfb037dcd3d..0dc3cadc449e6 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1312,8 +1312,8 @@ def test_null_group_lambda_self(sort, dropna):
 
 def test_null_group_str_reducer(request, dropna, reduction_func):
     # GH 17093
-    if reduction_func == "ngroup":
-        msg = "ngroup fails"
+    if reduction_func in ("corrwith", "ngroup"):
+        msg = "incorrectly raises"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
     index = [1, 2, 3, 4]  # test transform preserves non-standard index
     df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)

From e81dc16a7dd8acbe59974648dffbb19df3d85dc8 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 5 Mar 2022 07:44:53 -0500
Subject: [PATCH 7/8] Suppress warning

---
 pandas/tests/groupby/transform/test_transform.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 0dc3cadc449e6..5415c99f5239a 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1405,7 +1405,11 @@ def test_null_group_str_transformer(
         # ngroup/cumcount always returns a Series as it counts the groups, not values
         expected = expected["B"].rename(None)
 
-    result = gb.transform(transformation_func, *args)
+    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
+    msg = f"{transformation_func} is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
+
     tm.assert_equal(result, expected)
 
 
@@ -1492,5 +1496,8 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func)
         buffer.append(Series([np.nan], index=[3], dtype=dtype))
     expected = concat(buffer)
 
-    result = gb.transform(transformation_func, *args)
+    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
+    msg = f"{transformation_func} is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
     tm.assert_equal(result, expected)

From f0511932c288b778914f1bc630599256faac0d2d Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Mon, 7 Mar 2022 16:50:18 -0500
Subject: [PATCH 8/8] GH#

---
 pandas/core/groupby/groupby.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index d07e1bfb6ad09..a4dcf6e010c2b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1646,6 +1646,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
             out = algorithms.take_nd(result._values, ids)
             output = obj._constructor(out, index=obj.index, name=obj.name)
         else:
+            # GH#46209
             # Don't convert indices: negative indices need to give rise
             # to null values in the result
             output = result._take(ids, axis=self.axis, convert_indices=False)