From b58536c2a3081e15bde6658c03db8fe1a4de628d Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 10 Feb 2022 22:43:49 -0500
Subject: [PATCH 01/11] BUG: Fix groupby(...).transform with dropna=True

---
 pandas/core/groupby/groupby.py                | 28 ++++++++-----
 pandas/core/groupby/ops.py                    | 12 ++++++
 pandas/tests/groupby/conftest.py              |  5 +++
 pandas/tests/groupby/test_groupby_dropna.py   | 42 +++++--------------
 .../tests/groupby/transform/test_transform.py | 11 +++--
 5 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4eb907e06adf1..9424da35d3d4e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -108,6 +108,7 @@ class providing the base-class of operations.
     CategoricalIndex,
     Index,
     MultiIndex,
+    RangeIndex,
 )
 from pandas.core.internals.blocks import ensure_block_shape
 import pandas.core.sample as sample
@@ -614,6 +615,14 @@ def indices(self):
         """
         return self.grouper.indices
 
+    @final
+    def _get_indices_by_codes(self, codes):
+        """
+        Safe get multiple indices, translate keys for
+        datelike to underlying repr.
+        """
+        return [self.grouper.code_indices.get(code, []) for code in codes]
+
     @final
     def _get_indices(self, names):
         """
@@ -1093,21 +1102,18 @@ def _set_result_index_ordered(
             return result
 
         # row order is scrambled => sort the rows by position in original index
+        _, obs_group_ids, _ = self.grouper.group_info
         original_positions = Index(
-            np.concatenate(self._get_indices(self.grouper.result_index))
+            np.concatenate(self._get_indices_by_codes(obs_group_ids))
         )
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
-
-        dropped_rows = len(result.index) < len(self.obj.index)
-
-        if dropped_rows:
-            # get index by slicing original index according to original positions
-            # slice drops attrs => use set_axis when no rows were dropped
-            sorted_indexer = result.index
-            result.index = self._selected_obj.index[sorted_indexer]
-        else:
-            result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
+        obj_axis = self.obj._get_axis(self.axis)
+        if self.grouper.has_dropped_na:
+            # Fill in any missing values due to dropna - index here is integral
+            # with values referring to the row of the input so can use RangeIndex
+            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
+        result.set_axis(obj_axis, axis=self.axis, inplace=True)
 
         return result
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 467da586f604d..f21a191da5de8 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -783,6 +783,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         keys = [ping.group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)
 
+    @cache_readonly
+    def code_indices(self) -> dict[int, npt.NDArray[np.intp]]:
+        group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
+        _, obs_group_ids, _ = self.group_info
+        result = {e: np.where(group_index == e)[0] for e in obs_group_ids}
+        return result
+
     @final
     @property
     def codes(self) -> list[np.ndarray]:
@@ -825,6 +832,11 @@ def is_monotonic(self) -> bool:
         # return if my group orderings are monotonic
         return Index(self.group_info[0]).is_monotonic_increasing
 
+    @final
+    @cache_readonly
+    def has_dropped_na(self) -> bool:
+        return bool((self.group_info[0] < 0).any())
+
     @cache_readonly
     def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         comp_ids, obs_group_ids = self._get_compressed_codes()
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index 403e3edcc34d0..58d9e500554dd 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -19,6 +19,11 @@ def as_index(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def dropna(request):
+    return request.param
+
+
 @pytest.fixture
 def mframe(multiindex_dataframe_random_data):
     return multiindex_dataframe_random_data
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index ab568e24ff029..9fcc28d8f65a9 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):
 
 
 @pytest.mark.parametrize(
-    "dropna,input_index,expected_data,expected_index",
+    "index",
     [
-        (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
-        (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
-        (
-            True,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
-            ),
-        ),
-        (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
-        (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
-        (
-            False,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-        ),
+        pd.RangeIndex(0, 4),
+        list("abcd"),
+        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
     ],
 )
-def test_groupby_dataframe_slice_then_transform(
-    dropna, input_index, expected_data, expected_index
-):
+def test_groupby_dataframe_slice_then_transform(dropna, index):
     # GH35014 & GH35612
+    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
 
-    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
     gb = df.groupby("A", dropna=dropna)
 
     result = gb.transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb[["B"]].transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb["B"].transform(len)
-    expected = pd.Series(expected_data["B"], index=expected_index, name="B")
+    expected = pd.Series(expected_data["B"], index=index, name="B")
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 6e9b8c35b3698..1ea0fc3f0e3db 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1281,9 +1281,12 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self():
+# Test both monotonic increasing and not
+@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]])
+def test_null_group_lambda_self(dropna, keys):
     # GH 17093
-    df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
-    result = df.groupby("A").transform(lambda x: x)
-    expected = DataFrame([1], columns=["B"])
+    df = DataFrame({"A": keys, "B": [1, 2, 3]})
+    result = df.groupby("A", dropna=dropna).transform(lambda x: x)
+    value = np.nan if dropna else 3
+    expected = DataFrame([1, 2, value], columns=["B"])
     tm.assert_frame_equal(result, expected)

From 726a1b83d11c1933bd9866013fdf742e7207641c Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 11 Feb 2022 21:25:28 -0500
Subject: [PATCH 02/11] WIP

---
 pandas/_libs/lib.pyx           | 15 ++++++++++++++
 pandas/core/groupby/groupby.py | 36 ++++++++++++++++++++++++++++++----
 pandas/core/groupby/ops.py     |  2 +-
 3 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index c86eb80da93f7..95782b583a565 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -944,6 +944,21 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
     return np.asarray(starts), np.asarray(ends)
 
 
+def code_indices_fast(ndarray[intp_t, ndim=1] index, ndarray[intp_t, ndim=1] obs_group_ids) -> dict:
+    cdef:
+        Py_ssize_t i, j, k, l
+        ndarray[intp_t, ndim=1] result
+
+    result = np.empty(len(index))
+
+    for i in range(k):
+        result[obs_group_ids[i]] = []
+
+    for j in range(l):
+        result[index[j]].append(j)
+    return result
+
+
 def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
                  list sorted_labels) -> dict:
     """
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 9424da35d3d4e..6de52701cae6d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -621,7 +621,17 @@ def _get_indices_by_codes(self, codes):
         Safe get multiple indices, translate keys for
         datelike to underlying repr.
         """
-        return [self.grouper.code_indices.get(code, []) for code in codes]
+        # code_indices = self.grouper.code_indices
+        # return [self.grouper.code_indices.get(code, []) for code in codes]
+        from pandas.core.sorting import get_group_index
+
+        group_index = get_group_index(
+            self.grouper.codes, self.grouper.shape, sort=True, xnull=True
+        )
+        _, obs_group_ids, _ = self.grouper.group_info
+
+        sorter = get_group_index_sorter(group_index, len(group_index))
+        return sorter
 
     @final
     def _get_indices(self, names):
@@ -1101,11 +1111,29 @@ def _set_result_index_ordered(
             result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
             return result
 
+        # # row order is scrambled => sort the rows by position in original index
+        # original_positions = Index(
+        #     np.concatenate(self._get_indices(self.grouper.result_index))
+        # )
+        # result.set_axis(original_positions, axis=self.axis, inplace=True)
+        # result = result.sort_index(axis=self.axis)
+
+        # dropped_rows = len(result.index) < len(self.obj.index)
+        #
+        # if dropped_rows:
+        #     # get index by slicing original index according to original positions
+        #     # slice drops attrs => use set_axis when no rows were dropped
+        #     sorted_indexer = result.index
+        #     result.index = self._selected_obj.index[sorted_indexer]
+        # else:
+        #     result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
+        #
+        # return result
+
         # row order is scrambled => sort the rows by position in original index
         _, obs_group_ids, _ = self.grouper.group_info
-        original_positions = Index(
-            np.concatenate(self._get_indices_by_codes(obs_group_ids))
-        )
+        new = self._get_indices_by_codes(obs_group_ids)
+        original_positions = Index(new)
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
         obj_axis = self.obj._get_axis(self.axis)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7978970006437..92db80e3f3c23 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -787,7 +787,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     def code_indices(self) -> dict[int, npt.NDArray[np.intp]]:
         group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
         _, obs_group_ids, _ = self.group_info
-        result = {e: np.where(group_index == e)[0] for e in obs_group_ids}
+        result = lib.code_indices_fast(group_index, obs_group_ids)
         return result
 
     @final

From 2f32be165d025368a42b41f65a4e0a970c93f2ee Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 10 Feb 2022 22:43:49 -0500
Subject: [PATCH 03/11] BUG: Fix some cases of groupby(...).transform with
 dropna=True

---
 doc/source/whatsnew/v1.5.0.rst                | 33 +++++++++++++++
 pandas/core/groupby/groupby.py                | 40 +++++++++++-------
 pandas/core/groupby/ops.py                    |  8 ++++
 pandas/tests/groupby/conftest.py              |  5 +++
 pandas/tests/groupby/test_groupby_dropna.py   | 42 +++++--------------
 .../tests/groupby/transform/test_transform.py | 11 +++--
 6 files changed, 89 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 4298809b0f2e4..d01a74e332c4f 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -57,6 +57,39 @@ Styler
 
 - Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`)
 
+.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:
+
+Using ``dropna=True`` with ``groupby`` transformers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A transformer is an operation whose result has the same size as its input. When the
+result is a :class:`DataFrame` or :class:`Series`, it is also required that the
+index of the result is equal to that of the input. In pandas 1.4, using
+:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
+values in the groups and ``dropna=True`` gave incorrect results. Demonstrated by the
+examples below, the incorrect results either contained incorrect values, or the result
+did not have the same index as the input.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})
+
+*Old behavior*:
+
+.. code-block:: ipython
+
+    In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
+    Out[3]:
+       b
+    0  2
+    1  3
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.groupby('a', dropna=True).transform(lambda x: x)
+
 .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
 
 notable_bug_fix2
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4eb907e06adf1..75c8c1cbb4cd4 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -108,11 +108,15 @@ class providing the base-class of operations.
     CategoricalIndex,
     Index,
     MultiIndex,
+    RangeIndex,
 )
 from pandas.core.internals.blocks import ensure_block_shape
 import pandas.core.sample as sample
 from pandas.core.series import Series
-from pandas.core.sorting import get_group_index_sorter
+from pandas.core.sorting import (
+    get_group_index,
+    get_group_index_sorter,
+)
 from pandas.core.util.numba_ import (
     NUMBA_FUNC_CACHE,
     maybe_use_numba,
@@ -614,6 +618,20 @@ def indices(self):
         """
         return self.grouper.indices
 
+    @final
+    def _get_indices_by_codes(self, codes):
+        """
+        Safe get multiple indices by code.
+        """
+        group_index = get_group_index(
+            codes, self.grouper.shape, sort=True, xnull=self.dropna
+        )
+        if self.dropna:
+            # group_index can only be negative when passing xnull=True
+            group_index = group_index[np.where(group_index >= 0)]
+        sorter = get_group_index_sorter(group_index, len(group_index))
+        return sorter
+
     @final
     def _get_indices(self, names):
         """
@@ -1093,21 +1111,15 @@ def _set_result_index_ordered(
             return result
 
         # row order is scrambled => sort the rows by position in original index
-        original_positions = Index(
-            np.concatenate(self._get_indices(self.grouper.result_index))
-        )
+        original_positions = Index(self._get_indices_by_codes(self.grouper.codes))
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
-
-        dropped_rows = len(result.index) < len(self.obj.index)
-
-        if dropped_rows:
-            # get index by slicing original index according to original positions
-            # slice drops attrs => use set_axis when no rows were dropped
-            sorted_indexer = result.index
-            result.index = self._selected_obj.index[sorted_indexer]
-        else:
-            result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
+        obj_axis = self.obj._get_axis(self.axis)
+        if self.grouper.has_dropped_na:
+            # Fill in any missing values due to dropna - index here is integral
+            # with values referring to the row of the input so can use RangeIndex
+            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
+        result.set_axis(obj_axis, axis=self.axis, inplace=True)
 
         return result
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d4aa6ae9f4059..6bb6973a65071 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -825,6 +825,14 @@ def is_monotonic(self) -> bool:
         # return if my group orderings are monotonic
         return Index(self.group_info[0]).is_monotonic_increasing
 
+    @final
+    @cache_readonly
+    def has_dropped_na(self) -> bool:
+        """
+        Whether grouper has null value(s) that are dropped.
+        """
+        return bool((self.group_info[0] < 0).any())
+
     @cache_readonly
     def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         comp_ids, obs_group_ids = self._get_compressed_codes()
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index 403e3edcc34d0..58d9e500554dd 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -19,6 +19,11 @@ def as_index(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def dropna(request):
+    return request.param
+
+
 @pytest.fixture
 def mframe(multiindex_dataframe_random_data):
     return multiindex_dataframe_random_data
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index ab568e24ff029..9fcc28d8f65a9 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):
 
 
 @pytest.mark.parametrize(
-    "dropna,input_index,expected_data,expected_index",
+    "index",
     [
-        (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
-        (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
-        (
-            True,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
-            ),
-        ),
-        (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
-        (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
-        (
-            False,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-        ),
+        pd.RangeIndex(0, 4),
+        list("abcd"),
+        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
     ],
 )
-def test_groupby_dataframe_slice_then_transform(
-    dropna, input_index, expected_data, expected_index
-):
+def test_groupby_dataframe_slice_then_transform(dropna, index):
     # GH35014 & GH35612
+    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
 
-    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
     gb = df.groupby("A", dropna=dropna)
 
     result = gb.transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb[["B"]].transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb["B"].transform(len)
-    expected = pd.Series(expected_data["B"], index=expected_index, name="B")
+    expected = pd.Series(expected_data["B"], index=index, name="B")
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 6e9b8c35b3698..1ea0fc3f0e3db 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1281,9 +1281,12 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self():
+# Test both monotonic increasing and not
+@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]])
+def test_null_group_lambda_self(dropna, keys):
     # GH 17093
-    df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
-    result = df.groupby("A").transform(lambda x: x)
-    expected = DataFrame([1], columns=["B"])
+    df = DataFrame({"A": keys, "B": [1, 2, 3]})
+    result = df.groupby("A", dropna=dropna).transform(lambda x: x)
+    value = np.nan if dropna else 3
+    expected = DataFrame([1, 2, value], columns=["B"])
     tm.assert_frame_equal(result, expected)

From 5e92af4c8e39bd144919b3c6bbedf045637c3af0 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 11 Feb 2022 22:34:26 -0500
Subject: [PATCH 04/11] Better conditional

---
 pandas/core/groupby/groupby.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 75c8c1cbb4cd4..eca4f0ed4e6fa 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -626,8 +626,7 @@ def _get_indices_by_codes(self, codes):
         group_index = get_group_index(
             codes, self.grouper.shape, sort=True, xnull=self.dropna
         )
-        if self.dropna:
-            # group_index can only be negative when passing xnull=True
+        if self.grouper.has_dropped_na:
             group_index = group_index[np.where(group_index >= 0)]
         sorter = get_group_index_sorter(group_index, len(group_index))
         return sorter

From 3919c515c2f20a922663c0587bb71b5150898d1b Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 11 Feb 2022 23:14:14 -0500
Subject: [PATCH 05/11] Remove xfail

---
 pandas/tests/extension/test_string.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 4256142556894..73682620b8353 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -18,8 +18,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under2p0
-
 import pandas as pd
 from pandas.core.arrays import ArrowStringArray
 from pandas.core.arrays.string_ import StringDtype
@@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests):
 
 class TestGroupBy(base.BaseGroupbyTests):
     def test_groupby_extension_transform(self, data_for_grouping, request):
-        if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0:
-            # failure observed in 1.0.1, not in 2.0 or later
-            mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
-            request.node.add_marker(mark)
         super().test_groupby_extension_transform(data_for_grouping)
 
 

From 31240e179f32caab5554fe7bc983b321013b7958 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 11 Feb 2022 23:54:11 -0500
Subject: [PATCH 06/11] Fixup & type-hints

---
 pandas/core/groupby/groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index eca4f0ed4e6fa..67de98578496e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -619,12 +619,12 @@ def indices(self):
         return self.grouper.indices
 
     @final
-    def _get_indices_by_codes(self, codes):
+    def _get_indices_by_codes(self, codes: list[np.ndarray]) -> npt.NDArray[np.intp]:
         """
         Safe get multiple indices by code.
         """
         group_index = get_group_index(
-            codes, self.grouper.shape, sort=True, xnull=self.dropna
+            codes, self.grouper.shape, sort=True, xnull=self.grouper.dropna
         )
         if self.grouper.has_dropped_na:
             group_index = group_index[np.where(group_index >= 0)]

From d3cbf503ea024b18b8b258ad14a54d498c4d7f60 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 13 Feb 2022 11:30:20 -0500
Subject: [PATCH 07/11] WIP

---
 .../tests/groupby/transform/test_transform.py | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 1ea0fc3f0e3db..6349bad3ed135 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1281,12 +1281,22 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-# Test both monotonic increasing and not
-@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]])
-def test_null_group_lambda_self(dropna, keys):
+def test_null_group_lambda_self(dropna):
     # GH 17093
-    df = DataFrame({"A": keys, "B": [1, 2, 3]})
+    keys = np.random.randint(0, 5, size=5).astype(float)
+    nulls = np.random.choice([0, 1], keys.shape).astype(bool)
+    keys[nulls] = np.nan
+    values = np.random.randint(0, 5, size=keys.shape)
+    df = DataFrame({"A": keys, "B": values})
+
+    expected_values = values
+    if dropna:
+        expected_values = expected_values.astype(float)
+        expected_values[nulls] = np.nan
+    expected = DataFrame(expected_values, columns=["B"])
+
     result = df.groupby("A", dropna=dropna).transform(lambda x: x)
-    value = np.nan if dropna else 3
-    expected = DataFrame([1, 2, value], columns=["B"])
+    print(df)
+    print(result)
+    print(expected)
     tm.assert_frame_equal(result, expected)

From c60da747a9a64f86519e1bb4cfce697357058373 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 13 Feb 2022 12:50:54 -0500
Subject: [PATCH 08/11] Fix implementation, better comments/tests.

---
 pandas/_libs/lib.pyx                          | 15 ------------
 pandas/core/groupby/groupby.py                | 20 ++--------------
 pandas/core/groupby/ops.py                    | 24 +++++++++++++++++++
 .../tests/groupby/transform/test_transform.py | 13 +++++-----
 4 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 95782b583a565..c86eb80da93f7 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -944,21 +944,6 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
     return np.asarray(starts), np.asarray(ends)
 
 
-def code_indices_fast(ndarray[intp_t, ndim=1] index, ndarray[intp_t, ndim=1] obs_group_ids) -> dict:
-    cdef:
-        Py_ssize_t i, j, k, l
-        ndarray[intp_t, ndim=1] result
-
-    result = np.empty(len(index))
-
-    for i in range(k):
-        result[obs_group_ids[i]] = []
-
-    for j in range(l):
-        result[index[j]].append(j)
-    return result
-
-
 def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
                  list sorted_labels) -> dict:
     """
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 67de98578496e..335ef3d254173 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -113,10 +113,7 @@ class providing the base-class of operations.
 from pandas.core.internals.blocks import ensure_block_shape
 import pandas.core.sample as sample
 from pandas.core.series import Series
-from pandas.core.sorting import (
-    get_group_index,
-    get_group_index_sorter,
-)
+from pandas.core.sorting import get_group_index_sorter
 from pandas.core.util.numba_ import (
     NUMBA_FUNC_CACHE,
     maybe_use_numba,
@@ -618,19 +615,6 @@ def indices(self):
         """
         return self.grouper.indices
 
-    @final
-    def _get_indices_by_codes(self, codes: list[np.ndarray]) -> npt.NDArray[np.intp]:
-        """
-        Safe get multiple indices by code.
-        """
-        group_index = get_group_index(
-            codes, self.grouper.shape, sort=True, xnull=self.grouper.dropna
-        )
-        if self.grouper.has_dropped_na:
-            group_index = group_index[np.where(group_index >= 0)]
-        sorter = get_group_index_sorter(group_index, len(group_index))
-        return sorter
-
     @final
     def _get_indices(self, names):
         """
@@ -1110,7 +1094,7 @@ def _set_result_index_ordered(
             return result
 
         # row order is scrambled => sort the rows by position in original index
-        original_positions = Index(self._get_indices_by_codes(self.grouper.codes))
+        original_positions = Index(self.grouper.result_ilocs())
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
         obj_axis = self.obj._get_axis(self.axis)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 6bb6973a65071..95f73cc9bab1c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -783,6 +783,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         keys = [ping.group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)
 
+    @final
+    def result_ilocs(self) -> npt.NDArray[np.intp]:
+        """
+        Get the original integer locations of result_index in the input.
+        """
+        # Original indices are where group_index would go via sorting.
+        # But when dropna is true, we need to remove null values whilst accounting for
+        # any gaps that then occur because of them.
+        group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
+
+        if self.has_dropped_na:
+            mask = np.where(group_index >= 0)
+            # Count how many gaps are caused by previous null values for each position
+            null_gaps = np.cumsum(group_index == -1)[mask]
+            group_index = group_index[mask]
+
+        result = get_group_index_sorter(group_index, self.ngroups)
+
+        if self.has_dropped_na:
+            # Shift by the number of prior null gaps
+            result += np.take(null_gaps, result)
+
+        return result
+
     @final
     @property
     def codes(self) -> list[np.ndarray]:
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 6349bad3ed135..9e8a1ff233575 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -1281,22 +1281,21 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self(dropna):
+def test_null_group_lambda_self(sort, dropna):
     # GH 17093
-    keys = np.random.randint(0, 5, size=5).astype(float)
+    np.random.seed(0)
+    keys = np.random.randint(0, 5, size=50).astype(float)
     nulls = np.random.choice([0, 1], keys.shape).astype(bool)
     keys[nulls] = np.nan
     values = np.random.randint(0, 5, size=keys.shape)
     df = DataFrame({"A": keys, "B": values})
 
     expected_values = values
-    if dropna:
+    if dropna and nulls.any():
         expected_values = expected_values.astype(float)
         expected_values[nulls] = np.nan
     expected = DataFrame(expected_values, columns=["B"])
 
-    result = df.groupby("A", dropna=dropna).transform(lambda x: x)
-    print(df)
-    print(result)
-    print(expected)
+    gb = df.groupby("A", dropna=dropna, sort=sort)
+    result = gb.transform(lambda x: x)
     tm.assert_frame_equal(result, expected)

From 80b662983213856e3e910d3c704f79f9e73d370f Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 13 Feb 2022 13:03:04 -0500
Subject: [PATCH 09/11] Fix comment

---
 pandas/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 335ef3d254173..289b445af1c24 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1099,7 +1099,7 @@ def _set_result_index_ordered(
         result = result.sort_index(axis=self.axis)
         obj_axis = self.obj._get_axis(self.axis)
         if self.grouper.has_dropped_na:
-            # Fill in any missing values due to dropna - index here is integral
+            # Add back in any missing rows due to dropna - index here is integral
             # with values referring to the row of the input so can use RangeIndex
             result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
         result.set_axis(obj_axis, axis=self.axis, inplace=True)

From da4201a2d14a41010f177121e93d5d7b74a72c3b Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 16 Feb 2022 21:28:25 -0500
Subject: [PATCH 10/11] Improve comment/whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 4 ++--
 pandas/core/groupby/ops.py     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index ccedaba977080..8e1cc22197301 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -62,9 +62,9 @@ Styler
 Using ``dropna=True`` with ``groupby`` transformers
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-A transformer is an operation whose result has the same size as its input. When the
+A transform is an operation whose result has the same size as its input. When the
 result is a :class:`DataFrame` or :class:`Series`, it is also required that the
-index of the result is equal to that of the input. In pandas 1.4, using
+index of the result matches that of the input. In pandas 1.4, using
 :meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
 values in the groups and ``dropna=True`` gave incorrect results. Demonstrated by the
 examples below, the incorrect results either contained incorrect values, or the result
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d233fdcbb263d..884bf3c1bcc48 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -789,7 +789,7 @@ def result_ilocs(self) -> npt.NDArray[np.intp]:
         Get the original integer locations of result_index in the input.
         """
         # Original indices are where group_index would go via sorting.
-        # But when dropna is true, we need to remove null values whilst accounting for
+        # But when dropna is true, we need to remove null values while accounting for
         # any gaps that then occur because of them.
         group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
 

From e589e11b9f56970d3dd0bbd4efa06953fe7ef3a8 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 16 Feb 2022 23:35:44 -0500
Subject: [PATCH 11/11] missed the title

---
 doc/source/whatsnew/v1.5.0.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 8e1cc22197301..00ce71609a3d8 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -59,8 +59,8 @@ Styler
 
 .. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:
 
-Using ``dropna=True`` with ``groupby`` transformers
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Using ``dropna=True`` with ``groupby`` transforms
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 A transform is an operation whose result has the same size as its input. When the
 result is a :class:`DataFrame` or :class:`Series`, it is also required that the