BUG: Fix some cases of groupby(...).transform with dropna=True (#45953)

rhshadrach · web-flow · commit 1efa4fb9cca4 · 2022-02-27T15:19:46.000-05:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -58,6 +58,39 @@ Styler
 
 - Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`)
 
+.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:
+
+Using ``dropna=True`` with ``groupby`` transforms
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A transform is an operation whose result has the same size as its input. When the
+result is a :class:`DataFrame` or :class:`Series`, it is also required that the
+index of the result matches that of the input. In pandas 1.4, using
+:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
+values in the groups and ``dropna=True`` gave incorrect results. As demonstrated by
+the example below, the incorrect results either contained incorrect values or did not
+have the same index as the input.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})
+
+*Old behavior*:
+
+.. code-block:: ipython
+
+    In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
+    Out[3]:
+       b
+    0  2
+    1  3
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.groupby('a', dropna=True).transform(lambda x: x)
+
 .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
 
 notable_bug_fix2
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -108,6 +108,7 @@ class providing the base-class of operations.
     CategoricalIndex,
     Index,
     MultiIndex,
+    RangeIndex,
 )
 from pandas.core.internals.blocks import ensure_block_shape
 import pandas.core.sample as sample
@@ -1093,21 +1094,15 @@ def _set_result_index_ordered(
             return result
 
         # row order is scrambled => sort the rows by position in original index
-        original_positions = Index(
-            np.concatenate(self._get_indices(self.grouper.result_index))
-        )
+        original_positions = Index(self.grouper.result_ilocs())
         result.set_axis(original_positions, axis=self.axis, inplace=True)
         result = result.sort_index(axis=self.axis)
-
-        dropped_rows = len(result.index) < len(self.obj.index)
-
-        if dropped_rows:
-            # get index by slicing original index according to original positions
-            # slice drops attrs => use set_axis when no rows were dropped
-            sorted_indexer = result.index
-            result.index = self._selected_obj.index[sorted_indexer]
-        else:
-            result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
+        obj_axis = self.obj._get_axis(self.axis)
+        if self.grouper.has_dropped_na:
+            # Add back in any missing rows due to dropna - index here is integral
+            # with values referring to the row of the input so can use RangeIndex
+            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
+        result.set_axis(obj_axis, axis=self.axis, inplace=True)
 
         return result
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -795,6 +795,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         keys = [ping.group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)
 
+    @final
+    def result_ilocs(self) -> npt.NDArray[np.intp]:
+        """
+        Get the original integer locations of result_index in the input.
+        """
+        # Original indices are where group_index would go via sorting.
+        # But when dropna is true, we need to remove null values while accounting for
+        # any gaps that then occur because of them.
+        group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
+
+        if self.has_dropped_na:
+            mask = np.where(group_index >= 0)
+            # Count how many gaps are caused by previous null values for each position
+            null_gaps = np.cumsum(group_index == -1)[mask]
+            group_index = group_index[mask]
+
+        result = get_group_index_sorter(group_index, self.ngroups)
+
+        if self.has_dropped_na:
+            # Shift by the number of prior null gaps
+            result += np.take(null_gaps, result)
+
+        return result
+
     @final
     @property
     def codes(self) -> list[npt.NDArray[np.signedinteger]]:
@@ -837,6 +861,14 @@ def is_monotonic(self) -> bool:
         # return if my group orderings are monotonic
         return Index(self.group_info[0]).is_monotonic_increasing
 
+    @final
+    @cache_readonly
+    def has_dropped_na(self) -> bool:
+        """
+        Whether grouper has null value(s) that are dropped.
+        """
+        return bool((self.group_info[0] < 0).any())
+
     @cache_readonly
     def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         comp_ids, obs_group_ids = self._get_compressed_codes()
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -18,8 +18,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under2p0
-
 import pandas as pd
 from pandas.core.arrays import ArrowStringArray
 from pandas.core.arrays.string_ import StringDtype
@@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests):
 
 class TestGroupBy(base.BaseGroupbyTests):
     def test_groupby_extension_transform(self, data_for_grouping, request):
-        if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0:
-            # failure observed in 1.0.1, not in 2.0 or later
-            mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
-            request.node.add_marker(mark)
         super().test_groupby_extension_transform(data_for_grouping)
 
 
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):
 
 
 @pytest.mark.parametrize(
-    "dropna,input_index,expected_data,expected_index",
+    "index",
     [
-        (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
-        (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
-        (
-            True,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
-            ),
-        ),
-        (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
-        (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
-        (
-            False,
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-            {"B": [2, 2, 1, 1]},
-            pd.MultiIndex.from_tuples(
-                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
-            ),
-        ),
+        pd.RangeIndex(0, 4),
+        list("abcd"),
+        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
     ],
 )
-def test_groupby_dataframe_slice_then_transform(
-    dropna, input_index, expected_data, expected_index
-):
+def test_groupby_dataframe_slice_then_transform(dropna, index):
     # GH35014 & GH35612
+    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
 
-    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
     gb = df.groupby("A", dropna=dropna)
 
     result = gb.transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb[["B"]].transform(len)
-    expected = pd.DataFrame(expected_data, index=expected_index)
+    expected = pd.DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
 
     result = gb["B"].transform(len)
-    expected = pd.Series(expected_data["B"], index=expected_index, name="B")
+    expected = pd.Series(expected_data["B"], index=index, name="B")
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -1290,9 +1290,21 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self():
+def test_null_group_lambda_self(sort, dropna):
     # GH 17093
-    df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
-    result = df.groupby("A").transform(lambda x: x)
-    expected = DataFrame([1], columns=["B"])
+    np.random.seed(0)
+    keys = np.random.randint(0, 5, size=50).astype(float)
+    nulls = np.random.choice([0, 1], keys.shape).astype(bool)
+    keys[nulls] = np.nan
+    values = np.random.randint(0, 5, size=keys.shape)
+    df = DataFrame({"A": keys, "B": values})
+
+    expected_values = values
+    if dropna and nulls.any():
+        expected_values = expected_values.astype(float)
+        expected_values[nulls] = np.nan
+    expected = DataFrame(expected_values, columns=["B"])
+
+    gb = df.groupby("A", dropna=dropna, sort=sort)
+    result = gb.transform(lambda x: x)
     tm.assert_frame_equal(result, expected)