Skip to content

WIP/BUG: Correct results for groupby(...).transform with null keys #45839

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 54 additions & 3 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,61 @@ Notable bug fixes

These are bug fixes that might have notable behavior changes.

.. _whatsnew_150.notable_bug_fixes.notable_bug_fix1:
.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:

notable_bug_fix1
^^^^^^^^^^^^^^^^
Using ``dropna=True`` with ``groupby`` transformers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A transformer is a function whose result has the same size as its input. When the
result is a :class:`DataFrame` or :class:`Series`, it is also required that the
index of the result is equal to that of the input. In pandas 1.4, using
:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
values in the groups and ``dropna=True`` gave incorrect results. As shown in the
examples below, the incorrect results either contained incorrect values or did
not have the same index as the input.

.. ipython:: python

df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})

*Old behavior*:

.. code-block:: ipython

In [3]: df.groupby('a', dropna=True).transform('sum')
Out[3]:
b
0 5
1 5
2 5

In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
Out[3]:
b
0 5
1 5

In [3]: df.groupby('a', dropna=True).transform('ffill')
Out[3]:
b
0 2
1 3
2 -9223372036854775808

In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
Out[3]:
b
0 2
1 3

*New behavior*:

.. ipython:: python

df.groupby('a', dropna=True).transform('sum')
df.groupby('a', dropna=True).transform(lambda x: x.sum())
df.groupby('a', dropna=True).transform('ffill')
df.groupby('a', dropna=True).transform(lambda x: x)

.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:

Expand Down
20 changes: 18 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3704,10 +3704,26 @@ class max_speed

nv.validate_take((), kwargs)

return self._take(indices, axis)

def _take(
self: NDFrameT,
indices,
axis=0,
convert_indices: bool_t = True,
) -> NDFrameT:
"""
Internal version of the `take` allowing specification of additional args.

See the docstring of `take` for full explanation of the parameters.
"""
self._consolidate_inplace()

new_data = self._mgr.take(
indices, axis=self._get_block_manager_axis(axis), verify=True
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
convert_indices=convert_indices,
)
return self._constructor(new_data).__finalize__(self, method="take")

Expand All @@ -3719,7 +3735,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:

See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
result = self._take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,7 +1185,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
)

def _can_use_transform_fast(self, func: str, result) -> bool:
return func == "size" or (
# ngroup can only use transform_fast when there are no null keys dropped
# caller is responsible to check this case
return func in ("ngroup", "size") or (
isinstance(result, DataFrame)
and result.columns.equals(self._obj_with_exclusions.columns)
)
Expand Down
60 changes: 44 additions & 16 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class providing the base-class of operations.
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
Expand Down Expand Up @@ -108,6 +109,7 @@ class providing the base-class of operations.
CategoricalIndex,
Index,
MultiIndex,
RangeIndex,
)
from pandas.core.internals.blocks import ensure_block_shape
import pandas.core.sample as sample
Expand Down Expand Up @@ -614,6 +616,14 @@ def indices(self):
"""
return self.grouper.indices

@final
def _get_indices_by_codes(self, codes):
"""
Safe get multiple indices, translate keys for
datelike to underlying repr.
"""
return [self.grouper.code_indices.get(code, []) for code in codes]

@final
def _get_indices(self, names):
"""
Expand Down Expand Up @@ -947,7 +957,12 @@ def curried(x):
if name in base.plotting_methods:
return self.apply(curried)

return self._python_apply_general(curried, self._obj_with_exclusions)
result = self._python_apply_general(curried, self._obj_with_exclusions)
if self.grouper.has_dropped_na and name in base.transformation_kernels:
# result will have dropped rows due to nans, fill with null
# and ensure index is ordered same as the input
result = self._set_result_index_ordered(result)
return result

wrapper.__name__ = name
return wrapper
Expand Down Expand Up @@ -1086,28 +1101,24 @@ def _set_result_index_ordered(
) -> OutputFrameOrSeries:
# set the result index on the passed values object and
# return the new object, xref 8046

if self.grouper.is_monotonic:
# shortcut if we have an already ordered grouper
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result

# row order is scrambled => sort the rows by position in original index
_, obs_group_ids, _ = self.grouper.group_info
original_positions = Index(
np.concatenate(self._get_indices(self.grouper.result_index))
np.concatenate(self._get_indices_by_codes(obs_group_ids))
)
result.set_axis(original_positions, axis=self.axis, inplace=True)
result = result.sort_index(axis=self.axis)

dropped_rows = len(result.index) < len(self.obj.index)

if dropped_rows:
# get index by slicing original index according to original positions
# slice drops attrs => use set_axis when no rows were dropped
sorted_indexer = result.index
result.index = self._selected_obj.index[sorted_indexer]
else:
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
obj_axis = self.obj._get_axis(self.axis)
if self.grouper.has_dropped_na:
# Fill in any missing values due to dropna - index here is integral
# with values referring to the row of the input so can use RangeIndex
result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
result.set_axis(obj_axis, axis=self.axis, inplace=True)

return result

Expand Down Expand Up @@ -1650,6 +1661,11 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
with com.temp_setattr(self, "observed", True):
result = getattr(self, func)(*args, **kwargs)

if func == "ngroup" and not self.grouper.has_dropped_na:
# ngroup handles its own wrapping, as long as there aren't
# dropped null keys
return result

if self._can_use_transform_fast(func, result):
return self._wrap_transform_fast_result(result)

Expand All @@ -1672,7 +1688,9 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
out = algorithms.take_nd(result._values, ids)
output = obj._constructor(out, index=obj.index, name=obj.name)
else:
output = result.take(ids, axis=0)
# Don't convert indices: negative indices need to give rise
# to null values in the result
output = result._take(ids, axis=0, convert_indices=False)
output.index = obj.index
return output

Expand Down Expand Up @@ -1725,9 +1743,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
else:
out = np.repeat(out[np.r_[run[1:], True]], rep) - out

if self.grouper.has_dropped_na:
out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
else:
out = out.astype(np.int64, copy=False)

rev = np.empty(count, dtype=np.intp)
rev[sorter] = np.arange(count, dtype=np.intp)
return out[rev].astype(np.int64, copy=False)
return out[rev]

# -----------------------------------------------------------------

Expand Down Expand Up @@ -2556,7 +2579,12 @@ def blk_func(values: ArrayLike) -> ArrayLike:
# then there will be no -1s in indexer, so we can use
# the original dtype (no need to ensure_dtype_can_hold_na)
if isinstance(values, np.ndarray):
out = np.empty(values.shape, dtype=values.dtype)
dtype = values.dtype
if self.grouper.has_dropped_na:
# ...unless there are any dropped null groups,
# these give rise to nan in the result
dtype = ensure_dtype_can_hold_na(values.dtype)
out = np.empty(values.shape, dtype=dtype)
else:
out = type(values)._empty(values.shape, dtype=values.dtype)

Expand Down
37 changes: 34 additions & 3 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
maybe_cast_pointwise_result,
maybe_downcast_to_dtype,
)
Expand Down Expand Up @@ -112,7 +113,8 @@ class WrappedCythonOp:
# back to the original dtype.
cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])

def __init__(self, kind: str, how: str):
def __init__(self, grouper, kind: str, how: str):
self.grouper = grouper
self.kind = kind
self.how = how

Expand Down Expand Up @@ -199,7 +201,9 @@ def get_cython_func_and_vals(self, values: np.ndarray, is_numeric: bool):
func = self._get_cython_function(kind, how, values.dtype, is_numeric)

if values.dtype.kind in ["i", "u"]:
if how in ["add", "var", "prod", "mean", "ohlc"]:
if how in ["add", "var", "prod", "mean", "ohlc"] or (
kind == "transform" and self.grouper.has_dropped_na
):
# result may still include NaN, so we have to cast
values = ensure_float64(values)

Expand Down Expand Up @@ -263,6 +267,9 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
def get_out_dtype(self, dtype: np.dtype) -> np.dtype:
how = self.how

if self.kind == "transform" and self.grouper.has_dropped_na:
dtype = ensure_dtype_can_hold_na(dtype)

if how == "rank":
out_dtype = "float64"
else:
Expand Down Expand Up @@ -463,6 +470,11 @@ def _cython_op_ndim_compat(
# otherwise we have OHLC
return res.T

if self.kind == "transform" and self.grouper.has_dropped_na:
mask = comp_ids == -1
# make mask 2d
mask = mask[None, :]

return self._call_cython_op(
values,
min_count=min_count,
Expand Down Expand Up @@ -571,6 +583,10 @@ def _call_cython_op(

result = result.T

if self.how == "rank" and self.grouper.has_dropped_na:
# TODO: Wouldn't need this if group_rank supported mask
result = np.where(comp_ids < 0, np.nan, result)

if self.how not in self.cast_blocklist:
# e.g. if we are int64 and need to restore to datetime64/timedelta64
# "rank" is the only member of cast_blocklist we get here
Expand Down Expand Up @@ -783,6 +799,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@cache_readonly
def code_indices(self) -> dict[int, npt.NDArray[np.intp]]:
    """
    Mapping from each observed group code to the positions (in original
    row order) of the rows belonging to that group.
    """
    flat_codes = get_group_index(self.codes, self.shape, sort=True, xnull=True)
    observed = self.group_info[1]
    return {code: np.where(flat_codes == code)[0] for code in observed}

@final
@property
def codes(self) -> list[np.ndarray]:
Expand Down Expand Up @@ -825,6 +848,14 @@ def is_monotonic(self) -> bool:
# return if my group orderings are monotonic
return Index(self.group_info[0]).is_monotonic_increasing

@final
@cache_readonly
def has_dropped_na(self) -> bool:
    """
    True when null group keys were dropped (i.e. dropna=True and a null
    key was present), detectable as a negative compressed group id.
    """
    # TODO: rename to "contains_na" and make it True whenever the grouper
    # holds a null regardless of dropna; today it is only True when a null
    # exists AND dropna=True.
    comp_ids = self.group_info[0]
    return bool(np.any(comp_ids < 0))

@cache_readonly
def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
comp_ids, obs_group_ids = self._get_compressed_codes()
Expand Down Expand Up @@ -928,7 +959,7 @@ def _cython_operation(
"""
assert kind in ["transform", "aggregate"]

cy_op = WrappedCythonOp(kind=kind, how=how)
cy_op = WrappedCythonOp(grouper=self, kind=kind, how=how)

ids, _, _ = self.group_info
ngroups = self.ngroups
Expand Down
11 changes: 9 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,13 @@ def _make_na_block(
block_values.fill(fill_value)
return new_block_2d(block_values, placement=placement)

def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
def take(
self: T,
indexer,
axis: int = 1,
verify: bool = True,
convert_indices: bool = True,
) -> T:
"""
Take items along any axis.

Expand All @@ -851,7 +857,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
)

n = self.shape[axis]
indexer = maybe_convert_indices(indexer, n, verify=verify)
if convert_indices:
indexer = maybe_convert_indices(indexer, n, verify=verify)

new_labels = self.axes[axis].take(indexer)
return self.reindex_indexer(
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/groupby/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ def as_index(request):
return request.param


@pytest.fixture(params=[True, False])
def dropna(request):
    """Parametrized fixture running each consuming test with dropna True/False."""
    flag = request.param
    return flag


@pytest.fixture
def mframe(multiindex_dataframe_random_data):
return multiindex_dataframe_random_data
Expand Down
Loading