BUG: Fix some cases of groupby(...).transform with dropna=True #46209

Merged: 11 commits, Mar 7, 2022
15 changes: 15 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -82,17 +82,32 @@ did not have the same index as the input.

.. code-block:: ipython

In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
Out[3]:
b
0 5
1 5

In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
Out[3]:
b
0 2
1 3

In [3]: df.groupby('a', dropna=True).transform('sum')
Out[3]:
b
0 5
1 5
2 5

*New behavior*:

.. ipython:: python

df.groupby('a', dropna=True).transform(lambda x: x.sum())
df.groupby('a', dropna=True).transform(lambda x: x)
df.groupby('a', dropna=True).transform('sum')

.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:

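For context, the `df` in the snippets above is constructed earlier in the whatsnew file, outside this hunk. A minimal sketch of a frame with the same shape, assuming a single NaN group key and illustrative values in `b` (the exact values used in the doc may differ):

    import numpy as np
    import pandas as pd

    # Hypothetical setup: the last row's group key is NaN, so dropna=True drops
    # that row from the groups.
    df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

    # With the fix, the transform result keeps the caller's index; the row whose
    # group key is NaN comes back as NaN instead of being dropped or mislabeled.
    df.groupby("a", dropna=True).transform("sum")
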
20 changes: 18 additions & 2 deletions pandas/core/generic.py
@@ -3693,10 +3693,26 @@ class max_speed

nv.validate_take((), kwargs)

return self._take(indices, axis)

def _take(
self: NDFrameT,
indices,
axis=0,
convert_indices: bool_t = True,
) -> NDFrameT:
"""
Internal version of the `take` allowing specification of additional args.

See the docstring of `take` for full explanation of the parameters.
"""
self._consolidate_inplace()

new_data = self._mgr.take(
- indices, axis=self._get_block_manager_axis(axis), verify=True
+ indices,
+ axis=self._get_block_manager_axis(axis),
+ verify=True,
+ convert_indices=convert_indices,
)
return self._constructor(new_data).__finalize__(self, method="take")

@@ -3708,7 +3724,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:

See the docstring of `take` for full explanation of the parameters.
"""
- result = self.take(indices=indices, axis=axis)
+ result = self._take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
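
The new `convert_indices` flag matters because of how negative indices are interpreted. The public `take` follows NumPy semantics, where -1 means the last row, while the groupby transform path below reuses -1 as the "dropped NA group" sentinel and needs it to produce missing values. A rough sketch of the two interpretations using the internal `take_nd` helper that the groupby code above already relies on; this illustrates the idea rather than the exact call the PR makes:

    import numpy as np
    import pandas as pd
    from pandas.core.algorithms import take_nd  # internal pandas helper

    ser = pd.Series([10.0, 20.0, 30.0])

    # NumPy-style semantics: -1 wraps around to the last element.
    ser.take([0, 1, -1]).to_list()                   # [10.0, 20.0, 30.0]

    # Sentinel semantics: -1 marks a missing position and is filled with NaN.
    ids = np.array([0, 1, -1], dtype=np.intp)
    take_nd(ser.to_numpy(), ids, fill_value=np.nan)  # array([10., 20., nan])
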
12 changes: 10 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -1646,7 +1646,10 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
out = algorithms.take_nd(result._values, ids)
output = obj._constructor(out, index=obj.index, name=obj.name)
else:
- output = result.take(ids, axis=self.axis)
+ # GH#46209
+ # Don't convert indices: negative indices need to give rise
+ # to null values in the result
+ output = result._take(ids, axis=self.axis, convert_indices=False)
output = output.set_axis(obj._get_axis(self.axis), axis=self.axis)
return output

@@ -1699,9 +1702,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
else:
out = np.repeat(out[np.r_[run[1:], True]], rep) - out

if self.grouper.has_dropped_na:
out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
else:
out = out.astype(np.int64, copy=False)

rev = np.empty(count, dtype=np.intp)
rev[sorter] = np.arange(count, dtype=np.intp)
- return out[rev].astype(np.int64, copy=False)
+ return out[rev]

# -----------------------------------------------------------------

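A toy illustration, in plain NumPy rather than the pandas API, of the masking pattern `_cumcount_array` now uses: positions whose group id is -1 (rows dropped because their key is NaN) are replaced with NaN, which forces a float result, while the all-valid path keeps int64:

    import numpy as np

    ids = np.array([0, 0, -1, 1])   # -1 marks rows whose group key was NaN
    out = np.array([0, 1, 0, 0])    # per-row cumulative count within each group

    if (ids == -1).any():           # mirrors self.grouper.has_dropped_na
        out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
    else:
        out = out.astype(np.int64, copy=False)

    out                             # array([ 0.,  1., nan,  0.])
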
11 changes: 9 additions & 2 deletions pandas/core/internals/array_manager.py
@@ -640,7 +640,13 @@ def _reindex_indexer(

return type(self)(new_arrays, new_axes, verify_integrity=False)

- def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+ def take(
+     self: T,
+     indexer,
+     axis: int = 1,
+     verify: bool = True,
+     convert_indices: bool = True,
+ ) -> T:
"""
Take items along any axis.
"""
@@ -656,7 +662,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
raise ValueError("indexer should be 1-dimensional")

n = self.shape_proper[axis]
- indexer = maybe_convert_indices(indexer, n, verify=verify)
+ if convert_indices:
+     indexer = maybe_convert_indices(indexer, n, verify=verify)

new_labels = self._axes[axis].take(indexer)
return self._reindex_indexer(
13 changes: 11 additions & 2 deletions pandas/core/internals/managers.py
@@ -829,7 +829,13 @@ def _make_na_block(
block_values.fill(fill_value)
return new_block_2d(block_values, placement=placement)

- def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
+ def take(
+     self: T,
+     indexer,
+     axis: int = 1,
+     verify: bool = True,
+     convert_indices: bool = True,
+ ) -> T:
"""
Take items along any axis.

@@ -838,6 +844,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
verify : bool, default True
Check that all entries are between 0 and len(self) - 1, inclusive.
Pass verify=False if this check has been done by the caller.
convert_indices : bool, default True
Whether to attempt to convert indices to positive values.

Returns
-------
@@ -851,7 +859,8 @@
)

n = self.shape[axis]
- indexer = maybe_convert_indices(indexer, n, verify=verify)
+ if convert_indices:
+     indexer = maybe_convert_indices(indexer, n, verify=verify)

new_labels = self.axes[axis].take(indexer)
return self.reindex_indexer(
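
For reference, the step the new flag skips: `maybe_convert_indices` normalizes negative indices to their positive equivalents and raises on anything out of bounds, which would defeat the use of -1 as a missing-value sentinel. A simplified, hypothetical stand-in for that behavior (not the pandas function itself):

    import numpy as np

    def maybe_convert_indices_sketch(indices, n: int) -> np.ndarray:
        """Simplified stand-in for pandas' maybe_convert_indices."""
        indices = np.asarray(indices, dtype=np.intp)
        neg = indices < 0
        if neg.any():
            indices = indices.copy()
            indices[neg] += n                       # e.g. -1 becomes n - 1
        if ((indices < 0) | (indices >= n)).any():
            raise IndexError("indices are out-of-bounds")
        return indices

    maybe_convert_indices_sketch([0, -1], 3)        # array([0, 2])
    # With convert_indices=False the -1 is left alone, so downstream take code
    # can treat it as "no source row" and produce a missing value instead.
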
193 changes: 193 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
@@ -1308,3 +1308,196 @@ def test_null_group_lambda_self(sort, dropna):
gb = df.groupby("A", dropna=dropna, sort=sort)
result = gb.transform(lambda x: x)
tm.assert_frame_equal(result, expected)


def test_null_group_str_reducer(request, dropna, reduction_func):
# GH 17093
if reduction_func in ("corrwith", "ngroup"):
msg = "incorrectly raises"
request.node.add_marker(pytest.mark.xfail(reason=msg))
index = [1, 2, 3, 4] # test transform preserves non-standard index
df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
gb = df.groupby("A", dropna=dropna)

if reduction_func == "corrwith":
args = (df["B"],)
elif reduction_func == "nth":
args = (0,)
else:
args = ()

# Manually handle reducers that don't fit the generic pattern
# Set expected with dropna=False, then replace if necessary
if reduction_func == "first":
expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
elif reduction_func == "last":
expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
elif reduction_func == "nth":
expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
elif reduction_func == "size":
expected = Series([2, 2, 2, 2], index=index)
elif reduction_func == "corrwith":
expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
else:
expected_gb = df.groupby("A", dropna=False)
buffer = []
for idx, group in expected_gb:
res = getattr(group["B"], reduction_func)()
buffer.append(Series(res, index=group.index))
expected = concat(buffer).to_frame("B")
if dropna:
dtype = object if reduction_func in ("any", "all") else float
expected = expected.astype(dtype)
if expected.ndim == 2:
expected.iloc[[2, 3], 0] = np.nan
else:
expected.iloc[[2, 3]] = np.nan

result = gb.transform(reduction_func, *args)
tm.assert_equal(result, expected)


def test_null_group_str_transformer(
request, using_array_manager, dropna, transformation_func
):
# GH 17093
xfails_block = (
"cummax",
"cummin",
"cumsum",
"fillna",
"rank",
"backfill",
"ffill",
"bfill",
"pad",
)
xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank")
if transformation_func == "tshift":
msg = "tshift requires timeseries"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif dropna and (
(not using_array_manager and transformation_func in xfails_block)
or (using_array_manager and transformation_func in xfails_array)
):
msg = "produces incorrect results when nans are present"
request.node.add_marker(pytest.mark.xfail(reason=msg))
args = (0,) if transformation_func == "fillna" else ()
df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
gb = df.groupby("A", dropna=dropna)

buffer = []
for k, (idx, group) in enumerate(gb):
if transformation_func == "cumcount":
# DataFrame has no cumcount method
res = DataFrame({"B": range(len(group))}, index=group.index)
elif transformation_func == "ngroup":
res = DataFrame(len(group) * [k], index=group.index, columns=["B"])
else:
res = getattr(group[["B"]], transformation_func)(*args)
buffer.append(res)
if dropna:
dtype = object if transformation_func in ("any", "all") else None
buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
expected = concat(buffer)

if transformation_func in ("cumcount", "ngroup"):
# ngroup/cumcount always returns a Series as it counts the groups, not values
expected = expected["B"].rename(None)

warn = FutureWarning if transformation_func in ("backfill", "pad") else None
msg = f"{transformation_func} is deprecated"
with tm.assert_produces_warning(warn, match=msg):
result = gb.transform(transformation_func, *args)

tm.assert_equal(result, expected)


def test_null_group_str_reducer_series(request, dropna, reduction_func):
# GH 17093
if reduction_func == "corrwith":
msg = "corrwith not implemented for SeriesGroupBy"
request.node.add_marker(pytest.mark.xfail(reason=msg))

if reduction_func == "ngroup":
msg = "ngroup fails"
request.node.add_marker(pytest.mark.xfail(reason=msg))

# GH 17093
index = [1, 2, 3, 4] # test transform preserves non-standard index
ser = Series([1, 2, 2, 3], index=index)
gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)

if reduction_func == "corrwith":
args = (ser,)
elif reduction_func == "nth":
args = (0,)
else:
args = ()

# Manually handle reducers that don't fit the generic pattern
# Set expected with dropna=False, then replace if necessary
if reduction_func == "first":
expected = Series([1, 1, 2, 2], index=index)
elif reduction_func == "last":
expected = Series([2, 2, 3, 3], index=index)
elif reduction_func == "nth":
expected = Series([1, 1, 2, 2], index=index)
elif reduction_func == "size":
expected = Series([2, 2, 2, 2], index=index)
elif reduction_func == "corrwith":
expected = Series([1, 1, 2, 2], index=index)
else:
expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
buffer = []
for idx, group in expected_gb:
res = getattr(group, reduction_func)()
buffer.append(Series(res, index=group.index))
expected = concat(buffer)
if dropna:
dtype = object if reduction_func in ("any", "all") else float
expected = expected.astype(dtype)
expected.iloc[[2, 3]] = np.nan

result = gb.transform(reduction_func, *args)
tm.assert_series_equal(result, expected)


def test_null_group_str_transformer_series(request, dropna, transformation_func):
# GH 17093
if transformation_func == "tshift":
msg = "tshift requires timeseries"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif dropna and transformation_func in (
"cummax",
"cummin",
"cumsum",
"fillna",
"rank",
):
msg = "produces incorrect results when nans are present"
request.node.add_marker(pytest.mark.xfail(reason=msg))
args = (0,) if transformation_func == "fillna" else ()
ser = Series([1, 2, 2], index=[1, 2, 3])
gb = ser.groupby([1, 1, np.nan], dropna=dropna)

buffer = []
for k, (idx, group) in enumerate(gb):
if transformation_func == "cumcount":
# Series has no cumcount method
res = Series(range(len(group)), index=group.index)
elif transformation_func == "ngroup":
res = Series(k, index=group.index)
else:
res = getattr(group, transformation_func)(*args)
buffer.append(res)
if dropna:
dtype = object if transformation_func in ("any", "all") else None
buffer.append(Series([np.nan], index=[3], dtype=dtype))
expected = concat(buffer)

warn = FutureWarning if transformation_func in ("backfill", "pad") else None
msg = f"{transformation_func} is deprecated"
with tm.assert_produces_warning(warn, match=msg):
result = gb.transform(transformation_func, *args)
tm.assert_equal(result, expected)