Skip to content

Commit 1efa4fb

Browse files
authored
BUG: Fix some cases of groupby(...).transform with dropna=True (#45953)
1 parent 429f294 commit 1efa4fb

File tree

6 files changed

+99
-55
lines changed

6 files changed

+99
-55
lines changed

doc/source/whatsnew/v1.5.0.rst

+33
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,39 @@ Styler
5858

5959
- Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`)
6060

61+
.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:
62+
63+
Using ``dropna=True`` with ``groupby`` transforms
64+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
65+
66+
A transform is an operation whose result has the same size as its input. When the
67+
result is a :class:`DataFrame` or :class:`Series`, it is also required that the
68+
index of the result matches that of the input. In pandas 1.4, using
69+
:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
70+
values in the groups and ``dropna=True`` gave incorrect results. As demonstrated by the
70+
examples below, the results either contained incorrect values, or
71+
did not have the same index as the input.
73+
74+
.. ipython:: python
75+
76+
df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})
77+
78+
*Old behavior*:
79+
80+
.. code-block:: ipython
81+
82+
In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
83+
Out[3]:
84+
b
85+
0 2
86+
1 3
87+
88+
*New behavior*:
89+
90+
.. ipython:: python
91+
92+
df.groupby('a', dropna=True).transform(lambda x: x)
93+
6194
.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
6295

6396
notable_bug_fix2

pandas/core/groupby/groupby.py

+8-13
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class providing the base-class of operations.
108108
CategoricalIndex,
109109
Index,
110110
MultiIndex,
111+
RangeIndex,
111112
)
112113
from pandas.core.internals.blocks import ensure_block_shape
113114
import pandas.core.sample as sample
@@ -1093,21 +1094,15 @@ def _set_result_index_ordered(
10931094
return result
10941095

10951096
# row order is scrambled => sort the rows by position in original index
1096-
original_positions = Index(
1097-
np.concatenate(self._get_indices(self.grouper.result_index))
1098-
)
1097+
original_positions = Index(self.grouper.result_ilocs())
10991098
result.set_axis(original_positions, axis=self.axis, inplace=True)
11001099
result = result.sort_index(axis=self.axis)
1101-
1102-
dropped_rows = len(result.index) < len(self.obj.index)
1103-
1104-
if dropped_rows:
1105-
# get index by slicing original index according to original positions
1106-
# slice drops attrs => use set_axis when no rows were dropped
1107-
sorted_indexer = result.index
1108-
result.index = self._selected_obj.index[sorted_indexer]
1109-
else:
1110-
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
1100+
obj_axis = self.obj._get_axis(self.axis)
1101+
if self.grouper.has_dropped_na:
1102+
# Add back in any missing rows due to dropna - index here is integral
1103+
# with values referring to the row of the input so can use RangeIndex
1104+
result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
1105+
result.set_axis(obj_axis, axis=self.axis, inplace=True)
11111106

11121107
return result
11131108

pandas/core/groupby/ops.py

+32
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
795795
keys = [ping.group_index for ping in self.groupings]
796796
return get_indexer_dict(codes_list, keys)
797797

798+
@final
799+
def result_ilocs(self) -> npt.NDArray[np.intp]:
800+
"""
801+
Get the original integer locations of result_index in the input.
802+
"""
803+
# Original indices are where group_index would go via sorting.
804+
# But when dropna is true, we need to remove null values while accounting for
805+
# any gaps that then occur because of them.
806+
group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
807+
808+
if self.has_dropped_na:
809+
mask = np.where(group_index >= 0)
810+
# Count how many gaps are caused by previous null values for each position
811+
null_gaps = np.cumsum(group_index == -1)[mask]
812+
group_index = group_index[mask]
813+
814+
result = get_group_index_sorter(group_index, self.ngroups)
815+
816+
if self.has_dropped_na:
817+
# Shift by the number of prior null gaps
818+
result += np.take(null_gaps, result)
819+
820+
return result
821+
798822
@final
799823
@property
800824
def codes(self) -> list[npt.NDArray[np.signedinteger]]:
@@ -837,6 +861,14 @@ def is_monotonic(self) -> bool:
837861
# return if my group orderings are monotonic
838862
return Index(self.group_info[0]).is_monotonic_increasing
839863

864+
@final
865+
@cache_readonly
866+
def has_dropped_na(self) -> bool:
867+
"""
868+
Whether grouper has null value(s) that are dropped.
869+
"""
870+
return bool((self.group_info[0] < 0).any())
871+
840872
@cache_readonly
841873
def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
842874
comp_ids, obs_group_ids = self._get_compressed_codes()

pandas/tests/extension/test_string.py

-6
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
import numpy as np
1919
import pytest
2020

21-
from pandas.compat import pa_version_under2p0
22-
2321
import pandas as pd
2422
from pandas.core.arrays import ArrowStringArray
2523
from pandas.core.arrays.string_ import StringDtype
@@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests):
193191

194192
class TestGroupBy(base.BaseGroupbyTests):
195193
def test_groupby_extension_transform(self, data_for_grouping, request):
196-
if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0:
197-
# failure observed in 1.0.1, not in 2.0 or later
198-
mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
199-
request.node.add_marker(mark)
200194
super().test_groupby_extension_transform(data_for_grouping)
201195

202196

pandas/tests/groupby/test_groupby_dropna.py

+10-32
Original file line numberDiff line numberDiff line change
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):
171171

172172

173173
@pytest.mark.parametrize(
174-
"dropna,input_index,expected_data,expected_index",
174+
"index",
175175
[
176-
(True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
177-
(True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
178-
(
179-
True,
180-
pd.MultiIndex.from_tuples(
181-
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
182-
),
183-
{"B": [2, 2, 1]},
184-
pd.MultiIndex.from_tuples(
185-
[(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
186-
),
187-
),
188-
(False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
189-
(False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
190-
(
191-
False,
192-
pd.MultiIndex.from_tuples(
193-
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
194-
),
195-
{"B": [2, 2, 1, 1]},
196-
pd.MultiIndex.from_tuples(
197-
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
198-
),
199-
),
176+
pd.RangeIndex(0, 4),
177+
list("abcd"),
178+
pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
200179
],
201180
)
202-
def test_groupby_dataframe_slice_then_transform(
203-
dropna, input_index, expected_data, expected_index
204-
):
181+
def test_groupby_dataframe_slice_then_transform(dropna, index):
205182
# GH35014 & GH35612
183+
expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
206184

207-
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
185+
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
208186
gb = df.groupby("A", dropna=dropna)
209187

210188
result = gb.transform(len)
211-
expected = pd.DataFrame(expected_data, index=expected_index)
189+
expected = pd.DataFrame(expected_data, index=index)
212190
tm.assert_frame_equal(result, expected)
213191

214192
result = gb[["B"]].transform(len)
215-
expected = pd.DataFrame(expected_data, index=expected_index)
193+
expected = pd.DataFrame(expected_data, index=index)
216194
tm.assert_frame_equal(result, expected)
217195

218196
result = gb["B"].transform(len)
219-
expected = pd.Series(expected_data["B"], index=expected_index, name="B")
197+
expected = pd.Series(expected_data["B"], index=index, name="B")
220198
tm.assert_series_equal(result, expected)
221199

222200

pandas/tests/groupby/transform/test_transform.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -1290,9 +1290,21 @@ def test_transform_cumcount():
12901290
tm.assert_series_equal(result, expected)
12911291

12921292

1293-
def test_null_group_lambda_self():
1293+
def test_null_group_lambda_self(sort, dropna):
12941294
# GH 17093
1295-
df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
1296-
result = df.groupby("A").transform(lambda x: x)
1297-
expected = DataFrame([1], columns=["B"])
1295+
np.random.seed(0)
1296+
keys = np.random.randint(0, 5, size=50).astype(float)
1297+
nulls = np.random.choice([0, 1], keys.shape).astype(bool)
1298+
keys[nulls] = np.nan
1299+
values = np.random.randint(0, 5, size=keys.shape)
1300+
df = DataFrame({"A": keys, "B": values})
1301+
1302+
expected_values = values
1303+
if dropna and nulls.any():
1304+
expected_values = expected_values.astype(float)
1305+
expected_values[nulls] = np.nan
1306+
expected = DataFrame(expected_values, columns=["B"])
1307+
1308+
gb = df.groupby("A", dropna=dropna, sort=sort)
1309+
result = gb.transform(lambda x: x)
12981310
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)