Skip to content

REGR: groupby.transform producing segfault #46585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,7 +1106,7 @@ def _set_result_index_ordered(
# set the result index on the passed values object and
# return the new object, xref 8046

- if self.grouper.is_monotonic:
+ if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
# shortcut if we have an already ordered grouper
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,10 @@ def result_ilocs(self) -> npt.NDArray[np.intp]:
# Original indices are where group_index would go via sorting.
# But when dropna is true, we need to remove null values while accounting for
# any gaps that then occur because of them.
- group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
+ group_index = get_group_index(
+ self.codes, self.shape, sort=self._sort, xnull=True
+ )
+ group_index, _ = compress_group_index(group_index, sort=self._sort)

if self.has_dropped_na:
mask = np.where(group_index >= 0)
Expand Down
33 changes: 22 additions & 11 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,23 +1303,34 @@ def test_transform_cumcount():
tm.assert_series_equal(result, expected)


- def test_null_group_lambda_self(sort, dropna):
+ @pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
+ def test_null_group_lambda_self(request, sort, dropna, keys):
# GH 17093
- np.random.seed(0)
- keys = np.random.randint(0, 5, size=50).astype(float)
- nulls = np.random.choice([0, 1], keys.shape).astype(bool)
- keys[nulls] = np.nan
- values = np.random.randint(0, 5, size=keys.shape)
- df = DataFrame({"A": keys, "B": values})
+ if not sort and not dropna:
+ msg = "GH#46584: null values get sorted when sort=False"
+ request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))
+
+ size = 50
+ nulls1 = np.random.choice([False, True], size)
+ nulls2 = np.random.choice([False, True], size)
+ # Whether a group contains a null value or not
+ nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2
+
+ a1 = np.random.randint(0, 5, size=size).astype(float)
+ a1[nulls1] = np.nan
+ a2 = np.random.randint(0, 5, size=size).astype(float)
+ a2[nulls2] = np.nan
+ values = np.random.randint(0, 5, size=a1.shape)
+ df = DataFrame({"A1": a1, "A2": a2, "B": values})

expected_values = values
- if dropna and nulls.any():
+ if dropna and nulls_grouper.any():
expected_values = expected_values.astype(float)
- expected_values[nulls] = np.nan
+ expected_values[nulls_grouper] = np.nan
expected = DataFrame(expected_values, columns=["B"])

- gb = df.groupby("A", dropna=dropna, sort=sort)
- result = gb.transform(lambda x: x)
+ gb = df.groupby(keys, dropna=dropna, sort=sort)
+ result = gb[["B"]].transform(lambda x: x)
tm.assert_frame_equal(result, expected)


Expand Down