From d82c7406c6ea4325711daa02126167033e62d6f3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 31 Mar 2022 08:59:49 -0400 Subject: [PATCH] REGR: groupby.transform producing segfault --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 5 ++- .../tests/groupby/transform/test_transform.py | 33 ++++++++++++------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3089a6b8c16ae..91c35d7555705 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1106,7 +1106,7 @@ def _set_result_index_ordered( # set the result index on the passed values object and # return the new object, xref 8046 - if self.grouper.is_monotonic: + if self.grouper.is_monotonic and not self.grouper.has_dropped_na: # shortcut if we have an already ordered grouper result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5f15e11c4740c..b43319765c5b4 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -818,7 +818,10 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. - group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True) + group_index = get_group_index( + self.codes, self.shape, sort=self._sort, xnull=True + ) + group_index, _ = compress_group_index(group_index, sort=self._sort) if self.has_dropped_na: mask = np.where(group_index >= 0) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c210c79c29426..f178e05d40dd0 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1303,23 +1303,34 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -def test_null_group_lambda_self(sort, dropna): +@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]]) +def test_null_group_lambda_self(request, sort, dropna, keys): # GH 17093 - np.random.seed(0) - keys = np.random.randint(0, 5, size=50).astype(float) - nulls = np.random.choice([0, 1], keys.shape).astype(bool) - keys[nulls] = np.nan - values = np.random.randint(0, 5, size=keys.shape) - df = DataFrame({"A": keys, "B": values}) + if not sort and not dropna: + msg = "GH#46584: null values get sorted when sort=False" + request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) + + size = 50 + nulls1 = np.random.choice([False, True], size) + nulls2 = np.random.choice([False, True], size) + # Whether a group contains a null value or not + nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2 + + a1 = np.random.randint(0, 5, size=size).astype(float) + a1[nulls1] = np.nan + a2 = np.random.randint(0, 5, size=size).astype(float) + a2[nulls2] = np.nan + values = np.random.randint(0, 5, size=a1.shape) + df = DataFrame({"A1": a1, "A2": a2, "B": values}) expected_values = values - if dropna and nulls.any(): + if dropna and nulls_grouper.any(): expected_values = expected_values.astype(float) - expected_values[nulls] = np.nan + expected_values[nulls_grouper] = np.nan expected = DataFrame(expected_values, columns=["B"]) - gb = df.groupby("A", dropna=dropna, sort=sort) - result = gb.transform(lambda x: x) + gb = df.groupby(keys, dropna=dropna, sort=sort) + result = gb[["B"]].transform(lambda x: x) tm.assert_frame_equal(result, expected)