From b58536c2a3081e15bde6658c03db8fe1a4de628d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 10 Feb 2022 22:43:49 -0500 Subject: [PATCH 01/11] BUG: Fix groupby(...).transform with dropna=True --- pandas/core/groupby/groupby.py | 28 ++++++++----- pandas/core/groupby/ops.py | 12 ++++++ pandas/tests/groupby/conftest.py | 5 +++ pandas/tests/groupby/test_groupby_dropna.py | 42 +++++-------------- .../tests/groupby/transform/test_transform.py | 11 +++-- 5 files changed, 51 insertions(+), 47 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4eb907e06adf1..9424da35d3d4e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,6 +108,7 @@ class providing the base-class of operations. CategoricalIndex, Index, MultiIndex, + RangeIndex, ) from pandas.core.internals.blocks import ensure_block_shape import pandas.core.sample as sample @@ -614,6 +615,14 @@ def indices(self): """ return self.grouper.indices + @final + def _get_indices_by_codes(self, codes): + """ + Safe get multiple indices, translate keys for + datelike to underlying repr. + """ + return [self.grouper.code_indices.get(code, []) for code in codes] + @final def _get_indices(self, names): """ @@ -1093,21 +1102,18 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index + _, obs_group_ids, _ = self.grouper.group_info original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) + np.concatenate(self._get_indices_by_codes(obs_group_ids)) ) result.set_axis(original_positions, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - - dropped_rows = len(result.index) < len(self.obj.index) - - if dropped_rows: - # get index by slicing original index according to original positions - # slice drops attrs => use set_axis when no rows were dropped - sorted_indexer = result.index - result.index = self._selected_obj.index[sorted_indexer] - else: - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + obj_axis = self.obj._get_axis(self.axis) + if self.grouper.has_dropped_na: + # Fill in any missing values due to dropna - index here is integral + # with values referring to the row of the input so can use RangeIndex + result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) + result.set_axis(obj_axis, axis=self.axis, inplace=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 467da586f604d..f21a191da5de8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -783,6 +783,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) + @cache_readonly + def code_indices(self) -> dict[int, npt.NDArray[np.intp]]: + group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) + _, obs_group_ids, _ = self.group_info + result = {e: np.where(group_index == e)[0] for e in obs_group_ids} + return result + @final @property def codes(self) -> list[np.ndarray]: @@ -825,6 +832,11 @@ def is_monotonic(self) -> bool: # return if my group orderings are monotonic return Index(self.group_info[0]).is_monotonic_increasing + @final + @cache_readonly + def has_dropped_na(self) -> bool: + return bool((self.group_info[0] < 0).any()) + @cache_readonly def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: comp_ids, obs_group_ids = self._get_compressed_codes() diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 403e3edcc34d0..58d9e500554dd 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -19,6 +19,11 @@ def as_index(request): return request.param +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + @pytest.fixture def mframe(multiindex_dataframe_random_data): return multiindex_dataframe_random_data diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab568e24ff029..9fcc28d8f65a9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna): @pytest.mark.parametrize( - "dropna,input_index,expected_data,expected_index", + "index", [ - (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)), - (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")), - ( - True, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"] - ), - ), - (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)), - (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")), - ( - False, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - ), + pd.RangeIndex(0, 4), + list("abcd"), + pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]), ], ) -def test_groupby_dataframe_slice_then_transform( - dropna, input_index, expected_data, expected_index -): +def test_groupby_dataframe_slice_then_transform(dropna, index): # GH35014 & GH35612 + expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]} - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index) gb = df.groupby("A", dropna=dropna) result = gb.transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb[["B"]].transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb["B"].transform(len) - expected = pd.Series(expected_data["B"], index=expected_index, name="B") + expected = pd.Series(expected_data["B"], index=index, name="B") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6e9b8c35b3698..1ea0fc3f0e3db 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1281,9 +1281,12 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -def test_null_group_lambda_self(): +# Test both monotonic increasing and not +@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]]) +def test_null_group_lambda_self(dropna, keys): # GH 17093 - df = DataFrame({"A": [1, np.nan], "B": [1, 1]}) - result = df.groupby("A").transform(lambda x: x) - expected = DataFrame([1], columns=["B"]) + df = DataFrame({"A": keys, "B": [1, 2, 3]}) + result = df.groupby("A", dropna=dropna).transform(lambda x: x) + value = np.nan if dropna else 3 + expected = DataFrame([1, 2, value], columns=["B"]) tm.assert_frame_equal(result, expected) From 726a1b83d11c1933bd9866013fdf742e7207641c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 11 Feb 2022 21:25:28 -0500 Subject: [PATCH 02/11] WIP --- pandas/_libs/lib.pyx | 15 ++++++++++++++ pandas/core/groupby/groupby.py | 36 ++++++++++++++++++++++++++++++---- pandas/core/groupby/ops.py | 2 +- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c86eb80da93f7..95782b583a565 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -944,6 +944,21 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): return np.asarray(starts), np.asarray(ends) +def code_indices_fast(ndarray[intp_t, ndim=1] index, ndarray[intp_t, ndim=1] obs_group_ids) -> dict: + cdef: + Py_ssize_t i, j, k, l + ndarray[intp_t, ndim=1] result + + result = np.empty(len(index)) + + for i in range(k): + result[obs_group_ids[i]] = [] + + for j in range(l): + result[index[j]].append(j) + return result + + def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys, list sorted_labels) -> dict: """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9424da35d3d4e..6de52701cae6d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -621,7 +621,17 @@ def _get_indices_by_codes(self, codes): Safe get multiple indices, translate keys for datelike to underlying repr. """ - return [self.grouper.code_indices.get(code, []) for code in codes] + # code_indices = self.grouper.code_indices + # return [self.grouper.code_indices.get(code, []) for code in codes] + from pandas.core.sorting import get_group_index + + group_index = get_group_index( + self.grouper.codes, self.grouper.shape, sort=True, xnull=True + ) + _, obs_group_ids, _ = self.grouper.group_info + + sorter = get_group_index_sorter(group_index, len(group_index)) + return sorter @final def _get_indices(self, names): @@ -1101,11 +1111,29 @@ def _set_result_index_ordered( result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result + # # row order is scrambled => sort the rows by position in original index + # original_positions = Index( + # np.concatenate(self._get_indices(self.grouper.result_index)) + # ) + # result.set_axis(original_positions, axis=self.axis, inplace=True) + # result = result.sort_index(axis=self.axis) + + # dropped_rows = len(result.index) < len(self.obj.index) + # + # if dropped_rows: + # # get index by slicing original index according to original positions + # # slice drops attrs => use set_axis when no rows were dropped + # sorted_indexer = result.index + # result.index = self._selected_obj.index[sorted_indexer] + # else: + # result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + # + # return result + # row order is scrambled => sort the rows by position in original index _, obs_group_ids, _ = self.grouper.group_info - original_positions = Index( - np.concatenate(self._get_indices_by_codes(obs_group_ids)) - ) + new = self._get_indices_by_codes(obs_group_ids) + original_positions = Index(new) result.set_axis(original_positions, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) obj_axis = self.obj._get_axis(self.axis) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7978970006437..92db80e3f3c23 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -787,7 +787,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def code_indices(self) -> dict[int, npt.NDArray[np.intp]]: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) _, obs_group_ids, _ = self.group_info - result = {e: np.where(group_index == e)[0] for e in obs_group_ids} + result = lib.code_indices_fast(group_index, obs_group_ids) return result @final From 2f32be165d025368a42b41f65a4e0a970c93f2ee Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 10 Feb 2022 22:43:49 -0500 Subject: [PATCH 03/11] BUG: Fix some cases of groupby(...).transform with dropna=True --- doc/source/whatsnew/v1.5.0.rst | 33 +++++++++++++++ pandas/core/groupby/groupby.py | 40 +++++++++++------- pandas/core/groupby/ops.py | 8 ++++ pandas/tests/groupby/conftest.py | 5 +++ pandas/tests/groupby/test_groupby_dropna.py | 42 +++++-------------- .../tests/groupby/transform/test_transform.py | 11 +++-- 6 files changed, 89 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4298809b0f2e4..d01a74e332c4f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -57,6 +57,39 @@ Styler - Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`) +.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna: + +Using ``dropna=True`` with ``groupby`` transformers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A transformer is an operation whose result has the same size as its input. When the +result is a :class:`DataFrame` or :class:`Series`, it is also required that the +index of the result is equal to that of the input. In pandas 1.4, using +:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null +values in the groups and ``dropna=True`` gave incorrect results. Demonstrated by the +examples below, the incorrect results either contained incorrect values, or the result +did not have the same index as the input. + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]}) + +*Old behavior*: + +.. code-block:: ipython + + In [3]: df.groupby('a', dropna=True).transform(lambda x: x) + Out[3]: + b + 0 2 + 1 3 + +*New behavior*: + +.. ipython:: python + + df.groupby('a', dropna=True).transform(lambda x: x) + .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: notable_bug_fix2 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4eb907e06adf1..75c8c1cbb4cd4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,11 +108,15 @@ class providing the base-class of operations. CategoricalIndex, Index, MultiIndex, + RangeIndex, ) from pandas.core.internals.blocks import ensure_block_shape import pandas.core.sample as sample from pandas.core.series import Series -from pandas.core.sorting import get_group_index_sorter +from pandas.core.sorting import ( + get_group_index, + get_group_index_sorter, +) from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, maybe_use_numba, @@ -614,6 +618,20 @@ def indices(self): """ return self.grouper.indices + @final + def _get_indices_by_codes(self, codes): + """ + Safe get multiple indices by code. + """ + group_index = get_group_index( + codes, self.grouper.shape, sort=True, xnull=self.dropna + ) + if self.dropna: + # group_index can only be negative when passing xnull=True + group_index = group_index[np.where(group_index >= 0)] + sorter = get_group_index_sorter(group_index, len(group_index)) + return sorter + @final def _get_indices(self, names): """ @@ -1093,21 +1111,15 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) - ) + original_positions = Index(self._get_indices_by_codes(self.grouper.codes)) result.set_axis(original_positions, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - - dropped_rows = len(result.index) < len(self.obj.index) - - if dropped_rows: - # get index by slicing original index according to original positions - # slice drops attrs => use set_axis when no rows were dropped - sorted_indexer = result.index - result.index = self._selected_obj.index[sorted_indexer] - else: - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + obj_axis = self.obj._get_axis(self.axis) + if self.grouper.has_dropped_na: + # Fill in any missing values due to dropna - index here is integral + # with values referring to the row of the input so can use RangeIndex + result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) + result.set_axis(obj_axis, axis=self.axis, inplace=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d4aa6ae9f4059..6bb6973a65071 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -825,6 +825,14 @@ def is_monotonic(self) -> bool: # return if my group orderings are monotonic return Index(self.group_info[0]).is_monotonic_increasing + @final + @cache_readonly + def has_dropped_na(self) -> bool: + """ + Whether grouper has null value(s) that are dropped. + """ + return bool((self.group_info[0] < 0).any()) + @cache_readonly def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: comp_ids, obs_group_ids = self._get_compressed_codes() diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 403e3edcc34d0..58d9e500554dd 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -19,6 +19,11 @@ def as_index(request): return request.param +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + @pytest.fixture def mframe(multiindex_dataframe_random_data): return multiindex_dataframe_random_data diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab568e24ff029..9fcc28d8f65a9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna): @pytest.mark.parametrize( - "dropna,input_index,expected_data,expected_index", + "index", [ - (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)), - (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")), - ( - True, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"] - ), - ), - (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)), - (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")), - ( - False, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - ), + pd.RangeIndex(0, 4), + list("abcd"), + pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]), ], ) -def test_groupby_dataframe_slice_then_transform( - dropna, input_index, expected_data, expected_index -): +def test_groupby_dataframe_slice_then_transform(dropna, index): # GH35014 & GH35612 + expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]} - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index) gb = df.groupby("A", dropna=dropna) result = gb.transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb[["B"]].transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb["B"].transform(len) - expected = pd.Series(expected_data["B"], index=expected_index, name="B") + expected = pd.Series(expected_data["B"], index=index, name="B") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6e9b8c35b3698..1ea0fc3f0e3db 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1281,9 +1281,12 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -def test_null_group_lambda_self(): +# Test both monotonic increasing and not +@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]]) +def test_null_group_lambda_self(dropna, keys): # GH 17093 - df = DataFrame({"A": [1, np.nan], "B": [1, 1]}) - result = df.groupby("A").transform(lambda x: x) - expected = DataFrame([1], columns=["B"]) + df = DataFrame({"A": keys, "B": [1, 2, 3]}) + result = df.groupby("A", dropna=dropna).transform(lambda x: x) + value = np.nan if dropna else 3 + expected = DataFrame([1, 2, value], columns=["B"]) tm.assert_frame_equal(result, expected) From 5e92af4c8e39bd144919b3c6bbedf045637c3af0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 11 Feb 2022 22:34:26 -0500 Subject: [PATCH 04/11] Better conditional --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 75c8c1cbb4cd4..eca4f0ed4e6fa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -626,8 +626,7 @@ def _get_indices_by_codes(self, codes): group_index = get_group_index( codes, self.grouper.shape, sort=True, xnull=self.dropna ) - if self.dropna: - # group_index can only be negative when passing xnull=True + if self.grouper.has_dropped_na: group_index = group_index[np.where(group_index >= 0)] sorter = get_group_index_sorter(group_index, len(group_index)) return sorter From 3919c515c2f20a922663c0587bb71b5150898d1b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 11 Feb 2022 23:14:14 -0500 Subject: [PATCH 05/11] Remove xfail --- pandas/tests/extension/test_string.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4256142556894..73682620b8353 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under2p0 - import pandas as pd from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests): class TestGroupBy(base.BaseGroupbyTests): def test_groupby_extension_transform(self, data_for_grouping, request): - if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0: - # failure observed in 1.0.1, not in 2.0 or later - mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]") - request.node.add_marker(mark) super().test_groupby_extension_transform(data_for_grouping) From 31240e179f32caab5554fe7bc983b321013b7958 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 11 Feb 2022 23:54:11 -0500 Subject: [PATCH 06/11] Fixup & type-hints --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index eca4f0ed4e6fa..67de98578496e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -619,12 +619,12 @@ def indices(self): return self.grouper.indices @final - def _get_indices_by_codes(self, codes): + def _get_indices_by_codes(self, codes: list[np.ndarray]) -> npt.NDArray[np.intp]: """ Safe get multiple indices by code. """ group_index = get_group_index( - codes, self.grouper.shape, sort=True, xnull=self.dropna + codes, self.grouper.shape, sort=True, xnull=self.grouper.dropna ) if self.grouper.has_dropped_na: group_index = group_index[np.where(group_index >= 0)] From d3cbf503ea024b18b8b258ad14a54d498c4d7f60 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Feb 2022 11:30:20 -0500 Subject: [PATCH 07/11] WIP --- .../tests/groupby/transform/test_transform.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 1ea0fc3f0e3db..6349bad3ed135 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1281,12 +1281,22 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -# Test both monotonic increasing and not -@pytest.mark.parametrize("keys", [[1, 2, np.nan], [2, 1, np.nan]]) -def test_null_group_lambda_self(dropna, keys): +def test_null_group_lambda_self(dropna): # GH 17093 - df = DataFrame({"A": keys, "B": [1, 2, 3]}) + keys = np.random.randint(0, 5, size=5).astype(float) + nulls = np.random.choice([0, 1], keys.shape).astype(bool) + keys[nulls] = np.nan + values = np.random.randint(0, 5, size=keys.shape) + df = DataFrame({"A": keys, "B": values}) + + expected_values = values + if dropna: + expected_values = expected_values.astype(float) + expected_values[nulls] = np.nan + expected = DataFrame(expected_values, columns=["B"]) + result = df.groupby("A", dropna=dropna).transform(lambda x: x) - value = np.nan if dropna else 3 - expected = DataFrame([1, 2, value], columns=["B"]) + print(df) + print(result) + print(expected) tm.assert_frame_equal(result, expected) From c60da747a9a64f86519e1bb4cfce697357058373 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Feb 2022 12:50:54 -0500 Subject: [PATCH 08/11] Fix implementation, better comments/tests. --- pandas/_libs/lib.pyx | 15 ------------ pandas/core/groupby/groupby.py | 20 ++-------------- pandas/core/groupby/ops.py | 24 +++++++++++++++++++ .../tests/groupby/transform/test_transform.py | 13 +++++----- 4 files changed, 32 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 95782b583a565..c86eb80da93f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -944,21 +944,6 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): return np.asarray(starts), np.asarray(ends) -def code_indices_fast(ndarray[intp_t, ndim=1] index, ndarray[intp_t, ndim=1] obs_group_ids) -> dict: - cdef: - Py_ssize_t i, j, k, l - ndarray[intp_t, ndim=1] result - - result = np.empty(len(index)) - - for i in range(k): - result[obs_group_ids[i]] = [] - - for j in range(l): - result[index[j]].append(j) - return result - - def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys, list sorted_labels) -> dict: """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 67de98578496e..335ef3d254173 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -113,10 +113,7 @@ class providing the base-class of operations. from pandas.core.internals.blocks import ensure_block_shape import pandas.core.sample as sample from pandas.core.series import Series -from pandas.core.sorting import ( - get_group_index, - get_group_index_sorter, -) +from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, maybe_use_numba, @@ -618,19 +615,6 @@ def indices(self): """ return self.grouper.indices - @final - def _get_indices_by_codes(self, codes: list[np.ndarray]) -> npt.NDArray[np.intp]: - """ - Safe get multiple indices by code. - """ - group_index = get_group_index( - codes, self.grouper.shape, sort=True, xnull=self.grouper.dropna - ) - if self.grouper.has_dropped_na: - group_index = group_index[np.where(group_index >= 0)] - sorter = get_group_index_sorter(group_index, len(group_index)) - return sorter - @final def _get_indices(self, names): """ @@ -1110,7 +1094,7 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self._get_indices_by_codes(self.grouper.codes)) + original_positions = Index(self.grouper.result_ilocs()) result.set_axis(original_positions, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) obj_axis = self.obj._get_axis(self.axis) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6bb6973a65071..95f73cc9bab1c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -783,6 +783,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) + @final + def result_ilocs(self) -> npt.NDArray[np.intp]: + """ + Get the original integer locations of result_index in the input. + """ + # Original indices are where group_index would go via sorting. + # But when dropna is true, we need to remove null values whilst accounting for + # any gaps that then occur because of them. + group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True) + + if self.has_dropped_na: + mask = np.where(group_index >= 0) + # Count how many gaps are caused by previous null values for each position + null_gaps = np.cumsum(group_index == -1)[mask] + group_index = group_index[mask] + + result = get_group_index_sorter(group_index, self.ngroups) + + if self.has_dropped_na: + # Shift by the number of prior null gaps + result += np.take(null_gaps, result) + + return result + @final @property def codes(self) -> list[np.ndarray]: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6349bad3ed135..9e8a1ff233575 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1281,22 +1281,21 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -def test_null_group_lambda_self(dropna): +def test_null_group_lambda_self(sort, dropna): # GH 17093 - keys = np.random.randint(0, 5, size=5).astype(float) + np.random.seed(0) + keys = np.random.randint(0, 5, size=50).astype(float) nulls = np.random.choice([0, 1], keys.shape).astype(bool) keys[nulls] = np.nan values = np.random.randint(0, 5, size=keys.shape) df = DataFrame({"A": keys, "B": values}) expected_values = values - if dropna: + if dropna and nulls.any(): expected_values = expected_values.astype(float) expected_values[nulls] = np.nan expected = DataFrame(expected_values, columns=["B"]) - result = df.groupby("A", dropna=dropna).transform(lambda x: x) - print(df) - print(result) - print(expected) + gb = df.groupby("A", dropna=dropna, sort=sort) + result = gb.transform(lambda x: x) tm.assert_frame_equal(result, expected) From 80b662983213856e3e910d3c704f79f9e73d370f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Feb 2022 13:03:04 -0500 Subject: [PATCH 09/11] Fix comment --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 335ef3d254173..289b445af1c24 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1099,7 +1099,7 @@ def _set_result_index_ordered( result = result.sort_index(axis=self.axis) obj_axis = self.obj._get_axis(self.axis) if self.grouper.has_dropped_na: - # Fill in any missing values due to dropna - index here is integral + # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) result.set_axis(obj_axis, axis=self.axis, inplace=True) From da4201a2d14a41010f177121e93d5d7b74a72c3b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 16 Feb 2022 21:28:25 -0500 Subject: [PATCH 10/11] Improve comment/whatsnew --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- pandas/core/groupby/ops.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ccedaba977080..8e1cc22197301 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -62,9 +62,9 @@ Styler Using ``dropna=True`` with ``groupby`` transformers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A transformer is an operation whose result has the same size as its input. When the +A transform is an operation whose result has the same size as its input. When the result is a :class:`DataFrame` or :class:`Series`, it is also required that the -index of the result is equal to that of the input. In pandas 1.4, using +index of the result matches that of the input. In pandas 1.4, using :meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null values in the groups and ``dropna=True`` gave incorrect results. Demonstrated by the examples below, the incorrect results either contained incorrect values, or the result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d233fdcbb263d..884bf3c1bcc48 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -789,7 +789,7 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: Get the original integer locations of result_index in the input. """ # Original indices are where group_index would go via sorting. - # But when dropna is true, we need to remove null values whilst accounting for + # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True) From e589e11b9f56970d3dd0bbd4efa06953fe7ef3a8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 16 Feb 2022 23:35:44 -0500 Subject: [PATCH 11/11] missed the title --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8e1cc22197301..00ce71609a3d8 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -59,8 +59,8 @@ Styler .. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna: -Using ``dropna=True`` with ``groupby`` transformers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Using ``dropna=True`` with ``groupby`` transforms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A transform is an operation whose result has the same size as its input. When the result is a :class:`DataFrame` or :class:`Series`, it is also required that the