From eb793c07a0b4320d3b6d42fe78e495f023bd69a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Tue, 26 Mar 2024 14:45:09 +0000 Subject: [PATCH 1/3] Fix #57069: DataFrameGroupBy.transform with numba returning the wrong order with non monotonically increasing indexes Fixed a bug that was returning the wrong order unless the index was monotonically increasing while utilizing DataFrameGroupBy.transform with engine='numba' Fixed the test "pandas/tests/groupby/transform/test_numba.py::test_index_data_correctly_passed" to expect a result in the correct order Added a test "pandas/tests/groupby/transform/test_numba.py::test_index_order_consistency_preserved" to test DataFrameGroupBy.transform with engine='numba' with a decreasing index Updated whatsnew to reflect changes --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/groupby.py | 3 ++- pandas/tests/groupby/transform/test_numba.py | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a398b93b60018..2f23a240bdcd1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -303,6 +303,7 @@ Bug fixes - Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Fixed bug in :meth:`DataFrameGroupBy.transform` with ``engine="numba"`` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b61938d474b9..bd8e222831d0c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1439,6 +1439,7 @@ def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): data and indices into a Numba jitted function. """ data = self._obj_with_exclusions + index_sorting = self._grouper.result_ilocs df = data if data.ndim == 2 else data.to_frame() starts, ends, sorted_index, sorted_data = self._numba_prep(df) @@ -1456,7 +1457,7 @@ def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): ) # result values needs to be resorted to their original positions since we # evaluated the data sorted by group - result = result.take(np.argsort(sorted_index), axis=0) + result = result.take(np.argsort(index_sorting), axis=0) index = data.index if data.ndim == 1: result_kwargs = {"name": data.name} diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index b75113d3f4e14..653a460913125 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -181,10 +181,24 @@ def f(values, index): df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) result = df.groupby("group").transform(f, engine="numba") - expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3]) + expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=[-1, -2, -3]) tm.assert_frame_equal(result, expected) +def test_index_order_consistency_preserved(): + # GH 57069 + pytest.importorskip("numba") + + def f(values, index): + return values + + df = DataFrame({"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}) + df.index = df.index.values[::-1] + result = df.groupby("group")["vals"].transform(f, engine="numba") + 
expected = Series([0.0, 1.0, 2.0, 3.0], index=[3, 2, 1, 0], name="vals") + tm.assert_series_equal(result, expected) + + def test_engine_kwargs_not_cached(): # If the user passes a different set of engine_kwargs don't return the same # jitted function From 5dcc366fe6123666168f35ab40096fca5101beb0 Mon Sep 17 00:00:00 2001 From: Andre Correia Date: Wed, 27 Mar 2024 18:41:10 +0000 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/groupby/transform/test_numba.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 653a460913125..24a709d1d08c7 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -192,10 +192,9 @@ def test_index_order_consistency_preserved(): def f(values, index): return values - df = DataFrame({"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}) - df.index = df.index.values[::-1] + df = DataFrame({"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=range(3, -1, -1)) result = df.groupby("group")["vals"].transform(f, engine="numba") - expected = Series([0.0, 1.0, 2.0, 3.0], index=[3, 2, 1, 0], name="vals") + expected = Series([0.0, 1.0, 2.0, 3.0], index=range(3, -1, -1), name="vals") tm.assert_series_equal(result, expected) From 76b18d333f9ef831371dfb80e47c63f0143e358d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Thu, 28 Mar 2024 17:29:08 +0000 Subject: [PATCH 3/3] Fixed pre-commit requirements --- pandas/tests/groupby/transform/test_numba.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 24a709d1d08c7..a17d25b2e7e2e 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ 
-192,7 +192,9 @@ def test_index_order_consistency_preserved(): def f(values, index): return values - df = DataFrame({"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=range(3, -1, -1)) + df = DataFrame( + {"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=range(3, -1, -1) + ) result = df.groupby("group")["vals"].transform(f, engine="numba") expected = Series([0.0, 1.0, 2.0, 3.0], index=range(3, -1, -1), name="vals") tm.assert_series_equal(result, expected)