From 8b42961bf6d97fff3f0f0ab667e8b5c0724d4595 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Jun 2022 12:00:46 -0400 Subject: [PATCH 1/4] BUG: groupby.transform not aligning with input index --- doc/source/user_guide/groupby.rst | 5 ++- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/groupby/generic.py | 2 + pandas/core/groupby/groupby.py | 9 +++-- .../tests/groupby/transform/test_transform.py | 37 +++++++++++++++++++ 5 files changed, 48 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index f2d83885df2d0..146529b4b1bf9 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -761,12 +761,13 @@ different dtypes, then a common dtype will be determined in the same way as ``Da Transformation -------------- -The ``transform`` method returns an object that is indexed the same (same size) +The ``transform`` method returns an object that is indexed the same as the one being grouped. The transform function must: * Return a result that is either the same size as the group chunk or broadcastable to the size of the group chunk (e.g., a scalar, - ``grouped.transform(lambda x: x.iloc[-1])``). + ``grouped.transform(lambda x: x.iloc[-1])``). When the result is a Series + or DataFrame, alignment with the group chunk's index will be performed. * Operate column-by-column on the group chunk. The transform is applied to the first group chunk using chunk.apply. * Not perform in-place operations on the group chunk. Group chunks should diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6bf6fd65f5633..581e351ddb2ff 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -865,6 +865,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`) - Bug in :meth:`.DataFrameGroupBy.describe` and :meth:`.SeriesGroupBy.describe` produces inconsistent results for empty datasets (:issue:`41575`) - Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`) +- Bug in :meth:`DataFrameGroupBy.transform` not aligning the result when the user returned a Series or DataFrame (:issue:`45648`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5d215ec81a6cd..5832fcdaf9981 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1863,5 +1863,7 @@ def _wrap_transform_general_frame( ) assert isinstance(res_frame, DataFrame) return res_frame + elif isinstance(res, DataFrame): + return obj._constructor(res, index=group.index) else: return res diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index af2a5579bf1cd..80bea21a68515 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -373,14 +373,14 @@ class providing the base-class of operations. """ _transform_template = """ -Call function producing a like-indexed %(klass)s on each group and +Call function producing a same-indexed %(klass)s on each group and return a %(klass)s having the same indexes as the original object filled with the transformed values. Parameters ---------- f : function - Function to apply to each group. + Function to apply to each group. See the Notes section below for requirements. Can also accept a Numba JIT function with ``engine='numba'`` specified. @@ -435,11 +435,12 @@ class providing the base-class of operations. * f must return a value that either has the same shape as the input subframe or can be broadcast to the shape of the input subframe. For example, if `f` returns a scalar it will be broadcast to have the - same shape as the input subframe. + same shape as the input subframe. When the result is a Series or DataFrame, + alignment with the group chunk's index will be performed. * if this is a DataFrame, f must support application column-by-column in the subframe. If f also supports application to the entire subframe, then a fast path is used starting from the second chunk. -* f must not mutate groups. Mutation is not supported and may +* f must not mutate group chunks. Mutation is not supported and may produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. When using ``engine='numba'``, there will be no "fall back" behavior internally. diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index b325edaf2b1ea..c676e3c16ab76 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1531,3 +1531,40 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func) result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_values", + [ + (Series.sort_values, [5, 4, 3, 2, 1]), + (lambda x: x.head(1), [5.0, np.nan, 3.0, 2.0, np.nan]), + ], +) +@pytest.mark.parametrize("series", [True, False]) +@pytest.mark.parametrize( + "index", + [ + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + ], +) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +@pytest.mark.parametrize("keys_in_index", [True, False]) +def test_transform_aligns(func, expected_values, series, index, keys, keys_in_index): + # GH#45648 - transform should align with the input's index + df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]}, index=index) + if "a2" in keys: + df["a2"] = df["a1"] + if keys_in_index: + df = df.set_index(keys, append=True) + + gb = df.groupby(keys) + if series: + gb = gb["b"] + + result = gb.transform(func) + + expected = DataFrame({"b": expected_values}, index=df.index) + if series: + expected = expected["b"] + tm.assert_equal(result, expected) From f779b0c9eff68c39d6efce8ce7aa7ec4fe59b907 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Jun 2022 12:44:54 -0400 Subject: [PATCH 2/4] Change to deprecation --- doc/source/user_guide/groupby.rst | 10 ++++- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/groupby/generic.py | 20 +++++++++- pandas/core/groupby/groupby.py | 12 ++++-- .../tests/groupby/transform/test_transform.py | 40 ++++++++++--------- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 146529b4b1bf9..d668c34922137 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -766,8 +766,7 @@ as the one being grouped. The transform function must: * Return a result that is either the same size as the group chunk or broadcastable to the size of the group chunk (e.g., a scalar, - ``grouped.transform(lambda x: x.iloc[-1])``). When the result is a Series - or DataFrame, alignment with the group chunk's index will be performed. + ``grouped.transform(lambda x: x.iloc[-1])``). * Operate column-by-column on the group chunk. The transform is applied to the first group chunk using chunk.apply. * Not perform in-place operations on the group chunk. Group chunks should @@ -777,6 +776,13 @@ as the one being grouped. The transform function must: * (Optionally) operates on the entire group chunk. If this is supported, a fast path is used starting from the *second* chunk. +.. deprecated:: 1.5.0 + When using ``.transform`` on a grouped DataFrame and the transformation function + returns a DataFrame, currently pandas does not align the result's index + with the input's index. This behavior is deprecated and alignment will + be performed in a future version of pandas. You can apply ``.to_numpy()`` to the + result of the transformation function to avoid alignment. + Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the transformation function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 581e351ddb2ff..a7a16fb6935a9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -675,6 +675,7 @@ Other Deprecations - Deprecated the ``closed`` argument in :class:`IntervalArray` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) - Deprecated the ``closed`` argument in :class:`intervaltree` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) +- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: @@ -865,7 +866,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`) - Bug in :meth:`.DataFrameGroupBy.describe` and :meth:`.SeriesGroupBy.describe` produces inconsistent results for empty datasets (:issue:`41575`) - Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`) -- Bug in :meth:`DataFrameGroupBy.transform` not aligning the result when the user returned a Series or DataFrame (:issue:`45648`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5832fcdaf9981..1f8d7a2a17c52 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1196,14 +1196,32 @@ def _transform_general(self, func, *args, **kwargs): applied.append(res) # Compute and process with the remaining groups + emit_alignment_warning = False for name, group in gen: if group.size == 0: continue object.__setattr__(group, "name", name) res = path(group) + if ( + not emit_alignment_warning + and res.ndim == 2 + and not res.index.equals(group.index) + ): + emit_alignment_warning = True + res = _wrap_transform_general_frame(self.obj, group, res) applied.append(res) + if emit_alignment_warning: + warnings.warn( + "In a future version of pandas, returning a DataFrame in " + "groupby.transform will align with the input's index. Apply " + "`.to_numpy()` to the result in the transform function to keep " + "the current behavior and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) @@ -1863,7 +1881,5 @@ def _wrap_transform_general_frame( ) assert isinstance(res_frame, DataFrame) return res_frame - elif isinstance(res, DataFrame): - return obj._constructor(res, index=group.index) else: return res diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 80bea21a68515..2e9c031edcefd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -435,12 +435,11 @@ class providing the base-class of operations. * f must return a value that either has the same shape as the input subframe or can be broadcast to the shape of the input subframe. For example, if `f` returns a scalar it will be broadcast to have the - same shape as the input subframe. When the result is a Series or DataFrame, - alignment with the group chunk's index will be performed. + same shape as the input subframe. * if this is a DataFrame, f must support application column-by-column in the subframe. If f also supports application to the entire subframe, then a fast path is used starting from the second chunk. -* f must not mutate group chunks. Mutation is not supported and may +* f must not mutate groups. Mutation is not supported and may produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. When using ``engine='numba'``, there will be no "fall back" behavior internally. @@ -452,6 +451,13 @@ class providing the base-class of operations. The resulting dtype will reflect the return value of the passed ``func``, see the examples below. +.. deprecated:: 1.5.0 + When using ``.transform`` on a grouped DataFrame and the transformation function + returns a DataFrame, currently pandas does not align the result's index + with the input's index. This behavior is deprecated and alignment will + be performed in a future version of pandas. You can apply ``.to_numpy()`` to the + result of the transformation function to avoid alignment. + Examples -------- diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c676e3c16ab76..5c64ba3d9e266 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1534,25 +1534,20 @@ def test_null_group_str_transformer_series(request, dropna, transformation_func) @pytest.mark.parametrize( - "func, expected_values", + "func, series, expected_values", [ - (Series.sort_values, [5, 4, 3, 2, 1]), - (lambda x: x.head(1), [5.0, np.nan, 3.0, 2.0, np.nan]), - ], -) -@pytest.mark.parametrize("series", [True, False]) -@pytest.mark.parametrize( - "index", - [ - [1, 2, 3, 4, 5], - [5, 4, 3, 2, 1], + (Series.sort_values, False, [4, 5, 3, 1, 2]), + (lambda x: x.head(1), False, ValueError), + # SeriesGroupBy already has correct behavior + (Series.sort_values, True, [5, 4, 3, 2, 1]), + (lambda x: x.head(1), True, [5.0, np.nan, 3.0, 2.0, np.nan]), ], ) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) @pytest.mark.parametrize("keys_in_index", [True, False]) -def test_transform_aligns(func, expected_values, series, index, keys, keys_in_index): +def test_transform_aligns_depr(func, series, expected_values, keys, keys_in_index): # GH#45648 - transform should align with the input's index - df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]}, index=index) + df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]}) if "a2" in keys: df["a2"] = df["a1"] if keys_in_index: @@ -1562,9 +1557,16 @@ def test_transform_aligns(func, expected_values, series, index, keys, keys_in_in if series: gb = gb["b"] - result = gb.transform(func) - - expected = DataFrame({"b": expected_values}, index=df.index) - if series: - expected = expected["b"] - tm.assert_equal(result, expected) + warn = None if series else FutureWarning + msg = "returning a DataFrame in groupby.transform will align" + if expected_values is ValueError: + with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(ValueError, match="Length mismatch"): + gb.transform(func) + else: + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(func) + expected = DataFrame({"b": expected_values}, index=df.index) + if series: + expected = expected["b"] + tm.assert_equal(result, expected) From 9b69cb62082307873c152977ed149dc9f6f65d9e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 6 Jun 2022 17:13:52 -0400 Subject: [PATCH 3/4] newlines and GH# --- doc/source/user_guide/groupby.rst | 1 + pandas/core/groupby/generic.py | 1 + pandas/core/groupby/groupby.py | 1 + 3 files changed, 3 insertions(+) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index d668c34922137..ba3fb17cc8764 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -777,6 +777,7 @@ as the one being grouped. The transform function must: fast path is used starting from the *second* chunk. .. deprecated:: 1.5.0 + When using ``.transform`` on a grouped DataFrame and the transformation function returns a DataFrame, currently pandas does not align the result's index with the input's index. This behavior is deprecated and alignment will diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f8d7a2a17c52..5a62ca9d0e1f3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1213,6 +1213,7 @@ def _transform_general(self, func, *args, **kwargs): applied.append(res) if emit_alignment_warning: + # GH#45648 warnings.warn( "In a future version of pandas, returning a DataFrame in " "groupby.transform will align with the input's index. Apply " diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2e9c031edcefd..391d77db8a5fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -452,6 +452,7 @@ class providing the base-class of operations. see the examples below. .. deprecated:: 1.5.0 + When using ``.transform`` on a grouped DataFrame and the transformation function returns a DataFrame, currently pandas does not align the result's index with the input's index. This behavior is deprecated and alignment will From d28f643a4d6198e3a75efb4f1c1a03ed9e7f6b58 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jun 2022 09:08:30 -0400 Subject: [PATCH 4/4] merge cleanup --- doc/source/whatsnew/v1.5.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index fdd6013308979..55bfb044fb31d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -690,7 +690,6 @@ Other Deprecations - Deprecated the ``closed`` argument in :class:`intervaltree` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) - Deprecated allowing ``unit="M"`` or ``unit="Y"`` in :class:`Timestamp` constructor with a non-round float value (:issue:`47267`) -- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) - Deprecated the ``display.column_space`` global configuration option (:issue:`7576`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) -