From 65abc5e2b374bd9268dc1767313ce8bf6ef50b02 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 14:27:50 -0500 Subject: [PATCH 1/2] BUG: DataFrame/Series.transform with list and non-list dict values --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/apply.py | 47 +++++++++++---------- pandas/tests/apply/test_frame_transform.py | 11 +++++ pandas/tests/apply/test_series_transform.py | 13 ++++++ 4 files changed, 50 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 32a2514b3b6a3..54f5ea720f673 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -327,6 +327,7 @@ Numeric - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) +- Bug in :meth:`DataFrame.transform` and :meth:`Series.transform` would have incorrect column labels when passed a dictionary with a mix of list and non-list values (:issue:`40018`) - Conversion diff --git a/pandas/core/apply.py b/pandas/core/apply.py index db4203e5158ef..3ea07e4425a51 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -280,7 +280,7 @@ def transform_dict_like(self, func): if len(func) == 0: raise ValueError("No transform functions were provided") - self.validate_dictlike_arg("transform", obj, func) + func = self.validate_dictlike_arg("transform", obj, func) results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): @@ -421,32 +421,17 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: ------- Result of aggregation. 
""" + from pandas.core.reshape.concat import concat + obj = self.obj arg = cast(AggFuncTypeDict, self.f) - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") selected_obj = obj._selected_obj - self.validate_dictlike_arg("agg", selected_obj, arg) - - # if we have a dict of any non-scalars - # eg. {'A' : ['mean']}, normalize all to - # be list-likes - # Cannot use arg.values() because arg may be a Series - if any(is_aggregator(x) for _, x in arg.items()): - new_arg: AggFuncTypeDict = {} - for k, v in arg.items(): - if not isinstance(v, (tuple, list, dict)): - new_arg[k] = [v] - else: - new_arg[k] = v - arg = new_arg - - from pandas.core.reshape.concat import concat + arg = self.validate_dictlike_arg("agg", selected_obj, arg) if selected_obj.ndim == 1: # key only used for output @@ -542,12 +527,13 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: def validate_dictlike_arg( self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict - ) -> None: + ) -> AggFuncTypeDict: """ - Raise if dict-like argument is invalid. + Handler for dict-like argument. Ensures that necessary columns exist if obj is a DataFrame, and - that a nested renamer is not passed. + that a nested renamer is not passed. Also normalizes to all lists + when values consists of a mix of list and non-lists. """ assert how in ("apply", "agg", "transform") @@ -567,6 +553,23 @@ def validate_dictlike_arg( cols_sorted = list(safe_sort(list(cols))) raise KeyError(f"Column(s) {cols_sorted} do not exist") + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + # Cannot use func.values() because func may be a Series + if any(is_aggregator(x) for _, x in func.items()): + new_func: AggFuncTypeDict = {} + for k, v in func.items(): + if not is_aggregator(v): + # mypy can't realize v is not a list here + new_func[k] = [v] # type:ignore[list-item] + else: + new_func[k] = v + func = new_func + return func + class FrameApply(Apply): obj: DataFrame diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 1888ddd8ec4aa..47bc69656a597 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -103,6 +103,17 @@ def test_transform_dictlike(axis, float_frame, box): tm.assert_frame_equal(result, expected) +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and non-lists in values of a dictionary + df = DataFrame({"a": [1, 2], "b": [1, 4], "c": [1, 4]}) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "ops", [ diff --git a/pandas/tests/apply/test_series_transform.py b/pandas/tests/apply/test_series_transform.py index e67ea4f14e4ac..24d619cb2bbb1 100644 --- a/pandas/tests/apply/test_series_transform.py +++ b/pandas/tests/apply/test_series_transform.py @@ -2,6 +2,8 @@ import pytest from pandas import ( + DataFrame, + MultiIndex, Series, concat, ) @@ -55,6 +57,17 @@ def test_transform_dictlike(string_series, box): tm.assert_frame_equal(result, expected) +def test_transform_dictlike_mixed(): + # GH 40018 - mix of lists and non-lists in values of a dictionary + df = Series([1, 4]) + result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"}) + expected = DataFrame( + [[1.0, 1, 1.0], [2.0, 4, 2.0]], + columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), 
(0, 1, 0)]), + ) + tm.assert_frame_equal(result, expected) + + def test_transform_wont_agg(string_series): # GH 35964 # we are trying to transform with an aggregator From 5f7dac2509339e2b9f14c709fef7f0c79865257b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 27 Feb 2021 08:43:18 -0500 Subject: [PATCH 2/2] Moved whatsnew note, renamed to normalize --- doc/source/whatsnew/v1.2.3.rst | 2 ++ doc/source/whatsnew/v1.3.0.rst | 1 - pandas/core/apply.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.3.rst b/doc/source/whatsnew/v1.2.3.rst index f72ee78bf243a..99e997189d7b8 100644 --- a/doc/source/whatsnew/v1.2.3.rst +++ b/doc/source/whatsnew/v1.2.3.rst @@ -24,6 +24,8 @@ Fixed regressions Passing ``ascending=None`` is still considered invalid, and the new error message suggests a proper usage (``ascending`` must be a boolean or a list-like boolean). +- Fixed regression in :meth:`DataFrame.transform` and :meth:`Series.transform` giving incorrect column labels when passed a dictionary with a mix of list and non-list values (:issue:`40018`) +- .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 54f5ea720f673..32a2514b3b6a3 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -327,7 +327,6 @@ Numeric - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) -- Bug in :meth:`DataFrame.transform` and :meth:`Series.transform` would have incorrect column labels when passed a dictionary with a mix of list and non-list values (:issue:`40018`) - Conversion diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3ea07e4425a51..970629f4abfe9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -280,7 +280,7 @@ def transform_dict_like(self, func): if len(func) == 0: raise ValueError("No transform functions were provided") - func = self.validate_dictlike_arg("transform", obj, func) + func = self.normalize_dictlike_arg("transform", obj, func) results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): @@ -431,7 +431,7 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: selected_obj = obj._selected_obj - arg = self.validate_dictlike_arg("agg", selected_obj, arg) + arg = self.normalize_dictlike_arg("agg", selected_obj, arg) if selected_obj.ndim == 1: # key only used for output @@ -525,7 +525,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) - def validate_dictlike_arg( + def normalize_dictlike_arg( self, how: str, obj: 
FrameOrSeriesUnion, func: AggFuncTypeDict ) -> AggFuncTypeDict: """