From 61d4602fbb43007026a9191ef9a44907068cd545 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 14:27:50 -0500 Subject: [PATCH 1/6] REF: Consolidate validation of dictionary argument in agg/transform --- pandas/core/apply.py | 76 +++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 46b1e5b20ce3a..9e9d80ef52f27 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -273,17 +273,7 @@ def transform_dict_like(self, func): if len(func) == 0: raise ValueError("No transform functions were provided") - if obj.ndim != 1: - # Check for missing columns on a frame - cols = set(func.keys()) - set(obj.columns) - if len(cols) > 0: - cols_sorted = list(safe_sort(list(cols))) - raise SpecificationError(f"Column(s) {cols_sorted} do not exist") - - # Can't use func.values(); wouldn't work for a Series - if any(is_dict_like(v) for _, v in func.items()): - # GH 15931 - deprecation of renaming keys - raise SpecificationError("nested renamer is not supported") + self.validate_dictlike_arg("transform", obj, func) results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): @@ -434,6 +424,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: selected_obj = obj._selected_obj + self.validate_dictlike_arg("agg", selected_obj, arg) + # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes @@ -445,43 +437,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: new_arg[k] = [v] else: new_arg[k] = v - - # the keys must be in the columns - # for ndim=2, or renamers for ndim=1 - - # ok for now, but deprecated - # {'A': { 'ra': 'mean' }} - # {'A': { 'ra': ['mean'] }} - # {'ra': ['mean']} - - # not ok - # {'ra' : { 'A' : 'mean' }} - if isinstance(v, dict): - raise SpecificationError("nested renamer is not supported") - elif isinstance(selected_obj, ABCSeries): - raise SpecificationError("nested renamer is not supported") - elif ( - isinstance(selected_obj, ABCDataFrame) - and k not in selected_obj.columns - ): - raise KeyError(f"Column '{k}' does not exist!") - arg = new_arg - else: - # deprecation of renaming keys - # GH 15931 - keys = list(arg.keys()) - if isinstance(selected_obj, ABCDataFrame) and len( - selected_obj.columns.intersection(keys) - ) != len(keys): - cols = list( - safe_sort( - list(set(keys) - set(selected_obj.columns.intersection(keys))), - ) - ) - raise SpecificationError(f"Column(s) {cols} do not exist") - from pandas.core.reshape.concat import concat if selected_obj.ndim == 1: @@ -567,6 +524,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) + def validate_dictlike_arg( + self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict + ) -> None: + """ + Raise if dict-like argument is invalid. + + Ensures that necessary columns exist if obj is a DataFrame, and + that a nested renamer is not passed. + """ + assert how in ("apply", "agg", "transform") + + if obj.ndim != 1: + # Check for missing columns on a frame + cols = set(func.keys()) - set(obj.columns) + if len(cols) > 0: + cols_sorted = list(safe_sort(list(cols))) + raise SpecificationError(f"Column(s) {cols_sorted} do not exist") + + # Can't use func.values(); wouldn't work for a Series + if ( + how == "agg" + and isinstance(obj, ABCSeries) + and any(is_list_like(v) for _, v in func.items()) + ) or (any(is_dict_like(v) for _, v in func.items())): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + class FrameApply(Apply): obj: DataFrame From 592368b6b228abd2f6bd8d0120edf38609addd3f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 16:52:03 -0500 Subject: [PATCH 2/6] Add tests --- pandas/tests/apply/test_invalid_arg.py | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 pandas/tests/apply/test_invalid_arg.py diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py new file mode 100644 index 0000000000000..1755b964fdfeb --- /dev/null +++ b/pandas/tests/apply/test_invalid_arg.py @@ -0,0 +1,31 @@ +# Tests specifically aimed at detecting bad arguments. +import re + +import pytest + +from pandas import ( + DataFrame, + Series, +) +from pandas.core.base import SpecificationError + + +@pytest.mark.parametrize("box", [DataFrame, Series]) +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}]) +def test_nested_renamer(box, method, func): + # GH 35964 + obj = box({"A": [1]}) + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + getattr(obj, method)(func) + + +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}]) +def test_missing_column(method, func): + # GH ??? + obj = DataFrame({"A": [1]}) + match = re.escape("Column(s) ['B'] do not exist") + with pytest.raises(SpecificationError, match=match): + getattr(obj, method)(func) From 0d9055d9055bf2605ba464c0ec68a957244cb009 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 16:59:22 -0500 Subject: [PATCH 3/6] Add PR # to test --- pandas/tests/apply/test_invalid_arg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 1755b964fdfeb..b86e834f4c08b 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -24,7 +24,7 @@ def test_nested_renamer(box, method, func): @pytest.mark.parametrize("method", ["apply", "agg", "transform"]) @pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}]) def test_missing_column(method, func): - # GH ??? + # GH 40004 obj = DataFrame({"A": [1]}) match = re.escape("Column(s) ['B'] do not exist") with pytest.raises(SpecificationError, match=match): From 758af83ecf3ad5d0214ccc86e33b54f0ca91e6f2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 17:23:26 -0500 Subject: [PATCH 4/6] Switch priority of checks, adjust test error messages outside of apply --- pandas/core/apply.py | 14 +++++++------- pandas/tests/groupby/aggregate/test_aggregate.py | 6 ++++-- pandas/tests/resample/test_resample_api.py | 9 +++++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9e9d80ef52f27..2276cb78aafc8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -535,13 +535,6 @@ def validate_dictlike_arg( """ assert how in ("apply", "agg", "transform") - if obj.ndim != 1: - # Check for missing columns on a frame - cols = set(func.keys()) - set(obj.columns) - if len(cols) > 0: - cols_sorted = list(safe_sort(list(cols))) - raise SpecificationError(f"Column(s) {cols_sorted} do not exist") - # Can't use func.values(); wouldn't work for a Series if ( how == "agg" @@ -551,6 +544,13 @@ def validate_dictlike_arg( # GH 15931 - deprecation of renaming keys raise SpecificationError("nested renamer is not supported") + if obj.ndim != 1: + # Check for missing columns on a frame + cols = set(func.keys()) - set(obj.columns) + if len(cols) > 0: + cols_sorted = list(safe_sort(list(cols))) + raise SpecificationError(f"Column(s) {cols_sorted} do not exist") + class FrameApply(Apply): obj: DataFrame diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 92b7aefa6dd8c..e7e47c5b5f911 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -4,6 +4,7 @@ import datetime import functools from functools import partial +import re import numpy as np import pytest @@ -683,7 +684,8 @@ def test_agg_relabel_other_raises(self): def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) - with pytest.raises(KeyError, match="Column 'C' does not exist"): + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(SpecificationError, match=match): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): @@ -760,7 +762,7 @@ def test_agg_relabel_multiindex_raises_not_exist(): ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - with pytest.raises(KeyError, match="does not exist"): + with pytest.raises(SpecificationError, match="do not exist"): df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 48c068be843a9..e976edec8f6d7 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -9,6 +9,7 @@ Series, ) import pandas._testing as tm +from pandas.core.base import SpecificationError from pandas.core.indexes.datetimes import date_range dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") @@ -475,9 +476,9 @@ def test_agg_misc(): # errors # invalid names in the agg specification - msg = "\"Column 'B' does not exist!\"" + msg = r"Column\(s\) \['B'\] do not exist" for t in cases: - with pytest.raises(KeyError, match=msg): + with pytest.raises(SpecificationError, match=msg): t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @@ -526,8 +527,8 @@ def test_try_aggregate_non_existing_column(): df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column - msg = "\"Column 'z' does not exist!\"" - with pytest.raises(KeyError, match=msg): + msg = r"Column\(s\) \['z'\] do not exist" + with pytest.raises(SpecificationError, match=msg): df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) From 713534c3e63cf8809b4db688180d0943faef0fb4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 24 Feb 2021 07:59:38 -0500 Subject: [PATCH 5/6] Added assert --- pandas/core/apply.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2276cb78aafc8..56974dd2953cd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -270,6 +270,9 @@ def transform_dict_like(self, func): args = self.args kwargs = self.kwargs + # transform is currently only for Series/DataFrame + assert isinstance(obj, ABCNDFrame) + if len(func) == 0: raise ValueError("No transform functions were provided") From f31011193077d5d26062e6d865246200170e8dfc Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 24 Feb 2021 14:12:21 -0500 Subject: [PATCH 6/6] SpecificationError -> KeyError, whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 ++ pandas/core/apply.py | 2 +- pandas/tests/apply/test_frame_transform.py | 4 ++-- pandas/tests/apply/test_invalid_arg.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- pandas/tests/resample/test_resample_api.py | 11 +++++------ 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c65502898195a..4326808b67bb4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -323,6 +323,7 @@ Numeric - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) +- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Conversion @@ -439,6 +440,7 @@ Groupby/resample/rolling - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`) +- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`) - Reshaping diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 56974dd2953cd..e21f6dd815765 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -552,7 +552,7 @@ def validate_dictlike_arg( cols = set(func.keys()) - set(obj.columns) if len(cols) > 0: cols_sorted = list(safe_sort(list(cols))) - raise SpecificationError(f"Column(s) {cols_sorted} do not exist") + raise KeyError(f"Column(s) {cols_sorted} do not exist") class FrameApply(Apply): diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 7718ec5215499..1888ddd8ec4aa 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -260,7 +260,7 @@ def test_transform_missing_columns(axis): # GH#35964 df = DataFrame({"A": [1, 2], "B": [3, 4]}) match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(SpecificationError, match=match): + with pytest.raises(KeyError, match=match): df.transform({"C": "cumsum"}) @@ -276,7 +276,7 @@ def test_transform_mixed_column_name_dtypes(): # GH39025 df = DataFrame({"a": ["1"]}) msg = r"Column\(s\) \[1, 'b'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): df.transform({"a": int, 1: str, "b": int}) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b86e834f4c08b..c67259d3c8194 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -27,5 +27,5 @@ def test_missing_column(method, func): # GH 40004 obj = DataFrame({"A": [1]}) match = re.escape("Column(s) ['B'] do not exist") - with pytest.raises(SpecificationError, match=match): + with pytest.raises(KeyError, match=match): getattr(obj, method)(func) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e7e47c5b5f911..093da45faba11 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -685,7 +685,7 @@ def test_agg_relabel_other_raises(self): def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(SpecificationError, match=match): + with pytest.raises(KeyError, match=match): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): @@ -762,7 +762,7 @@ def test_agg_relabel_multiindex_raises_not_exist(): ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - with pytest.raises(SpecificationError, match="do not exist"): + with pytest.raises(KeyError, match="do not exist"): df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index f945f898603ac..c566c45b582d7 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -210,7 +210,7 @@ def test_aggregate_api_consistency(): expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) msg = r"Column\(s\) \['r', 'r2'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) @@ -225,7 +225,7 @@ def test_agg_dict_renaming_deprecation(): ) msg = r"Column\(s\) \['ma'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) msg = r"nested renamer is not supported" diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index e976edec8f6d7..219e407c3e999 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -9,7 +9,6 @@ Series, ) import pandas._testing as tm -from pandas.core.base import SpecificationError from pandas.core.indexes.datetimes import date_range dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") @@ -297,7 +296,7 @@ def test_agg_consistency(): r = df.resample("3T") msg = r"Column\(s\) \['r1', 'r2'\] do not exist" - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -312,7 +311,7 @@ def test_agg_consistency_int_str_column_mix(): r = df.resample("3T") msg = r"Column\(s\) \[2, 'b'\] do not exist" - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): r.agg({2: "mean", "b": "sum"}) @@ -445,7 +444,7 @@ def test_agg_misc(): msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in cases: - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean}) # agg with different hows @@ -478,7 +477,7 @@ def test_agg_misc(): # invalid names in the agg specification msg = r"Column\(s\) \['B'\] do not exist" for t in cases: - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @@ -528,7 +527,7 @@ def test_try_aggregate_non_existing_column(): # Error as we don't have 'z' column msg = r"Column\(s\) \['z'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})