diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 09f235bde5f79..535bc5f3bd7bf 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -325,6 +325,7 @@ Numeric - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) +- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Conversion @@ -443,6 +444,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`) - Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`) - Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`) +- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`) - Reshaping diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c7fa298b06a2f..db4203e5158ef 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -274,20 +274,13 @@ def transform_dict_like(self, func): args = self.args kwargs = self.kwargs + # transform is currently only for Series/DataFrame + assert isinstance(obj, ABCNDFrame) + if len(func) == 0: raise ValueError("No transform functions were provided") - if obj.ndim != 1: - # Check for missing columns on a frame - cols = set(func.keys()) - set(obj.columns) - if len(cols) > 0: - cols_sorted = list(safe_sort(list(cols))) - raise SpecificationError(f"Column(s) {cols_sorted} do not exist") - - # Can't use func.values(); wouldn't work for a Series - if any(is_dict_like(v) for _, v in func.items()): - # GH 15931 - deprecation of renaming keys - raise SpecificationError("nested renamer is not supported") + self.validate_dictlike_arg("transform", obj, func) results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): @@ -438,6 +431,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: selected_obj = obj._selected_obj + self.validate_dictlike_arg("agg", selected_obj, arg) + # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes @@ -449,43 +444,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: new_arg[k] = [v] else: new_arg[k] = v - - # the keys must be in the columns - # for ndim=2, or renamers for ndim=1 - - # ok for now, but deprecated - # {'A': { 'ra': 'mean' }} - # {'A': { 'ra': ['mean'] }} - # {'ra': ['mean']} - - # not ok - # {'ra' : { 'A' : 'mean' }} - if isinstance(v, dict): - raise SpecificationError("nested renamer is not supported") - elif isinstance(selected_obj, ABCSeries): - raise SpecificationError("nested renamer is not supported") - elif ( - isinstance(selected_obj, ABCDataFrame) - and k not in selected_obj.columns - ): - raise KeyError(f"Column '{k}' does not exist!") - arg = new_arg - else: - # deprecation of renaming keys - # GH 15931 - keys = list(arg.keys()) - if isinstance(selected_obj, ABCDataFrame) and len( - selected_obj.columns.intersection(keys) - ) != len(keys): - cols = list( - safe_sort( - list(set(keys) - set(selected_obj.columns.intersection(keys))), - ) - ) - raise SpecificationError(f"Column(s) {cols} do not exist") - from pandas.core.reshape.concat import concat if selected_obj.ndim == 1: @@ -580,6 +540,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]: return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) + def validate_dictlike_arg( + self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict + ) -> None: + """ + Raise if dict-like argument is invalid. + + Ensures that necessary columns exist if obj is a DataFrame, and + that a nested renamer is not passed. + """ + assert how in ("apply", "agg", "transform") + + # Can't use func.values(); wouldn't work for a Series + if ( + how == "agg" + and isinstance(obj, ABCSeries) + and any(is_list_like(v) for _, v in func.items()) + ) or (any(is_dict_like(v) for _, v in func.items())): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + + if obj.ndim != 1: + # Check for missing columns on a frame + cols = set(func.keys()) - set(obj.columns) + if len(cols) > 0: + cols_sorted = list(safe_sort(list(cols))) + raise KeyError(f"Column(s) {cols_sorted} do not exist") + class FrameApply(Apply): obj: DataFrame diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 7718ec5215499..1888ddd8ec4aa 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -260,7 +260,7 @@ def test_transform_missing_columns(axis): # GH#35964 df = DataFrame({"A": [1, 2], "B": [3, 4]}) match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(SpecificationError, match=match): + with pytest.raises(KeyError, match=match): df.transform({"C": "cumsum"}) @@ -276,7 +276,7 @@ def test_transform_mixed_column_name_dtypes(): # GH39025 df = DataFrame({"a": ["1"]}) msg = r"Column\(s\) \[1, 'b'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): df.transform({"a": int, 1: str, "b": int}) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py new file mode 100644 index 0000000000000..c67259d3c8194 --- /dev/null +++ b/pandas/tests/apply/test_invalid_arg.py @@ -0,0 +1,31 @@ +# Tests specifically aimed at detecting bad arguments. +import re + +import pytest + +from pandas import ( + DataFrame, + Series, +) +from pandas.core.base import SpecificationError + + +@pytest.mark.parametrize("box", [DataFrame, Series]) +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}]) +def test_nested_renamer(box, method, func): + # GH 35964 + obj = box({"A": [1]}) + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + getattr(obj, method)(func) + + +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}]) +def test_missing_column(method, func): + # GH 40004 + obj = DataFrame({"A": [1]}) + match = re.escape("Column(s) ['B'] do not exist") + with pytest.raises(KeyError, match=match): + getattr(obj, method)(func) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index f2f9cfee178d9..276c0adfdb485 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -4,6 +4,7 @@ import datetime import functools from functools import partial +import re import numpy as np import pytest @@ -685,7 +686,8 @@ def test_agg_relabel_other_raises(self): def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) - with pytest.raises(KeyError, match="Column 'C' does not exist"): + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(KeyError, match=match): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): @@ -762,7 +764,7 @@ def test_agg_relabel_multiindex_raises_not_exist(): ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - with pytest.raises(KeyError, match="does not exist"): + with pytest.raises(KeyError, match="do not exist"): df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index f945f898603ac..c566c45b582d7 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -210,7 +210,7 @@ def test_aggregate_api_consistency(): expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) msg = r"Column\(s\) \['r', 'r2'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) @@ -225,7 +225,7 @@ def test_agg_dict_renaming_deprecation(): ) msg = r"Column\(s\) \['ma'\] do not exist" - with pytest.raises(SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) msg = r"nested renamer is not supported" diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 48c068be843a9..219e407c3e999 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -296,7 +296,7 @@ def test_agg_consistency(): r = df.resample("3T") msg = r"Column\(s\) \['r1', 'r2'\] do not exist" - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -311,7 +311,7 @@ def test_agg_consistency_int_str_column_mix(): r = df.resample("3T") msg = r"Column\(s\) \[2, 'b'\] do not exist" - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): r.agg({2: "mean", "b": "sum"}) @@ -444,7 +444,7 @@ def test_agg_misc(): msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in cases: - with pytest.raises(pd.core.base.SpecificationError, match=msg): + with pytest.raises(KeyError, match=msg): t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean}) # agg with different hows @@ -475,7 +475,7 @@ def test_agg_misc(): # errors # invalid names in the agg specification - msg = "\"Column 'B' does not exist!\"" + msg = r"Column\(s\) \['B'\] do not exist" for t in cases: with pytest.raises(KeyError, match=msg): t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @@ -526,7 +526,7 @@ def test_try_aggregate_non_existing_column(): df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column - msg = "\"Column 'z' does not exist!\"" + msg = r"Column\(s\) \['z'\] do not exist" with pytest.raises(KeyError, match=msg): df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})