REF: Consolidate validation of dictionary argument in agg/transform (#40004)

rhshadrach · web-flow · commit 8bdd08136a9e · 2021-02-24T19:42:26.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -325,6 +325,7 @@ Numeric
 - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
 - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
+- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
 -
 
 Conversion
@@ -443,6 +444,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
 - Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
 - Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
+- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`)
 -
 
 Reshaping
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -274,20 +274,13 @@ def transform_dict_like(self, func):
         args = self.args
         kwargs = self.kwargs
 
+        # transform is currently only for Series/DataFrame
+        assert isinstance(obj, ABCNDFrame)
+
         if len(func) == 0:
             raise ValueError("No transform functions were provided")
 
-        if obj.ndim != 1:
-            # Check for missing columns on a frame
-            cols = set(func.keys()) - set(obj.columns)
-            if len(cols) > 0:
-                cols_sorted = list(safe_sort(list(cols)))
-                raise SpecificationError(f"Column(s) {cols_sorted} do not exist")
-
-        # Can't use func.values(); wouldn't work for a Series
-        if any(is_dict_like(v) for _, v in func.items()):
-            # GH 15931 - deprecation of renaming keys
-            raise SpecificationError("nested renamer is not supported")
+        self.validate_dictlike_arg("transform", obj, func)
 
         results: Dict[Hashable, FrameOrSeriesUnion] = {}
         for name, how in func.items():
@@ -438,6 +431,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
 
         selected_obj = obj._selected_obj
 
+        self.validate_dictlike_arg("agg", selected_obj, arg)
+
         # if we have a dict of any non-scalars
         # eg. {'A' : ['mean']}, normalize all to
         # be list-likes
@@ -449,43 +444,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
                     new_arg[k] = [v]
                 else:
                     new_arg[k] = v
-
-                # the keys must be in the columns
-                # for ndim=2, or renamers for ndim=1
-
-                # ok for now, but deprecated
-                # {'A': { 'ra': 'mean' }}
-                # {'A': { 'ra': ['mean'] }}
-                # {'ra': ['mean']}
-
-                # not ok
-                # {'ra' : { 'A' : 'mean' }}
-                if isinstance(v, dict):
-                    raise SpecificationError("nested renamer is not supported")
-                elif isinstance(selected_obj, ABCSeries):
-                    raise SpecificationError("nested renamer is not supported")
-                elif (
-                    isinstance(selected_obj, ABCDataFrame)
-                    and k not in selected_obj.columns
-                ):
-                    raise KeyError(f"Column '{k}' does not exist!")
-
             arg = new_arg
 
-        else:
-            # deprecation of renaming keys
-            # GH 15931
-            keys = list(arg.keys())
-            if isinstance(selected_obj, ABCDataFrame) and len(
-                selected_obj.columns.intersection(keys)
-            ) != len(keys):
-                cols = list(
-                    safe_sort(
-                        list(set(keys) - set(selected_obj.columns.intersection(keys))),
-                    )
-                )
-                raise SpecificationError(f"Column(s) {cols} do not exist")
-
         from pandas.core.reshape.concat import concat
 
         if selected_obj.ndim == 1:
@@ -580,6 +540,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
             return None
         return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
 
+    def validate_dictlike_arg(
+        self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
+    ) -> None:
+        """
+        Raise if dict-like argument is invalid.
+
+        Ensures that necessary columns exist if obj is a DataFrame, and
+        that a nested renamer is not passed.
+        """
+        assert how in ("apply", "agg", "transform")
+
+        # Can't use func.values(); wouldn't work for a Series
+        if (
+            how == "agg"
+            and isinstance(obj, ABCSeries)
+            and any(is_list_like(v) for _, v in func.items())
+        ) or (any(is_dict_like(v) for _, v in func.items())):
+            # GH 15931 - deprecation of renaming keys
+            raise SpecificationError("nested renamer is not supported")
+
+        if obj.ndim != 1:
+            # Check for missing columns on a frame
+            cols = set(func.keys()) - set(obj.columns)
+            if len(cols) > 0:
+                cols_sorted = list(safe_sort(list(cols)))
+                raise KeyError(f"Column(s) {cols_sorted} do not exist")
+
 
 class FrameApply(Apply):
     obj: DataFrame
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
@@ -260,7 +260,7 @@ def test_transform_missing_columns(axis):
     # GH#35964
     df = DataFrame({"A": [1, 2], "B": [3, 4]})
     match = re.escape("Column(s) ['C'] do not exist")
-    with pytest.raises(SpecificationError, match=match):
+    with pytest.raises(KeyError, match=match):
         df.transform({"C": "cumsum"})
 
 
@@ -276,7 +276,7 @@ def test_transform_mixed_column_name_dtypes():
     # GH39025
     df = DataFrame({"a": ["1"]})
     msg = r"Column\(s\) \[1, 'b'\] do not exist"
-    with pytest.raises(SpecificationError, match=msg):
+    with pytest.raises(KeyError, match=msg):
         df.transform({"a": int, 1: str, "b": int})
 
 
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
@@ -0,0 +1,31 @@
+# Tests specifically aimed at detecting bad arguments.
+import re
+
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+from pandas.core.base import SpecificationError
+
+
+@pytest.mark.parametrize("box", [DataFrame, Series])
+@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
+@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
+def test_nested_renamer(box, method, func):
+    # GH 35964
+    obj = box({"A": [1]})
+    match = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=match):
+        getattr(obj, method)(func)
+
+
+@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
+@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
+def test_missing_column(method, func):
+    # GH 40004
+    obj = DataFrame({"A": [1]})
+    match = re.escape("Column(s) ['B'] do not exist")
+    with pytest.raises(KeyError, match=match):
+        getattr(obj, method)(func)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -4,6 +4,7 @@
 import datetime
 import functools
 from functools import partial
+import re
 
 import numpy as np
 import pytest
@@ -685,7 +686,8 @@ def test_agg_relabel_other_raises(self):
 
     def test_missing_raises(self):
         df = DataFrame({"A": [0, 1], "B": [1, 2]})
-        with pytest.raises(KeyError, match="Column 'C' does not exist"):
+        match = re.escape("Column(s) ['C'] do not exist")
+        with pytest.raises(KeyError, match=match):
             df.groupby("A").agg(c=("C", "sum"))
 
     def test_agg_namedtuple(self):
@@ -762,7 +764,7 @@ def test_agg_relabel_multiindex_raises_not_exist():
     )
     df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
 
-    with pytest.raises(KeyError, match="does not exist"):
+    with pytest.raises(KeyError, match="do not exist"):
         df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
 
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -210,7 +210,7 @@ def test_aggregate_api_consistency():
     expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
 
     msg = r"Column\(s\) \['r', 'r2'\] do not exist"
-    with pytest.raises(SpecificationError, match=msg):
+    with pytest.raises(KeyError, match=msg):
         grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
 
 
@@ -225,7 +225,7 @@ def test_agg_dict_renaming_deprecation():
         )
 
     msg = r"Column\(s\) \['ma'\] do not exist"
-    with pytest.raises(SpecificationError, match=msg):
+    with pytest.raises(KeyError, match=msg):
         df.groupby("A")[["B", "C"]].agg({"ma": "max"})
 
     msg = r"nested renamer is not supported"
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
@@ -296,7 +296,7 @@ def test_agg_consistency():
     r = df.resample("3T")
 
     msg = r"Column\(s\) \['r1', 'r2'\] do not exist"
-    with pytest.raises(pd.core.base.SpecificationError, match=msg):
+    with pytest.raises(KeyError, match=msg):
         r.agg({"r1": "mean", "r2": "sum"})
 
 
@@ -311,7 +311,7 @@ def test_agg_consistency_int_str_column_mix():
     r = df.resample("3T")
 
     msg = r"Column\(s\) \[2, 'b'\] do not exist"
-    with pytest.raises(pd.core.base.SpecificationError, match=msg):
+    with pytest.raises(KeyError, match=msg):
         r.agg({2: "mean", "b": "sum"})
 
 
@@ -444,7 +444,7 @@ def test_agg_misc():
 
     msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
     for t in cases:
-        with pytest.raises(pd.core.base.SpecificationError, match=msg):
+        with pytest.raises(KeyError, match=msg):
             t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})
 
     # agg with different hows
@@ -475,7 +475,7 @@ def test_agg_misc():
 
     # errors
     # invalid names in the agg specification
-    msg = "\"Column 'B' does not exist!\""
+    msg = r"Column\(s\) \['B'\] do not exist"
     for t in cases:
         with pytest.raises(KeyError, match=msg):
             t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
@@ -526,7 +526,7 @@ def test_try_aggregate_non_existing_column():
     df = DataFrame(data).set_index("dt")
 
     # Error as we don't have 'z' column
-    msg = "\"Column 'z' does not exist!\""
+    msg = r"Column\(s\) \['z'\] do not exist"
     with pytest.raises(KeyError, match=msg):
         df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})
 

Original file line number	Diff line number	Diff line change
`@@ -325,6 +325,7 @@ Numeric`
`325`	`325`	- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
`326`	`326`	- Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
`327`	`327`	- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
	`328`	+- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
`328`	`329`	`-`
`329`	`330`
`330`	`331`	`Conversion`
`@@ -443,6 +444,7 @@ Groupby/resample/rolling`
`443`	`444`	- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
`444`	`445`	- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
`445`	`446`	- Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
	`447`	+- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`)
`446`	`448`	`-`
`447`	`449`
`448`	`450`	`Reshaping`