Skip to content

REF: Consolidate validation of dictionary argument in agg/transform #40004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 25, 2021
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ Numeric
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
- Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
-

Conversion
Expand Down Expand Up @@ -443,6 +444,7 @@ Groupby/resample/rolling
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
- Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`)
-

Reshaping
Expand Down
79 changes: 33 additions & 46 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,20 +274,13 @@ def transform_dict_like(self, func):
args = self.args
kwargs = self.kwargs

# transform is currently only for Series/DataFrame
assert isinstance(obj, ABCNDFrame)

if len(func) == 0:
raise ValueError("No transform functions were provided")

if obj.ndim != 1:
# Check for missing columns on a frame
cols = set(func.keys()) - set(obj.columns)
if len(cols) > 0:
cols_sorted = list(safe_sort(list(cols)))
raise SpecificationError(f"Column(s) {cols_sorted} do not exist")

# Can't use func.values(); wouldn't work for a Series
if any(is_dict_like(v) for _, v in func.items()):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")
self.validate_dictlike_arg("transform", obj, func)

results: Dict[Hashable, FrameOrSeriesUnion] = {}
for name, how in func.items():
Expand Down Expand Up @@ -438,6 +431,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:

selected_obj = obj._selected_obj

self.validate_dictlike_arg("agg", selected_obj, arg)

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
Expand All @@ -449,43 +444,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
raise SpecificationError("nested renamer is not supported")
elif isinstance(selected_obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif (
isinstance(selected_obj, ABCDataFrame)
and k not in selected_obj.columns
):
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg

else:
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)
) != len(keys):
cols = list(
safe_sort(
list(set(keys) - set(selected_obj.columns.intersection(keys))),
)
)
raise SpecificationError(f"Column(s) {cols} do not exist")

from pandas.core.reshape.concat import concat

if selected_obj.ndim == 1:
Expand Down Expand Up @@ -580,6 +540,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
return None
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)

def validate_dictlike_arg(
self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
) -> None:
"""
Raise if dict-like argument is invalid.

Ensures that necessary columns exist if obj is a DataFrame, and
that a nested renamer is not passed.
"""
assert how in ("apply", "agg", "transform")

# Can't use func.values(); wouldn't work for a Series
if (
how == "agg"
and isinstance(obj, ABCSeries)
and any(is_list_like(v) for _, v in func.items())
) or (any(is_dict_like(v) for _, v in func.items())):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")

if obj.ndim != 1:
# Check for missing columns on a frame
cols = set(func.keys()) - set(obj.columns)
if len(cols) > 0:
cols_sorted = list(safe_sort(list(cols)))
raise KeyError(f"Column(s) {cols_sorted} do not exist")


class FrameApply(Apply):
obj: DataFrame
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/apply/test_frame_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def test_transform_missing_columns(axis):
# GH#35964
df = DataFrame({"A": [1, 2], "B": [3, 4]})
match = re.escape("Column(s) ['C'] do not exist")
with pytest.raises(SpecificationError, match=match):
with pytest.raises(KeyError, match=match):
df.transform({"C": "cumsum"})


Expand All @@ -276,7 +276,7 @@ def test_transform_mixed_column_name_dtypes():
# GH39025
df = DataFrame({"a": ["1"]})
msg = r"Column\(s\) \[1, 'b'\] do not exist"
with pytest.raises(SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
df.transform({"a": int, 1: str, "b": int})


Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Tests specifically aimed at detecting bad arguments.
import re

import pytest

from pandas import (
DataFrame,
Series,
)
from pandas.core.base import SpecificationError


@pytest.mark.parametrize("box", [DataFrame, Series])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
def test_nested_renamer(box, method, func):
# GH 35964
obj = box({"A": [1]})
match = "nested renamer is not supported"
with pytest.raises(SpecificationError, match=match):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this error is good

getattr(obj, method)(func)


@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
def test_missing_column(method, func):
# GH 40004
obj = DataFrame({"A": [1]})
match = re.escape("Column(s) ['B'] do not exist")
with pytest.raises(KeyError, match=match):
getattr(obj, method)(func)
6 changes: 4 additions & 2 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import functools
from functools import partial
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -685,7 +686,8 @@ def test_agg_relabel_other_raises(self):

def test_missing_raises(self):
df = DataFrame({"A": [0, 1], "B": [1, 2]})
with pytest.raises(KeyError, match="Column 'C' does not exist"):
match = re.escape("Column(s) ['C'] do not exist")
with pytest.raises(KeyError, match=match):
df.groupby("A").agg(c=("C", "sum"))

def test_agg_namedtuple(self):
Expand Down Expand Up @@ -762,7 +764,7 @@ def test_agg_relabel_multiindex_raises_not_exist():
)
df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])

with pytest.raises(KeyError, match="does not exist"):
with pytest.raises(KeyError, match="do not exist"):
df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def test_aggregate_api_consistency():
expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])

msg = r"Column\(s\) \['r', 'r2'\] do not exist"
with pytest.raises(SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})


Expand All @@ -225,7 +225,7 @@ def test_agg_dict_renaming_deprecation():
)

msg = r"Column\(s\) \['ma'\] do not exist"
with pytest.raises(SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
df.groupby("A")[["B", "C"]].agg({"ma": "max"})

msg = r"nested renamer is not supported"
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def test_agg_consistency():
r = df.resample("3T")

msg = r"Column\(s\) \['r1', 'r2'\] do not exist"
with pytest.raises(pd.core.base.SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
r.agg({"r1": "mean", "r2": "sum"})


Expand All @@ -311,7 +311,7 @@ def test_agg_consistency_int_str_column_mix():
r = df.resample("3T")

msg = r"Column\(s\) \[2, 'b'\] do not exist"
with pytest.raises(pd.core.base.SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
r.agg({2: "mean", "b": "sum"})


Expand Down Expand Up @@ -444,7 +444,7 @@ def test_agg_misc():

msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
for t in cases:
with pytest.raises(pd.core.base.SpecificationError, match=msg):
with pytest.raises(KeyError, match=msg):
t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})

# agg with different hows
Expand Down Expand Up @@ -475,7 +475,7 @@ def test_agg_misc():

# errors
# invalid names in the agg specification
msg = "\"Column 'B' does not exist!\""
msg = r"Column\(s\) \['B'\] do not exist"
for t in cases:
with pytest.raises(KeyError, match=msg):
t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
Expand Down Expand Up @@ -526,7 +526,7 @@ def test_try_aggregate_non_existing_column():
df = DataFrame(data).set_index("dt")

# Error as we don't have 'z' column
msg = "\"Column 'z' does not exist!\""
msg = r"Column\(s\) \['z'\] do not exist"
with pytest.raises(KeyError, match=msg):
df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})

Expand Down