Skip to content

REF: Consolidate validation of dictionary argument in agg/transform #40004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 25, 2021
79 changes: 33 additions & 46 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,20 +270,13 @@ def transform_dict_like(self, func):
args = self.args
kwargs = self.kwargs

# transform is currently only for Series/DataFrame
assert isinstance(obj, ABCNDFrame)

if len(func) == 0:
raise ValueError("No transform functions were provided")

if obj.ndim != 1:
# Check for missing columns on a frame
cols = set(func.keys()) - set(obj.columns)
if len(cols) > 0:
cols_sorted = list(safe_sort(list(cols)))
raise SpecificationError(f"Column(s) {cols_sorted} do not exist")

# Can't use func.values(); wouldn't work for a Series
if any(is_dict_like(v) for _, v in func.items()):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")
self.validate_dictlike_arg("transform", obj, func)

results: Dict[Hashable, FrameOrSeriesUnion] = {}
for name, how in func.items():
Expand Down Expand Up @@ -434,6 +427,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:

selected_obj = obj._selected_obj

self.validate_dictlike_arg("agg", selected_obj, arg)

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
Expand All @@ -445,43 +440,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
raise SpecificationError("nested renamer is not supported")
elif isinstance(selected_obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif (
isinstance(selected_obj, ABCDataFrame)
and k not in selected_obj.columns
):
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg

else:
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)
) != len(keys):
cols = list(
safe_sort(
list(set(keys) - set(selected_obj.columns.intersection(keys))),
)
)
raise SpecificationError(f"Column(s) {cols} do not exist")

from pandas.core.reshape.concat import concat

if selected_obj.ndim == 1:
Expand Down Expand Up @@ -567,6 +527,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
return None
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)

def validate_dictlike_arg(
self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
) -> None:
"""
Raise if dict-like argument is invalid.

Ensures that necessary columns exist if obj is a DataFrame, and
that a nested renamer is not passed.
"""
assert how in ("apply", "agg", "transform")

# Can't use func.values(); wouldn't work for a Series
if (
how == "agg"
and isinstance(obj, ABCSeries)
and any(is_list_like(v) for _, v in func.items())
) or (any(is_dict_like(v) for _, v in func.items())):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")

if obj.ndim != 1:
# Check for missing columns on a frame
cols = set(func.keys()) - set(obj.columns)
if len(cols) > 0:
cols_sorted = list(safe_sort(list(cols)))
raise SpecificationError(f"Column(s) {cols_sorted} do not exist")


class FrameApply(Apply):
obj: DataFrame
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Tests specifically aimed at detecting bad arguments.
import re

import pytest

from pandas import (
DataFrame,
Series,
)
from pandas.core.base import SpecificationError


@pytest.mark.parametrize("box", [DataFrame, Series])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
def test_nested_renamer(box, method, func):
# GH 35964
obj = box({"A": [1]})
match = "nested renamer is not supported"
with pytest.raises(SpecificationError, match=match):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this error is good

getattr(obj, method)(func)


@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
def test_missing_column(method, func):
# GH 40004
obj = DataFrame({"A": [1]})
match = re.escape("Column(s) ['B'] do not exist")
with pytest.raises(SpecificationError, match=match):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be ok here with a KeyError (which i think is the same as now)

getattr(obj, method)(func)
6 changes: 4 additions & 2 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import functools
from functools import partial
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -683,7 +684,8 @@ def test_agg_relabel_other_raises(self):

def test_missing_raises(self):
df = DataFrame({"A": [0, 1], "B": [1, 2]})
with pytest.raises(KeyError, match="Column 'C' does not exist"):
match = re.escape("Column(s) ['C'] do not exist")
with pytest.raises(SpecificationError, match=match):
df.groupby("A").agg(c=("C", "sum"))

def test_agg_namedtuple(self):
Expand Down Expand Up @@ -760,7 +762,7 @@ def test_agg_relabel_multiindex_raises_not_exist():
)
df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])

with pytest.raises(KeyError, match="does not exist"):
with pytest.raises(SpecificationError, match="do not exist"):
df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))


Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Series,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
from pandas.core.indexes.datetimes import date_range

dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min")
Expand Down Expand Up @@ -475,9 +476,9 @@ def test_agg_misc():

# errors
# invalid names in the agg specification
msg = "\"Column 'B' does not exist!\""
msg = r"Column\(s\) \['B'\] do not exist"
for t in cases:
with pytest.raises(KeyError, match=msg):
with pytest.raises(SpecificationError, match=msg):
t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})


Expand Down Expand Up @@ -526,8 +527,8 @@ def test_try_aggregate_non_existing_column():
df = DataFrame(data).set_index("dt")

# Error as we don't have 'z' column
msg = "\"Column 'z' does not exist!\""
with pytest.raises(KeyError, match=msg):
msg = r"Column\(s\) \['z'\] do not exist"
with pytest.raises(SpecificationError, match=msg):
df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})


Expand Down