Skip to content

Commit 8bdd081

Browse files
authored
REF: Consolidate validation of dictionary argument in agg/transform (#40004)
1 parent b1363b4 commit 8bdd081

File tree

7 files changed

+79
-57
lines changed

doc/source/whatsnew/v1.3.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Numeric
325325
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
326326
- Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
327327
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
328+
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
328329
-
329330

330331
Conversion
@@ -443,6 +444,7 @@ Groupby/resample/rolling
443444
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
444445
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
445446
- Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
447+
- Bug in :meth:`DataFrameGroupBy.aggregate` and :meth:`.Resampler.aggregate` would sometimes raise ``SpecificationError`` when passed a dictionary and columns were missing; will now always raise a ``KeyError`` instead (:issue:`40004`)
446448
-
447449

448450
Reshaping

pandas/core/apply.py

+33-46
Original file line numberDiff line numberDiff line change
@@ -274,20 +274,13 @@ def transform_dict_like(self, func):
274274
args = self.args
275275
kwargs = self.kwargs
276276

277+
# transform is currently only for Series/DataFrame
278+
assert isinstance(obj, ABCNDFrame)
279+
277280
if len(func) == 0:
278281
raise ValueError("No transform functions were provided")
279282

280-
if obj.ndim != 1:
281-
# Check for missing columns on a frame
282-
cols = set(func.keys()) - set(obj.columns)
283-
if len(cols) > 0:
284-
cols_sorted = list(safe_sort(list(cols)))
285-
raise SpecificationError(f"Column(s) {cols_sorted} do not exist")
286-
287-
# Can't use func.values(); wouldn't work for a Series
288-
if any(is_dict_like(v) for _, v in func.items()):
289-
# GH 15931 - deprecation of renaming keys
290-
raise SpecificationError("nested renamer is not supported")
283+
self.validate_dictlike_arg("transform", obj, func)
291284

292285
results: Dict[Hashable, FrameOrSeriesUnion] = {}
293286
for name, how in func.items():
@@ -438,6 +431,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
438431

439432
selected_obj = obj._selected_obj
440433

434+
self.validate_dictlike_arg("agg", selected_obj, arg)
435+
441436
# if we have a dict of any non-scalars
442437
# eg. {'A' : ['mean']}, normalize all to
443438
# be list-likes
@@ -449,43 +444,8 @@ def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
449444
new_arg[k] = [v]
450445
else:
451446
new_arg[k] = v
452-
453-
# the keys must be in the columns
454-
# for ndim=2, or renamers for ndim=1
455-
456-
# ok for now, but deprecated
457-
# {'A': { 'ra': 'mean' }}
458-
# {'A': { 'ra': ['mean'] }}
459-
# {'ra': ['mean']}
460-
461-
# not ok
462-
# {'ra' : { 'A' : 'mean' }}
463-
if isinstance(v, dict):
464-
raise SpecificationError("nested renamer is not supported")
465-
elif isinstance(selected_obj, ABCSeries):
466-
raise SpecificationError("nested renamer is not supported")
467-
elif (
468-
isinstance(selected_obj, ABCDataFrame)
469-
and k not in selected_obj.columns
470-
):
471-
raise KeyError(f"Column '{k}' does not exist!")
472-
473447
arg = new_arg
474448

475-
else:
476-
# deprecation of renaming keys
477-
# GH 15931
478-
keys = list(arg.keys())
479-
if isinstance(selected_obj, ABCDataFrame) and len(
480-
selected_obj.columns.intersection(keys)
481-
) != len(keys):
482-
cols = list(
483-
safe_sort(
484-
list(set(keys) - set(selected_obj.columns.intersection(keys))),
485-
)
486-
)
487-
raise SpecificationError(f"Column(s) {cols} do not exist")
488-
489449
from pandas.core.reshape.concat import concat
490450

491451
if selected_obj.ndim == 1:
@@ -580,6 +540,33 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
580540
return None
581541
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
582542

543+
def validate_dictlike_arg(
544+
self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
545+
) -> None:
546+
"""
547+
Raise if dict-like argument is invalid.
548+
549+
Ensures that necessary columns exist if obj is a DataFrame, and
550+
that a nested renamer is not passed.
551+
"""
552+
assert how in ("apply", "agg", "transform")
553+
554+
# Can't use func.values(); wouldn't work for a Series
555+
if (
556+
how == "agg"
557+
and isinstance(obj, ABCSeries)
558+
and any(is_list_like(v) for _, v in func.items())
559+
) or (any(is_dict_like(v) for _, v in func.items())):
560+
# GH 15931 - deprecation of renaming keys
561+
raise SpecificationError("nested renamer is not supported")
562+
563+
if obj.ndim != 1:
564+
# Check for missing columns on a frame
565+
cols = set(func.keys()) - set(obj.columns)
566+
if len(cols) > 0:
567+
cols_sorted = list(safe_sort(list(cols)))
568+
raise KeyError(f"Column(s) {cols_sorted} do not exist")
569+
583570

584571
class FrameApply(Apply):
585572
obj: DataFrame

pandas/tests/apply/test_frame_transform.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def test_transform_missing_columns(axis):
260260
# GH#35964
261261
df = DataFrame({"A": [1, 2], "B": [3, 4]})
262262
match = re.escape("Column(s) ['C'] do not exist")
263-
with pytest.raises(SpecificationError, match=match):
263+
with pytest.raises(KeyError, match=match):
264264
df.transform({"C": "cumsum"})
265265

266266

@@ -276,7 +276,7 @@ def test_transform_mixed_column_name_dtypes():
276276
# GH39025
277277
df = DataFrame({"a": ["1"]})
278278
msg = r"Column\(s\) \[1, 'b'\] do not exist"
279-
with pytest.raises(SpecificationError, match=msg):
279+
with pytest.raises(KeyError, match=msg):
280280
df.transform({"a": int, 1: str, "b": int})
281281

282282

pandas/tests/apply/test_invalid_arg.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Tests specifically aimed at detecting bad arguments.
2+
import re
3+
4+
import pytest
5+
6+
from pandas import (
7+
DataFrame,
8+
Series,
9+
)
10+
from pandas.core.base import SpecificationError
11+
12+
13+
@pytest.mark.parametrize("box", [DataFrame, Series])
14+
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
15+
@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
16+
def test_nested_renamer(box, method, func):
17+
# GH 35964
18+
obj = box({"A": [1]})
19+
match = "nested renamer is not supported"
20+
with pytest.raises(SpecificationError, match=match):
21+
getattr(obj, method)(func)
22+
23+
24+
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
25+
@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
26+
def test_missing_column(method, func):
27+
# GH 40004
28+
obj = DataFrame({"A": [1]})
29+
match = re.escape("Column(s) ['B'] do not exist")
30+
with pytest.raises(KeyError, match=match):
31+
getattr(obj, method)(func)

pandas/tests/groupby/aggregate/test_aggregate.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import datetime
55
import functools
66
from functools import partial
7+
import re
78

89
import numpy as np
910
import pytest
@@ -685,7 +686,8 @@ def test_agg_relabel_other_raises(self):
685686

686687
def test_missing_raises(self):
687688
df = DataFrame({"A": [0, 1], "B": [1, 2]})
688-
with pytest.raises(KeyError, match="Column 'C' does not exist"):
689+
match = re.escape("Column(s) ['C'] do not exist")
690+
with pytest.raises(KeyError, match=match):
689691
df.groupby("A").agg(c=("C", "sum"))
690692

691693
def test_agg_namedtuple(self):
@@ -762,7 +764,7 @@ def test_agg_relabel_multiindex_raises_not_exist():
762764
)
763765
df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
764766

765-
with pytest.raises(KeyError, match="does not exist"):
767+
with pytest.raises(KeyError, match="do not exist"):
766768
df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
767769

768770

pandas/tests/groupby/aggregate/test_other.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def test_aggregate_api_consistency():
210210
expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
211211

212212
msg = r"Column\(s\) \['r', 'r2'\] do not exist"
213-
with pytest.raises(SpecificationError, match=msg):
213+
with pytest.raises(KeyError, match=msg):
214214
grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
215215

216216

@@ -225,7 +225,7 @@ def test_agg_dict_renaming_deprecation():
225225
)
226226

227227
msg = r"Column\(s\) \['ma'\] do not exist"
228-
with pytest.raises(SpecificationError, match=msg):
228+
with pytest.raises(KeyError, match=msg):
229229
df.groupby("A")[["B", "C"]].agg({"ma": "max"})
230230

231231
msg = r"nested renamer is not supported"

pandas/tests/resample/test_resample_api.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ def test_agg_consistency():
296296
r = df.resample("3T")
297297

298298
msg = r"Column\(s\) \['r1', 'r2'\] do not exist"
299-
with pytest.raises(pd.core.base.SpecificationError, match=msg):
299+
with pytest.raises(KeyError, match=msg):
300300
r.agg({"r1": "mean", "r2": "sum"})
301301

302302

@@ -311,7 +311,7 @@ def test_agg_consistency_int_str_column_mix():
311311
r = df.resample("3T")
312312

313313
msg = r"Column\(s\) \[2, 'b'\] do not exist"
314-
with pytest.raises(pd.core.base.SpecificationError, match=msg):
314+
with pytest.raises(KeyError, match=msg):
315315
r.agg({2: "mean", "b": "sum"})
316316

317317

@@ -444,7 +444,7 @@ def test_agg_misc():
444444

445445
msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
446446
for t in cases:
447-
with pytest.raises(pd.core.base.SpecificationError, match=msg):
447+
with pytest.raises(KeyError, match=msg):
448448
t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})
449449

450450
# agg with different hows
@@ -475,7 +475,7 @@ def test_agg_misc():
475475

476476
# errors
477477
# invalid names in the agg specification
478-
msg = "\"Column 'B' does not exist!\""
478+
msg = r"Column\(s\) \['B'\] do not exist"
479479
for t in cases:
480480
with pytest.raises(KeyError, match=msg):
481481
t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
@@ -526,7 +526,7 @@ def test_try_aggregate_non_existing_column():
526526
df = DataFrame(data).set_index("dt")
527527

528528
# Error as we don't have 'z' column
529-
msg = "\"Column 'z' does not exist!\""
529+
msg = r"Column\(s\) \['z'\] do not exist"
530530
with pytest.raises(KeyError, match=msg):
531531
df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})
532532

0 commit comments

Comments
 (0)