Skip to content

Commit a1cb2b1

Browse files
committed
TST: update _groupby_op to raise
1 parent 75b22fa commit a1cb2b1

File tree

5 files changed

+84
-41
lines changed

5 files changed

+84
-41
lines changed

pandas/core/arrays/arrow/array.py

+13
Original file line number | Diff line number | Diff line change
@@ -2244,6 +2244,19 @@ def _groupby_op(
22442244
**kwargs,
22452245
):
22462246
if isinstance(self.dtype, StringDtype):
2247+
if how in [
2248+
"sum",
2249+
"prod",
2250+
"mean",
2251+
"median",
2252+
"cumsum",
2253+
"cumprod",
2254+
"std",
2255+
"sem",
2256+
"var",
2257+
"skew",
2258+
]:
2259+
raise TypeError(f"{self.dtype} dtype does not support {how} operations")
22472260
return super()._groupby_op(
22482261
how=how,
22492262
has_dropped_na=has_dropped_na,

pandas/core/arrays/base.py

+15
Original file line number | Diff line number | Diff line change
@@ -2410,6 +2410,21 @@ def _groupby_op(
24102410
# GH#43682
24112411
if isinstance(self.dtype, StringDtype):
24122412
# StringArray
2413+
if op.how in [
2414+
"sum",
2415+
"prod",
2416+
"mean",
2417+
"median",
2418+
"cumsum",
2419+
"cumprod",
2420+
"std",
2421+
"sem",
2422+
"var",
2423+
"skew",
2424+
]:
2425+
raise TypeError(
2426+
f"{self.dtype} dtype does not support {op.how} operations"
2427+
)
24132428
if op.how not in ["any", "all"]:
24142429
# Fail early to avoid conversion to object
24152430
op._get_cython_function(op.kind, op.how, np.dtype(object), False)

pandas/core/groupby/groupby.py

+4
Original file line number | Diff line number | Diff line change
@@ -4290,6 +4290,10 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
42904290
raise TypeError(
42914291
"'quantile' cannot be performed against 'object' dtypes!"
42924292
)
4293+
elif isinstance(vals.dtype, StringDtype):
4294+
raise TypeError(
4295+
f"{vals.dtype} dtype does not support quantile operations"
4296+
)
42934297

42944298
inference: DtypeObj | None = None
42954299
if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):

pandas/tests/groupby/test_groupby.py

+11 -3
Original file line number | Diff line number | Diff line change
@@ -702,15 +702,20 @@ def test_keep_nuisance_agg(df, agg_function):
702702
["sum", "mean", "prod", "std", "var", "sem", "median"],
703703
)
704704
@pytest.mark.parametrize("numeric_only", [True, False])
705-
def test_omit_nuisance_agg(df, agg_function, numeric_only):
705+
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
706706
# GH 38774, GH 38815
707707
grouped = df.groupby("A")
708708

709709
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
710+
if using_infer_string:
711+
no_drop_nuisance += ("sum",)
710712
if agg_function in no_drop_nuisance and not numeric_only:
711713
# Added numeric_only as part of GH#46560; these do not drop nuisance
712714
# columns when numeric_only is False
713-
if agg_function in ("std", "sem"):
715+
if using_infer_string:
716+
msg = f"str dtype does not support {agg_function} operations"
717+
klass = TypeError
718+
elif agg_function in ("std", "sem"):
714719
klass = ValueError
715720
msg = "could not convert string to float: 'one'"
716721
else:
@@ -1772,6 +1777,7 @@ def get_categorical_invalid_expected():
17721777
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
17731778
is_dt64 = df.dtypes.iloc[0].kind == "M"
17741779
is_cat = isinstance(values, Categorical)
1780+
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
17751781

17761782
if (
17771783
isinstance(values, Categorical)
@@ -1796,13 +1802,15 @@ def get_categorical_invalid_expected():
17961802

17971803
if op in ["prod", "sum", "skew"]:
17981804
# ops that require more than just ordered-ness
1799-
if is_dt64 or is_cat or is_per:
1805+
if is_dt64 or is_cat or is_per or is_str:
18001806
# GH#41291
18011807
# datetime64 -> prod and sum are invalid
18021808
if is_dt64:
18031809
msg = "datetime64 type does not support"
18041810
elif is_per:
18051811
msg = "Period type does not support"
1812+
elif is_str:
1813+
msg = "str dtype does not support"
18061814
else:
18071815
msg = "category type does not support"
18081816
if op == "skew":

pandas/tests/groupby/test_raises.py

+41 -38
Original file line number | Diff line number | Diff line change
@@ -181,42 +181,46 @@ def test_groupby_raises_string(
181181
}[groupby_func]
182182

183183
if using_infer_string:
184-
if klass is not None:
185-
if re.escape("agg function failed") in msg:
186-
msg = msg.replace("object", "string")
187-
elif groupby_func in [
188-
"cumsum",
189-
"cumprod",
190-
"cummin",
191-
"cummax",
192-
"std",
193-
"sem",
194-
"skew",
195-
]:
196-
msg = msg.replace("object", "string")
197-
elif groupby_func == "quantile":
198-
msg = "No matching signature found"
199-
elif groupby_func == "corrwith":
200-
msg = (
201-
"'ArrowStringArrayNumpySemantics' with dtype string does "
202-
"not support operation 'mean'"
203-
)
204-
else:
205-
import pyarrow as pa
206-
207-
klass = pa.lib.ArrowNotImplementedError
208-
if groupby_func == "pct_change":
209-
msg = "Function 'divide' has no kernel matching input types"
210-
elif groupby_func == "diff":
211-
msg = (
212-
"Function 'subtract_checked' has no kernel matching "
213-
"input types"
214-
)
215-
else:
216-
msg = (
217-
f"Function '{groupby_func}' has no kernel matching "
218-
"input types"
219-
)
184+
if groupby_func in [
185+
"sum",
186+
"prod",
187+
"mean",
188+
"median",
189+
"cumsum",
190+
"cumprod",
191+
"std",
192+
"sem",
193+
"var",
194+
"skew",
195+
"quantile",
196+
]:
197+
msg = f"str dtype does not support {groupby_func} operations"
198+
if groupby_func == "sum":
199+
# The object-dtype allows this, StringArray variants do not.
200+
klass = TypeError
201+
elif groupby_func in ["sem", "std", "skew"]:
202+
# The object-dtype raises ValueError when trying to convert to numeric.
203+
klass = TypeError
204+
elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
205+
# This doesn't go through EA._groupby_op so the message isn't controlled
206+
# there.
207+
import pyarrow as pa
208+
209+
klass = pa.lib.ArrowNotImplementedError
210+
msg = "Function 'divide' has no kernel matching input types"
211+
elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
212+
# This doesn't go through EA._groupby_op so the message isn't controlled
213+
# there.
214+
import pyarrow as pa
215+
216+
klass = pa.lib.ArrowNotImplementedError
217+
msg = "Function 'subtract_checked' has no kernel matching input types"
218+
elif groupby_func in ["cummin", "cummax"]:
219+
msg = msg.replace("object", "str")
220+
elif groupby_func == "corrwith":
221+
msg = (
222+
"'.*NumpySemantics' with dtype str does " "not support operation 'mean'"
223+
)
220224

221225
if groupby_func == "fillna":
222226
kind = "Series" if groupby_series else "DataFrame"
@@ -269,10 +273,9 @@ def test_groupby_raises_string_np(
269273
}[groupby_func_np]
270274

271275
if using_infer_string:
272-
# TODO: should ArrowStringArrayNumpySemantics support sum?
273276
klass = TypeError
274277
msg = (
275-
"'ArrowStringArrayNumpySemantics' with dtype string does not "
278+
"'.*StringArrayNumpySemantics' with dtype str does not "
276279
f"support operation '{groupby_func_np.__name__}'"
277280
)
278281

0 commit comments

Comments
 (0)