Skip to content

Commit 4224a52

Browse files
committed
TST: update _groupby_op to raise
1 parent 31229c6 commit 4224a52

File tree

5 files changed

+84
-41
lines changed

5 files changed

+84
-41
lines changed

pandas/core/arrays/arrow/array.py

+13
Original file line number | Diff line number | Diff line change
@@ -2238,6 +2238,19 @@ def _groupby_op(
22382238
**kwargs,
22392239
):
22402240
if isinstance(self.dtype, StringDtype):
2241+
if how in [
2242+
"sum",
2243+
"prod",
2244+
"mean",
2245+
"median",
2246+
"cumsum",
2247+
"cumprod",
2248+
"std",
2249+
"sem",
2250+
"var",
2251+
"skew",
2252+
]:
2253+
raise TypeError(f"{self.dtype} dtype does not support {how} operations")
22412254
return super()._groupby_op(
22422255
how=how,
22432256
has_dropped_na=has_dropped_na,

pandas/core/arrays/base.py

+15
Original file line number | Diff line number | Diff line change
@@ -2389,6 +2389,21 @@ def _groupby_op(
23892389
# GH#43682
23902390
if isinstance(self.dtype, StringDtype):
23912391
# StringArray
2392+
if op.how in [
2393+
"sum",
2394+
"prod",
2395+
"mean",
2396+
"median",
2397+
"cumsum",
2398+
"cumprod",
2399+
"std",
2400+
"sem",
2401+
"var",
2402+
"skew",
2403+
]:
2404+
raise TypeError(
2405+
f"{self.dtype} dtype does not support {op.how} operations"
2406+
)
23922407
if op.how not in ["any", "all"]:
23932408
# Fail early to avoid conversion to object
23942409
op._get_cython_function(op.kind, op.how, np.dtype(object), False)

pandas/core/groupby/groupby.py

+4
Original file line number | Diff line number | Diff line change
@@ -4290,6 +4290,10 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
42904290
raise TypeError(
42914291
"'quantile' cannot be performed against 'object' dtypes!"
42924292
)
4293+
elif isinstance(vals.dtype, StringDtype):
4294+
raise TypeError(
4295+
f"{vals.dtype} dtype does not support quantile operations"
4296+
)
42934297

42944298
inference: DtypeObj | None = None
42954299
if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):

pandas/tests/groupby/test_groupby.py

+11 -3
Original file line number | Diff line number | Diff line change
@@ -702,15 +702,20 @@ def test_keep_nuisance_agg(df, agg_function):
702702
["sum", "mean", "prod", "std", "var", "sem", "median"],
703703
)
704704
@pytest.mark.parametrize("numeric_only", [True, False])
705-
def test_omit_nuisance_agg(df, agg_function, numeric_only):
705+
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
706706
# GH 38774, GH 38815
707707
grouped = df.groupby("A")
708708

709709
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
710+
if using_infer_string:
711+
no_drop_nuisance += ("sum",)
710712
if agg_function in no_drop_nuisance and not numeric_only:
711713
# Added numeric_only as part of GH#46560; these do not drop nuisance
712714
# columns when numeric_only is False
713-
if agg_function in ("std", "sem"):
715+
if using_infer_string:
716+
msg = f"str dtype does not support {agg_function} operations"
717+
klass = TypeError
718+
elif agg_function in ("std", "sem"):
714719
klass = ValueError
715720
msg = "could not convert string to float: 'one'"
716721
else:
@@ -1772,6 +1777,7 @@ def get_categorical_invalid_expected():
17721777
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
17731778
is_dt64 = df.dtypes.iloc[0].kind == "M"
17741779
is_cat = isinstance(values, Categorical)
1780+
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
17751781

17761782
if (
17771783
isinstance(values, Categorical)
@@ -1796,13 +1802,15 @@ def get_categorical_invalid_expected():
17961802

17971803
if op in ["prod", "sum", "skew"]:
17981804
# ops that require more than just ordered-ness
1799-
if is_dt64 or is_cat or is_per:
1805+
if is_dt64 or is_cat or is_per or is_str:
18001806
# GH#41291
18011807
# datetime64 -> prod and sum are invalid
18021808
if is_dt64:
18031809
msg = "datetime64 type does not support"
18041810
elif is_per:
18051811
msg = "Period type does not support"
1812+
elif is_str:
1813+
msg = "str dtype does not support"
18061814
else:
18071815
msg = "category type does not support"
18081816
if op == "skew":

pandas/tests/groupby/test_raises.py

+41 -38
Original file line number | Diff line number | Diff line change
@@ -181,42 +181,46 @@ def test_groupby_raises_string(
181181
}[groupby_func]
182182

183183
if using_infer_string:
184-
if klass is not None:
185-
if re.escape("agg function failed") in msg:
186-
msg = msg.replace("object", "string")
187-
elif groupby_func in [
188-
"cumsum",
189-
"cumprod",
190-
"cummin",
191-
"cummax",
192-
"std",
193-
"sem",
194-
"skew",
195-
]:
196-
msg = msg.replace("object", "string")
197-
elif groupby_func == "quantile":
198-
msg = "No matching signature found"
199-
elif groupby_func == "corrwith":
200-
msg = (
201-
"'ArrowStringArrayNumpySemantics' with dtype string does "
202-
"not support operation 'mean'"
203-
)
204-
else:
205-
import pyarrow as pa
206-
207-
klass = pa.lib.ArrowNotImplementedError
208-
if groupby_func == "pct_change":
209-
msg = "Function 'divide' has no kernel matching input types"
210-
elif groupby_func == "diff":
211-
msg = (
212-
"Function 'subtract_checked' has no kernel matching "
213-
"input types"
214-
)
215-
else:
216-
msg = (
217-
f"Function '{groupby_func}' has no kernel matching "
218-
"input types"
219-
)
184+
if groupby_func in [
185+
"sum",
186+
"prod",
187+
"mean",
188+
"median",
189+
"cumsum",
190+
"cumprod",
191+
"std",
192+
"sem",
193+
"var",
194+
"skew",
195+
"quantile",
196+
]:
197+
msg = f"str dtype does not support {groupby_func} operations"
198+
if groupby_func == "sum":
199+
# The object-dtype allows this, StringArray variants do not.
200+
klass = TypeError
201+
elif groupby_func in ["sem", "std", "skew"]:
202+
# The object-dtype raises ValueError when trying to convert to numeric.
203+
klass = TypeError
204+
elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
205+
# This doesn't go through EA._groupby_op so the message isn't controlled
206+
# there.
207+
import pyarrow as pa
208+
209+
klass = pa.lib.ArrowNotImplementedError
210+
msg = "Function 'divide' has no kernel matching input types"
211+
elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
212+
# This doesn't go through EA._groupby_op so the message isn't controlled
213+
# there.
214+
import pyarrow as pa
215+
216+
klass = pa.lib.ArrowNotImplementedError
217+
msg = "Function 'subtract_checked' has no kernel matching input types"
218+
elif groupby_func in ["cummin", "cummax"]:
219+
msg = msg.replace("object", "str")
220+
elif groupby_func == "corrwith":
221+
msg = (
222+
"'.*NumpySemantics' with dtype str does " "not support operation 'mean'"
223+
)
220224

221225
if groupby_func == "fillna":
222226
kind = "Series" if groupby_series else "DataFrame"
@@ -269,10 +273,9 @@ def test_groupby_raises_string_np(
269273
}[groupby_func_np]
270274

271275
if using_infer_string:
272-
# TODO: should ArrowStringArrayNumpySemantics support sum?
273276
klass = TypeError
274277
msg = (
275-
"'ArrowStringArrayNumpySemantics' with dtype string does not "
278+
"'.*StringArrayNumpySemantics' with dtype str does not "
276279
f"support operation '{groupby_func_np.__name__}'"
277280
)
278281

0 commit comments

Comments
 (0)