TST: update _groupby_op to raise

jbrockmendel · jbrockmendel · commit 4224a52f34c7 · 2024-08-13T14:20:55.000-07:00
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2238,6 +2238,19 @@ def _groupby_op(
         **kwargs,
     ):
         if isinstance(self.dtype, StringDtype):
+            if how in [
+                "sum",
+                "prod",
+                "mean",
+                "median",
+                "cumsum",
+                "cumprod",
+                "std",
+                "sem",
+                "var",
+                "skew",
+            ]:
+                raise TypeError(f"{self.dtype} dtype does not support {how} operations")
             return super()._groupby_op(
                 how=how,
                 has_dropped_na=has_dropped_na,
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -2389,6 +2389,21 @@ def _groupby_op(
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
+            if op.how in [
+                "sum",
+                "prod",
+                "mean",
+                "median",
+                "cumsum",
+                "cumprod",
+                "std",
+                "sem",
+                "var",
+                "skew",
+            ]:
+                raise TypeError(
+                    f"{self.dtype} dtype does not support {op.how} operations"
+                )
             if op.how not in ["any", "all"]:
                 # Fail early to avoid conversion to object
                 op._get_cython_function(op.kind, op.how, np.dtype(object), False)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -4290,6 +4290,10 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
                 raise TypeError(
                     "'quantile' cannot be performed against 'object' dtypes!"
                 )
+            elif isinstance(vals.dtype, StringDtype):
+                raise TypeError(
+                    f"{vals.dtype} dtype does not support quantile operations"
+                )
 
             inference: DtypeObj | None = None
             if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -702,15 +702,20 @@ def test_keep_nuisance_agg(df, agg_function):
     ["sum", "mean", "prod", "std", "var", "sem", "median"],
 )
 @pytest.mark.parametrize("numeric_only", [True, False])
-def test_omit_nuisance_agg(df, agg_function, numeric_only):
+def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
     # GH 38774, GH 38815
     grouped = df.groupby("A")
 
     no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
+    if using_infer_string:
+        no_drop_nuisance += ("sum",)
     if agg_function in no_drop_nuisance and not numeric_only:
         # Added numeric_only as part of GH#46560; these do not drop nuisance
         # columns when numeric_only is False
-        if agg_function in ("std", "sem"):
+        if using_infer_string:
+            msg = f"str dtype does not support {agg_function} operations"
+            klass = TypeError
+        elif agg_function in ("std", "sem"):
             klass = ValueError
             msg = "could not convert string to float: 'one'"
         else:
@@ -1772,6 +1777,7 @@ def get_categorical_invalid_expected():
     is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
     is_dt64 = df.dtypes.iloc[0].kind == "M"
     is_cat = isinstance(values, Categorical)
+    is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
 
     if (
         isinstance(values, Categorical)
@@ -1796,13 +1802,15 @@ def get_categorical_invalid_expected():
 
     if op in ["prod", "sum", "skew"]:
         # ops that require more than just ordered-ness
-        if is_dt64 or is_cat or is_per:
+        if is_dt64 or is_cat or is_per or is_str:
             # GH#41291
             # datetime64 -> prod and sum are invalid
             if is_dt64:
                 msg = "datetime64 type does not support"
             elif is_per:
                 msg = "Period type does not support"
+            elif is_str:
+                msg = "str dtype does not support"
             else:
                 msg = "category type does not support"
             if op == "skew":
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
@@ -181,42 +181,46 @@ def test_groupby_raises_string(
     }[groupby_func]
 
     if using_infer_string:
-        if klass is not None:
-            if re.escape("agg function failed") in msg:
-                msg = msg.replace("object", "string")
-            elif groupby_func in [
-                "cumsum",
-                "cumprod",
-                "cummin",
-                "cummax",
-                "std",
-                "sem",
-                "skew",
-            ]:
-                msg = msg.replace("object", "string")
-            elif groupby_func == "quantile":
-                msg = "No matching signature found"
-            elif groupby_func == "corrwith":
-                msg = (
-                    "'ArrowStringArrayNumpySemantics' with dtype string does "
-                    "not support operation 'mean'"
-                )
-            else:
-                import pyarrow as pa
-
-                klass = pa.lib.ArrowNotImplementedError
-                if groupby_func == "pct_change":
-                    msg = "Function 'divide' has no kernel matching input types"
-                elif groupby_func == "diff":
-                    msg = (
-                        "Function 'subtract_checked' has no kernel matching "
-                        "input types"
-                    )
-                else:
-                    msg = (
-                        f"Function '{groupby_func}' has no kernel matching "
-                        "input types"
-                    )
+        if groupby_func in [
+            "sum",
+            "prod",
+            "mean",
+            "median",
+            "cumsum",
+            "cumprod",
+            "std",
+            "sem",
+            "var",
+            "skew",
+            "quantile",
+        ]:
+            msg = f"str dtype does not support {groupby_func} operations"
+            if groupby_func == "sum":
+                # The object-dtype allows this, StringArray variants do not.
+                klass = TypeError
+            elif groupby_func in ["sem", "std", "skew"]:
+                # The object-dtype raises ValueError when trying to convert to numeric.
+                klass = TypeError
+        elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
+            # This doesn't go through EA._groupby_op so the message isn't controlled
+            #  there.
+            import pyarrow as pa
+
+            klass = pa.lib.ArrowNotImplementedError
+            msg = "Function 'divide' has no kernel matching input types"
+        elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
+            # This doesn't go through EA._groupby_op so the message isn't controlled
+            #  there.
+            import pyarrow as pa
+
+            klass = pa.lib.ArrowNotImplementedError
+            msg = "Function 'subtract_checked' has no kernel matching input types"
+        elif groupby_func in ["cummin", "cummax"]:
+            msg = msg.replace("object", "str")
+        elif groupby_func == "corrwith":
+            msg = (
+                "'.*NumpySemantics' with dtype str does " "not support operation 'mean'"
+            )
 
     if groupby_func == "fillna":
         kind = "Series" if groupby_series else "DataFrame"
@@ -269,10 +273,9 @@ def test_groupby_raises_string_np(
     }[groupby_func_np]
 
     if using_infer_string:
-        # TODO: should ArrowStringArrayNumpySemantics support sum?
         klass = TypeError
         msg = (
-            "'ArrowStringArrayNumpySemantics' with dtype string does not "
+            "'.*StringArrayNumpySemantics' with dtype str does not "
             f"support operation '{groupby_func_np.__name__}'"
         )