From 34b36fb8b8aeba8a100b5df2d7592fc14b827d65 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Aug 2024 14:24:00 -0700 Subject: [PATCH 01/12] TST: fix groupby xfails with using_infer_string --- pandas/tests/groupby/test_raises.py | 60 ++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f28967fa81ddb..7c469fd57f567 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -106,10 +104,9 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -183,6 +180,44 @@ def test_groupby_raises_string( ), }[groupby_func] + if using_infer_string: + if klass is not None: + if re.escape("agg function failed") in msg: + msg = msg.replace("object", "string") + elif groupby_func in [ + "cumsum", + "cumprod", + "cummin", + "cummax", + "std", + "sem", + "skew", + ]: + msg = msg.replace("object", "string") + elif groupby_func == "quantile": + msg = "No matching signature found" + elif groupby_func == "corrwith": + msg = ( + "'ArrowStringArrayNumpySemantics' with dtype string does " + "not support operation 'mean'" + ) + else: + import pyarrow as pa + + klass = pa.lib.ArrowNotImplementedError + if groupby_func == "pct_change": + msg = "Function 'divide' has no kernel matching input types" + elif groupby_func == "diff": + msg = ( + "Function 'subtract_checked' has no kernel matching " + "input types" + ) + else: + msg = ( + f"Function '{groupby_func}' has no kernel matching " + "input types" + ) + if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" @@ -208,11 +243,15 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -228,6 +267,15 @@ def test_groupby_raises_string_np( "Could not convert string .* to numeric", ), }[groupby_func_np] + + if using_infer_string: + # TODO: should ArrowStringArrayNumpySemantics support sum? + klass = TypeError + msg = ( + "'ArrowStringArrayNumpySemantics' with dtype string does not " + f"support operation '{groupby_func_np.__name__}'" + ) + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) From 91278297313b496535ed9dee8871eab1318d9691 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Aug 2024 14:20:55 -0700 Subject: [PATCH 02/12] TST: update _groupby_op to raise --- pandas/core/arrays/arrow/array.py | 13 +++++ pandas/core/arrays/base.py | 15 ++++++ pandas/core/groupby/groupby.py | 4 ++ pandas/tests/groupby/test_groupby.py | 14 +++-- pandas/tests/groupby/test_raises.py | 79 +++++++++++++++------------- 5 files changed, 84 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a374afcacc45a..27034ce63ff53 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2270,6 +2270,19 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "sum", + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError(f"{self.dtype} dtype does not support {how} operations") return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2124f86b03b9c..6259de397b170 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2535,6 +2535,21 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "sum", + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"{self.dtype} dtype does not support {op.how} operations" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c9c92594ebe7..8f45e110594b1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4287,6 +4287,10 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: raise TypeError( "'quantile' cannot be performed against 'object' dtypes!" ) + elif isinstance(vals.dtype, StringDtype): + raise TypeError( + f"{vals.dtype} dtype does not support quantile operations" + ) inference: DtypeObj | None = None if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 11b874d0b1608..c5158c56622a7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -702,15 +702,20 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") + if using_infer_string: + no_drop_nuisance += ("sum",) if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"str dtype does not support {agg_function} operations" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -1772,6 +1777,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -1796,13 +1802,15 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or is_str: # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = "str dtype does not support" else: msg = "category type does not support" if op == "skew": diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 7c469fd57f567..6925e0158fbea 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -181,42 +181,46 @@ def test_groupby_raises_string( }[groupby_func] if using_infer_string: - if klass is not None: - if re.escape("agg function failed") in msg: - msg = msg.replace("object", "string") - elif groupby_func in [ - "cumsum", - "cumprod", - "cummin", - "cummax", - "std", - "sem", - "skew", - ]: - msg = msg.replace("object", "string") - elif groupby_func == "quantile": - msg = "No matching signature found" - elif groupby_func == "corrwith": - msg = ( - "'ArrowStringArrayNumpySemantics' with dtype string does " - "not support operation 'mean'" - ) - else: - import pyarrow as pa - - klass = pa.lib.ArrowNotImplementedError - if groupby_func == "pct_change": - msg = "Function 'divide' has no kernel matching input types" - elif groupby_func == "diff": - msg = ( - "Function 'subtract_checked' has no kernel matching " - "input types" - ) - else: - msg = ( - f"Function '{groupby_func}' has no kernel matching " - "input types" - ) + if groupby_func in [ + "sum", + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "quantile", + ]: + msg = f"str dtype does not support {groupby_func} operations" + if groupby_func == "sum": + # The object-dtype allows this, StringArray variants do not. + klass = TypeError + elif groupby_func in ["sem", "std", "skew"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + import pyarrow as pa + + klass = pa.lib.ArrowNotImplementedError + msg = "Function 'divide' has no kernel matching input types" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + import pyarrow as pa + + klass = pa.lib.ArrowNotImplementedError + msg = "Function 'subtract_checked' has no kernel matching input types" + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = ( + "'.*NumpySemantics' with dtype str does " "not support operation 'mean'" + ) if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" @@ -269,10 +273,9 @@ def test_groupby_raises_string_np( }[groupby_func_np] if using_infer_string: - # TODO: should ArrowStringArrayNumpySemantics support sum? klass = TypeError msg = ( - "'ArrowStringArrayNumpySemantics' with dtype string does not " + "'.*StringArrayNumpySemantics' with dtype str does not " f"support operation '{groupby_func_np.__name__}'" ) From e7ae735e65b3ee55ee41cbc75ea3133a49364a8f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Aug 2024 14:12:58 -0700 Subject: [PATCH 03/12] update tests --- pandas/tests/frame/test_stack_unstack.py | 4 ++- pandas/tests/groupby/test_groupby.py | 18 +++++++++--- pandas/tests/groupby/test_groupby_subclass.py | 2 +- pandas/tests/groupby/test_numeric_only.py | 4 +-- pandas/tests/groupby/test_raises.py | 11 ++++--- pandas/tests/resample/test_resample_api.py | 29 +++++++++++++++++-- pandas/tests/reshape/merge/test_join.py | 4 ++- pandas/tests/reshape/test_pivot.py | 10 +++++-- 8 files changed, 63 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index b4f02b6f81b6f..c9ddb7bf60085 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2113,7 +2113,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2125,6 +2125,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c5158c56622a7..74582267a6475 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -428,7 +428,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -479,6 +479,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -665,9 +667,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -743,9 +747,11 @@ def test_raise_on_nuisance_python_single(df): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -783,12 +789,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 0832b67b38098..a1f4627475bab 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 41e00f8121b14..8a09625dba79f 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -180,7 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"str dtype does not support {method} operations", ] ) with pytest.raises(exception, match=msg): @@ -198,7 +198,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"str dtype does not support {method} operations", ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6925e0158fbea..acc9729d235b0 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -274,10 +274,13 @@ def test_groupby_raises_string_np( if using_infer_string: klass = TypeError - msg = ( - "'.*StringArrayNumpySemantics' with dtype str does not " - f"support operation '{groupby_func_np.__name__}'" - ) + if df["d"].dtype.storage == "python": + msg = "Cannot perform reduction 'mean' with string dtype" + else: + msg = ( + "'ArrowStringArrayNumpySemantics' with dtype str does not " + f"support operation '{groupby_func_np.__name__}'" + ) _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a8fb1b392322d..18db05f554140 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -187,7 +187,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -197,6 +197,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -881,7 +883,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -898,11 +902,21 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"str dtype does not support {method} operations" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"str dtype does not support {method} operations" else: klass = ValueError msg = expected_data with pytest.raises(klass, match=msg): _ = func(**kwargs) + elif method == "sum" and using_infer_string and numeric_only is not True: + klass = TypeError + msg = "str dtype does not support sum operations" + with pytest.raises(klass, match=msg): + _ = func(**kwargs) else: result = func(**kwargs) expected = DataFrame(expected_data, index=expected_index) @@ -932,7 +946,9 @@ def test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -948,8 +964,15 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "str dtype does not support prod operations" + with pytest.raises(TypeError, match=msg): + func(**kwargs) + elif method == "sum" and using_infer_string and numeric_only is not True: + msg = "str dtype does not support sum operations" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 0f743332acbbe..8e3da5b3bee38 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -620,7 +620,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -631,6 +631,8 @@ def test_mixed_type_join_with_suffix(self): grouped = df.groupby("id") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 44b96afaa4ef5..64ba8da907557 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -135,7 +135,7 @@ def test_pivot_table_categorical_observed_equal(self, observed): tm.assert_frame_equal(result, expected) - def test_pivot_table_nocols(self): + def test_pivot_table_nocols(self, using_infer_string): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) @@ -935,12 +935,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -990,7 +992,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1014,6 +1016,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "str dtype does not support mean operations" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: From 4ca5a2f12e38413ba5ad08200f3308b2b57b5c94 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 19 Aug 2024 10:45:42 -0700 Subject: [PATCH 04/12] Fix failing test_in_numeric_groupby --- pandas/tests/extension/base/groupby.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..c1480f54163e0 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -6,7 +6,6 @@ is_bool_dtype, is_numeric_dtype, is_object_dtype, - is_string_dtype, ) import pandas as pd @@ -151,7 +150,6 @@ def test_in_numeric_groupby(self, data_for_grouping): is_numeric_dtype(dtype) or is_bool_dtype(dtype) or dtype.name == "decimal" - or is_string_dtype(dtype) or is_object_dtype(dtype) or dtype.kind == "m" # in particular duration[*][pyarrow] ): From 2c28a2c868fd3fb7ae980c2588a5cf5975e8ff2e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Aug 2024 14:21:45 -0700 Subject: [PATCH 05/12] update exception messages --- asv_bench/benchmarks/groupby.py | 2 + pandas/core/arrays/arrow/array.py | 4 +- pandas/core/arrays/base.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/generic/test_frame.py | 1 + .../tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/methods/test_quantile.py | 11 ++-- pandas/tests/groupby/test_groupby.py | 63 ++++++++++++++----- pandas/tests/groupby/test_numeric_only.py | 6 +- pandas/tests/groupby/test_raises.py | 12 ++-- pandas/tests/resample/test_resample_api.py | 12 ++-- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/test_pivot.py | 7 ++- 14 files changed, 84 insertions(+), 44 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index abffa1f702b9c..19e95fefd4986 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -760,6 +760,8 @@ def setup(self, dtype, method): ) def time_str_func(self, dtype, method): + if dtype == "string[python]" and method == "sum": + raise NotImplementedError self.df.groupby("a")[self.df.columns[1:]].agg(method) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 27034ce63ff53..ef8192e80f9fe 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2282,7 +2282,9 @@ def _groupby_op( "var", "skew", ]: - raise TypeError(f"{self.dtype} dtype does not support {how} operations") + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6259de397b170..f47342c0c031a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2548,7 +2548,7 @@ def _groupby_op( "skew", ]: raise TypeError( - f"{self.dtype} dtype does not support {op.how} operations" + f"dtype '{self.dtype}' does not support operation '{how}'" ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8f45e110594b1..007802131bcbe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4289,7 +4289,7 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: ) elif isinstance(vals.dtype, StringDtype): raise TypeError( - f"{vals.dtype} dtype does not support quantile operations" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index c9ddb7bf60085..57c803c23b001 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2126,7 +2126,7 @@ def test_stack_multiple_bug(self, future_stack, using_infer_string): unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 1d0f491529b56..7b74856273ad3 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -61,6 +61,7 @@ def test_metadata_propagation_indiv_groupby(self): "D": np.random.default_rng(2).standard_normal(8), } ) + df = df.astype({"A": object, "B": object}) result = df.groupby("A").sum() tm.assert_metadata_equivalent(df, result) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 46c27849356b5..a14d4e8385576 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1020,7 +1020,7 @@ def test_groupby_as_index_agg(df): result2 = grouped.agg({"C": "mean", "D": "sum"}) expected2 = grouped.mean(numeric_only=True) - expected2["D"] = grouped.sum()["D"] + expected2["D"] = grouped.sum(numeric_only=True)["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 0e31c0698cb1e..e2a5ab04c8887 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -241,19 +241,20 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): +def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_string): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) if numeric_only: result = df.groupby("a").quantile(q, numeric_only=numeric_only) expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" - ): + msg = "'quantile' cannot be performed against 'object' dtypes!" + if using_infer_string: + msg = "dtype 'str' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 74582267a6475..de4a225b92298 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -480,7 +480,7 @@ def test_frame_multi_key_function_list_partial_failure(using_infer_string): funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -578,6 +578,7 @@ def test_ops_not_as_index(reduction_func): def test_as_index_series_return_frame(df): + df = df.astype({"A": object, "B": object}) grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) @@ -671,7 +672,7 @@ def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -717,7 +718,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False if using_infer_string: - msg = f"str dtype does not support {agg_function} operations" + msg = f"dtype 'str' does not support operation '{agg_function}'" klass = TypeError elif agg_function in ("std", "sem"): klass = ValueError @@ -740,10 +741,16 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() @@ -751,7 +758,7 @@ def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -798,7 +805,7 @@ def test_wrap_aggregated_output_multindex( keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -976,10 +983,20 @@ def test_groupby_with_hier_columns(): tm.assert_index_equal(result.columns, df.columns[:-1]) -def test_grouping_ndarray(df): +def test_grouping_ndarray(df, using_infer_string): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + + if using_infer_string: + msg = "dtype 'str' does not support operation 'sum'" + with pytest.raises(TypeError, match=msg): + grouped.sum() + with pytest.raises(TypeError, match=msg): + grouped2.sum() + return + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1478,13 +1495,23 @@ def f(group): assert names == expected_names -def test_no_dummy_key_names(df): +def test_no_dummy_key_names(df, using_infer_string): # see gh-1291 - result = df.groupby(df["A"].values).sum() + gb = df.groupby(df["A"].values) + gb2 = df.groupby([df["A"].values, df["B"].values]) + if using_infer_string: + msg = "dtype 'str' does not support operation 'sum'" + with pytest.raises(TypeError, match=msg): + gb.sum() + with pytest.raises(TypeError, match=msg): + gb2.sum() + return + + result = gb.sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = gb2.sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -1820,7 +1847,7 @@ def get_categorical_invalid_expected(): elif is_per: msg = "Period type does not support" elif is_str: - msg = "str dtype does not support" + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" if op == "skew": @@ -2750,7 +2777,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -2809,10 +2836,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 8a09625dba79f..3e5c7daf933df 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -28,7 +28,7 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -180,7 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - f"str dtype does not support {method} operations", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -198,7 +198,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - f"str dtype does not support {method} operations", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index acc9729d235b0..e2ca6b08a8478 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -194,7 +194,7 @@ def test_groupby_raises_string( "skew", "quantile", ]: - msg = f"str dtype does not support {groupby_func} operations" + msg = f"dtype 'str' does not support operation '{groupby_func}'" if groupby_func == "sum": # The object-dtype allows this, StringArray variants do not. klass = TypeError @@ -213,14 +213,13 @@ def test_groupby_raises_string( # there. import pyarrow as pa + # TODO(infer_string): avoid bubbling up pyarrow exceptions klass = pa.lib.ArrowNotImplementedError msg = "Function 'subtract_checked' has no kernel matching input types" elif groupby_func in ["cummin", "cummax"]: msg = msg.replace("object", "str") elif groupby_func == "corrwith": - msg = ( - "'.*NumpySemantics' with dtype str does " "not support operation 'mean'" - ) + msg = "'.*NumpySemantics' with dtype str does not support operation 'mean'" if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" @@ -275,7 +274,10 @@ def test_groupby_raises_string_np( if using_infer_string: klass = TypeError if df["d"].dtype.storage == "python": - msg = "Cannot perform reduction 'mean' with string dtype" + msg = ( + f"Cannot perform reduction '{groupby_func_np.__name__}' " + "with string dtype" + ) else: msg = ( "'ArrowStringArrayNumpySemantics' with dtype str does not " diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 18db05f554140..940c9e6700ea2 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -198,7 +198,7 @@ def tests_raises_on_nuisance(test_frame, using_infer_string): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -903,10 +903,10 @@ def test_frame_downsample_method( klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") if using_infer_string: - msg = f"str dtype does not support {method} operations" + msg = f"dtype 'str' does not support operation '{method}'" elif method in ["sum", "std", "sem"] and using_infer_string: klass = TypeError - msg = f"str dtype does not support {method} operations" + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -914,7 +914,7 @@ def test_frame_downsample_method( _ = func(**kwargs) elif method == "sum" and using_infer_string and numeric_only is not True: klass = TypeError - msg = "str dtype does not support sum operations" + msg = f"dtype 'str' does not support operation '{method}'" with pytest.raises(klass, match=msg): _ = func(**kwargs) else: @@ -965,11 +965,11 @@ def test_series_downsample_method( elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") if using_infer_string: - msg = "str dtype does not support prod operations" + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) elif method == "sum" and using_infer_string and numeric_only is not True: - msg = "str dtype does not support sum operations" + msg = "dtype 'str' does not support operation 'sum'" with pytest.raises(TypeError, match=msg): func(**kwargs) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 8e3da5b3bee38..65bfea0b9beea 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -632,7 +632,7 @@ def test_mixed_type_join_with_suffix(self, using_infer_string): grouped = df.groupby("id") msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 64ba8da907557..9f26a31f1bdc1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -135,10 +135,11 @@ def test_pivot_table_categorical_observed_equal(self, observed): tm.assert_frame_equal(result, expected) - def test_pivot_table_nocols(self, using_infer_string): + def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) + df = df.astype({"rows": object, "cols": object}) rs = df.pivot_table(columns="cols", aggfunc="sum") xp = df.pivot_table(index="cols", aggfunc="sum").T tm.assert_frame_equal(rs, xp) @@ -942,7 +943,7 @@ def test_no_col(self, data, using_infer_string): data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -1017,7 +1018,7 @@ def test_margin_with_only_columns_defined( if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") if using_infer_string: - msg = "str dtype does not support mean operations" + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: From 708e5d3220b9f16947e6e430e75a3cf0058b769e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Aug 2024 15:48:52 -0700 Subject: [PATCH 06/12] update message --- pandas/tests/groupby/test_raises.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index e2ca6b08a8478..d4c135a062052 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -219,7 +219,13 @@ def test_groupby_raises_string( elif groupby_func in ["cummin", "cummax"]: msg = msg.replace("object", "str") elif groupby_func == "corrwith": - msg = "'.*NumpySemantics' with dtype str does not support operation 'mean'" + if df["d"].dtype.storage == "pyarrow": + msg = ( + "ArrowStringArrayNumpySemantics' with dtype str does not " + "support operation 'mean'" + ) + else: + msg = "Cannot perform reduction 'mean' with string dtype" if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" From 75eddea1673ddff9bbf848bbedae091186e567c5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 07:56:17 -0700 Subject: [PATCH 07/12] skip no-longer-supported --- asv_bench/benchmarks/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 19e95fefd4986..352615ca54cba 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -752,6 +752,9 @@ class String: ] def setup(self, dtype, method): + if dtype == "string[python]" and method == "sum": + raise NotImplementedError # skip benchmark + cols = list("abcdefghjkl") self.df = DataFrame( np.random.randint(0, 100, size=(10_000, len(cols))), @@ -760,8 +763,6 @@ def setup(self, dtype, method): ) def time_str_func(self, dtype, method): - if dtype == "string[python]" and method == "sum": - raise NotImplementedError self.df.groupby("a")[self.df.columns[1:]].agg(method) From 72c59cf7b64526ce69342f095dc7bfd7bdef249e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Aug 2024 07:53:10 -0700 Subject: [PATCH 08/12] update exception messages --- pandas/core/groupby/groupby.py | 6 +---- pandas/tests/groupby/methods/test_quantile.py | 5 +---- pandas/tests/groupby/test_groupby.py | 22 ++++--------------- pandas/tests/groupby/test_numeric_only.py | 6 +++-- pandas/tests/groupby/test_raises.py | 10 ++------- 5 files changed, 12 insertions(+), 37 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 007802131bcbe..efd4f1112aa09 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4283,11 +4283,7 @@ def quantile( starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): - raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" - ) - elif isinstance(vals.dtype, StringDtype): + if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype): raise TypeError( f"dtype '{vals.dtype}' does not support operation 'quantile'" ) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index e2a5ab04c8887..3f22851426282 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -241,7 +241,6 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) -# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_string): @@ -251,9 +250,7 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_s expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: - msg = "'quantile' cannot be performed against 'object' dtypes!" - if using_infer_string: - msg = "dtype 'str' does not support operation 'quantile'" + msg = "dtype '.*' does not support operation 'quantile'" with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de4a225b92298..17eb30cb76f3b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -983,18 +983,11 @@ def test_groupby_with_hier_columns(): tm.assert_index_equal(result.columns, df.columns[:-1]) -def test_grouping_ndarray(df, using_infer_string): +def test_grouping_ndarray(df): + df = df.astype({"A": object, "B": object}) grouped = df.groupby(df["A"].values) grouped2 = df.groupby(df["A"].rename(None)) - if using_infer_string: - msg = "dtype 'str' does not support operation 'sum'" - with pytest.raises(TypeError, match=msg): - grouped.sum() - with pytest.raises(TypeError, match=msg): - grouped2.sum() - return - result = grouped.sum() expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1495,18 +1488,11 @@ def f(group): assert names == expected_names -def test_no_dummy_key_names(df, using_infer_string): +def test_no_dummy_key_names(df): # see gh-1291 + df = df.astype({"A": object, "B": object}) gb = df.groupby(df["A"].values) gb2 = df.groupby([df["A"].values, df["B"].values]) - if using_infer_string: - msg = "dtype 'str' does not support operation 'sum'" - with pytest.raises(TypeError, match=msg): - gb.sum() - with pytest.raises(TypeError, match=msg): - gb2.sum() - return - result = gb.sum() assert result.index.name is None diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 3e5c7daf933df..4d4800ca1b051 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -299,7 +299,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" @@ -379,7 +381,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index d4c135a062052..629e8466a99b3 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -204,18 +204,12 @@ def test_groupby_raises_string( elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": # This doesn't go through EA._groupby_op so the message isn't controlled # there. - import pyarrow as pa - - klass = pa.lib.ArrowNotImplementedError - msg = "Function 'divide' has no kernel matching input types" + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": # This doesn't go through EA._groupby_op so the message isn't controlled # there. - import pyarrow as pa + msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" - # TODO(infer_string): avoid bubbling up pyarrow exceptions - klass = pa.lib.ArrowNotImplementedError - msg = "Function 'subtract_checked' has no kernel matching input types" elif groupby_func in ["cummin", "cummax"]: msg = msg.replace("object", "str") elif groupby_func == "corrwith": From 10be506223add53686b6de5c3e7d7a2ad12c76b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Aug 2024 08:26:21 -0700 Subject: [PATCH 09/12] update exception message --- pandas/tests/groupby/test_raises.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 629e8466a99b3..c7ff64f5bcd5a 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -166,7 +166,7 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), From c8ebe0793543491bd55994d438f08917798e7f2b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Aug 2024 08:26:55 -0700 Subject: [PATCH 10/12] update exception message --- pandas/tests/groupby/methods/test_quantile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 3f22851426282..49c4b4ac3c474 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -162,7 +162,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + msg = "dtype 'object' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() From 08713262105e36841e692b0d7cba244ac4dcf1bd Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Nov 2024 11:19:18 -0700 Subject: [PATCH 11/12] Update now that .sum() is supported --- pandas/core/arrays/arrow/array.py | 1 - pandas/core/arrays/base.py | 1 - pandas/tests/extension/base/groupby.py | 2 ++ pandas/tests/groupby/aggregate/test_cython.py | 4 +-- pandas/tests/groupby/test_groupby.py | 4 +-- pandas/tests/groupby/test_raises.py | 26 +++++-------------- pandas/tests/resample/test_resample_api.py | 9 ------- 7 files changed, 12 insertions(+), 35 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e08b78b98e642..7c42bb5a727ba 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2304,7 +2304,6 @@ def _groupby_op( ): if isinstance(self.dtype, StringDtype): if how in [ - "sum", "prod", "mean", "median", diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6cf9cca341794..4835d808f2433 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2609,7 +2609,6 @@ def _groupby_op( if isinstance(self.dtype, StringDtype): # StringArray if op.how in [ - "sum", "prod", "mean", "median", diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index c1480f54163e0..bab8566a06dc2 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -6,6 +6,7 @@ is_bool_dtype, is_numeric_dtype, is_object_dtype, + is_string_dtype, ) import pandas as pd @@ -150,6 +151,7 @@ def test_in_numeric_groupby(self, data_for_grouping): is_numeric_dtype(dtype) or is_bool_dtype(dtype) or dtype.name == "decimal" + or is_string_dtype(dtype) or is_object_dtype(dtype) or dtype.kind == "m" # in particular duration[*][pyarrow] ): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d28eb227314c7..b937e7dcc8136 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -148,11 +148,11 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a434391983c01..ac7f305880878 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -709,8 +709,6 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): grouped = df.groupby("A") no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") - if using_infer_string: - no_drop_nuisance += ("sum",) if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False @@ -1814,7 +1812,7 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per or is_str: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f3768ee3433b5..e915011875c60 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -182,7 +182,6 @@ def test_groupby_raises_string( if using_infer_string: if groupby_func in [ - "sum", "prod", "mean", "median", @@ -213,13 +212,7 @@ def test_groupby_raises_string( elif groupby_func in ["cummin", "cummax"]: msg = msg.replace("object", "str") elif groupby_func == "corrwith": - if df["d"].dtype.storage == "pyarrow": - msg = ( - "ArrowStringArrayNumpySemantics' with dtype str does not " - "support operation 'mean'" - ) - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" @@ -273,17 +266,12 @@ def test_groupby_raises_string_np( }[groupby_func_np] if using_infer_string: - klass = TypeError - if df["d"].dtype.storage == "python": - msg = ( - f"Cannot perform reduction '{groupby_func_np.__name__}' " - "with string dtype" - ) - else: - msg = ( - "'ArrowStringArrayNumpySemantics' with dtype str does not " - f"support operation '{groupby_func_np.__name__}'" - ) + if groupby_func_np is np.mean: + klass = TypeError + msg = ( + f"Cannot perform reduction '{groupby_func_np.__name__}' " + "with string dtype" + ) _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 940c9e6700ea2..b7b80b5e427ff 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -912,11 +912,6 @@ def test_frame_downsample_method( msg = expected_data with pytest.raises(klass, match=msg): _ = func(**kwargs) - elif method == "sum" and using_infer_string and numeric_only is not True: - klass = TypeError - msg = f"dtype 'str' does not support operation '{method}'" - with pytest.raises(klass, match=msg): - _ = func(**kwargs) else: result = func(**kwargs) expected = DataFrame(expected_data, index=expected_index) @@ -968,10 +963,6 @@ def test_series_downsample_method( msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) - elif method == "sum" and using_infer_string and numeric_only is not True: - msg = "dtype 'str' does not support operation 'sum'" - with pytest.raises(TypeError, match=msg): - func(**kwargs) else: result = func(**kwargs) From baa1dd95ef140deb9ec87d7fdf523c46ac241ace Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 09:28:29 +0100 Subject: [PATCH 12/12] more cleanups now sum is implemented --- asv_bench/benchmarks/groupby.py | 3 --- pandas/tests/generic/test_frame.py | 1 - pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/methods/test_quantile.py | 2 +- pandas/tests/groupby/test_groupby.py | 9 ++------- pandas/tests/groupby/test_numeric_only.py | 10 ++++++++-- pandas/tests/groupby/test_raises.py | 5 +---- pandas/tests/reshape/test_pivot.py | 1 - 8 files changed, 13 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 352615ca54cba..abffa1f702b9c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -752,9 +752,6 @@ class String: ] def setup(self, dtype, method): - if dtype == "string[python]" and method == "sum": - raise NotImplementedError # skip benchmark - cols = list("abcdefghjkl") self.df = DataFrame( np.random.randint(0, 100, size=(10_000, len(cols))), diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 7b74856273ad3..1d0f491529b56 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -61,7 +61,6 @@ def test_metadata_propagation_indiv_groupby(self): "D": np.random.default_rng(2).standard_normal(8), } ) - df = df.astype({"A": object, "B": object}) result = df.groupby("A").sum() tm.assert_metadata_equivalent(df, result) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index a14d4e8385576..46c27849356b5 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1020,7 +1020,7 @@ def test_groupby_as_index_agg(df): result2 = grouped.agg({"C": "mean", "D": "sum"}) expected2 = grouped.mean(numeric_only=True) - expected2["D"] = grouped.sum(numeric_only=True)["D"] + expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 49c4b4ac3c474..4a8ad65200caa 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -244,7 +244,7 @@ def test_groupby_quantile_nullable_array(values, q): @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_string): +def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) if numeric_only: result = df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ac7f305880878..3305b48a4dcdc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -575,7 +575,6 @@ def test_ops_not_as_index(reduction_func): def test_as_index_series_return_frame(df): - df = df.astype({"A": object, "B": object}) grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) @@ -979,7 +978,6 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): - df = df.astype({"A": object, "B": object}) grouped = df.groupby(df["A"].values) grouped2 = df.groupby(df["A"].rename(None)) @@ -1477,13 +1475,10 @@ def f(group): def test_no_dummy_key_names(df): # see gh-1291 - df = df.astype({"A": object, "B": object}) - gb = df.groupby(df["A"].values) - gb2 = df.groupby([df["A"].values, df["B"].values]) - result = gb.sum() + result = df.groupby(df["A"].values).sum() assert result.index.name is None - result2 = gb2.sum() + result2 = df.groupby([df["A"].values, df["B"].values]).sum() assert result2.index.names == (None, None) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 4d4800ca1b051..cb4569812f600 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -28,7 +28,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": Series(["a", "b", "c"], dtype=object), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -40,6 +41,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -112,6 +114,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index e915011875c60..1e0a15d0ba796 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -194,10 +194,7 @@ def test_groupby_raises_string( "quantile", ]: msg = f"dtype 'str' does not support operation '{groupby_func}'" - if groupby_func == "sum": - # The object-dtype allows this, StringArray variants do not. - klass = TypeError - elif groupby_func in ["sem", "std", "skew"]: + if groupby_func in ["sem", "std", "skew"]: # The object-dtype raises ValueError when trying to convert to numeric. klass = TypeError elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 72dc1be251064..d8a9acdc561fd 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -139,7 +139,6 @@ def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) - df = df.astype({"rows": object, "cols": object}) rs = df.pivot_table(columns="cols", aggfunc="sum") xp = df.pivot_table(index="cols", aggfunc="sum").T tm.assert_frame_equal(rs, xp)