Skip to content

Commit a3cf70f

Browse files
jbrockmendel authored and jorisvandenbossche committed
TST (string dtype): fix groupby xfails with using_infer_string + update error message (pandas-dev#59430)
Co-authored-by: Joris Van den Bossche <[email protected]> (cherry picked from commit e5dd89d)
1 parent cacd4bb commit a3cf70f

File tree

13 files changed

+166
-43
lines changed

13 files changed

+166
-43
lines changed

pandas/core/arrays/arrow/array.py

+14
Original file line number | Diff line number | Diff line change
@@ -2285,6 +2285,20 @@ def _groupby_op(
22852285
**kwargs,
22862286
):
22872287
if isinstance(self.dtype, StringDtype):
2288+
if how in [
2289+
"prod",
2290+
"mean",
2291+
"median",
2292+
"cumsum",
2293+
"cumprod",
2294+
"std",
2295+
"sem",
2296+
"var",
2297+
"skew",
2298+
]:
2299+
raise TypeError(
2300+
f"dtype '{self.dtype}' does not support operation '{how}'"
2301+
)
22882302
return super()._groupby_op(
22892303
how=how,
22902304
has_dropped_na=has_dropped_na,

pandas/core/arrays/base.py

+14
Original file line number | Diff line number | Diff line change
@@ -2369,6 +2369,20 @@ def _groupby_op(
23692369
# GH#43682
23702370
if isinstance(self.dtype, StringDtype):
23712371
# StringArray
2372+
if op.how in [
2373+
"prod",
2374+
"mean",
2375+
"median",
2376+
"cumsum",
2377+
"cumprod",
2378+
"std",
2379+
"sem",
2380+
"var",
2381+
"skew",
2382+
]:
2383+
raise TypeError(
2384+
f"dtype '{self.dtype}' does not support operation '{how}'"
2385+
)
23722386
if op.how not in ["any", "all"]:
23732387
# Fail early to avoid conversion to object
23742388
op._get_cython_function(op.kind, op.how, np.dtype(object), False)

pandas/core/groupby/groupby.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -4394,9 +4394,9 @@ def quantile(
43944394
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)
43954395

43964396
def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
4397-
if is_object_dtype(vals.dtype):
4397+
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
43984398
raise TypeError(
4399-
"'quantile' cannot be performed against 'object' dtypes!"
4399+
f"dtype '{vals.dtype}' does not support operation 'quantile'"
44004400
)
44014401

44024402
inference: DtypeObj | None = None

pandas/tests/frame/test_stack_unstack.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -2083,7 +2083,7 @@ def test_unstack_period_frame(self):
20832083
@pytest.mark.filterwarnings(
20842084
"ignore:The previous implementation of stack is deprecated"
20852085
)
2086-
def test_stack_multiple_bug(self, future_stack):
2086+
def test_stack_multiple_bug(self, future_stack, using_infer_string):
20872087
# bug when some uniques are not present in the data GH#3170
20882088
id_col = ([1] * 3) + ([2] * 3)
20892089
name = (["a"] * 3) + (["b"] * 3)
@@ -2095,6 +2095,8 @@ def test_stack_multiple_bug(self, future_stack):
20952095
multi.columns.name = "Params"
20962096
unst = multi.unstack("ID")
20972097
msg = re.escape("agg function failed [how->mean,dtype->")
2098+
if using_infer_string:
2099+
msg = "dtype 'str' does not support operation 'mean'"
20982100
with pytest.raises(TypeError, match=msg):
20992101
unst.resample("W-THU").mean()
21002102
down = unst.resample("W-THU").mean(numeric_only=True)

pandas/tests/groupby/aggregate/test_cython.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -166,14 +166,14 @@ def test_cython_agg_return_dict():
166166

167167
def test_cython_fail_agg():
168168
dr = bdate_range("1/1/2000", periods=50)
169-
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
169+
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)
170170

171171
grouped = ts.groupby(lambda x: x.month)
172172
summed = grouped.sum()
173173
msg = "using SeriesGroupBy.sum"
174174
with tm.assert_produces_warning(FutureWarning, match=msg):
175175
# GH#53425
176-
expected = grouped.agg(np.sum)
176+
expected = grouped.agg(np.sum).astype(object)
177177
tm.assert_series_equal(summed, expected)
178178

179179

pandas/tests/groupby/methods/test_quantile.py

+4-5
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
174174
def test_quantile_raises():
175175
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
176176

177-
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
177+
msg = "dtype 'object' does not support operation 'quantile'"
178+
with pytest.raises(TypeError, match=msg):
178179
df.groupby("key").quantile()
179180

180181

@@ -253,7 +254,6 @@ def test_groupby_quantile_nullable_array(values, q):
253254
tm.assert_series_equal(result, expected)
254255

255256

256-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
257257
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
258258
@pytest.mark.parametrize("numeric_only", [True, False])
259259
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
@@ -263,9 +263,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
263263
expected = df.groupby("a")[["b"]].quantile(q)
264264
tm.assert_frame_equal(result, expected)
265265
else:
266-
with pytest.raises(
267-
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
268-
):
266+
msg = "dtype '.*' does not support operation 'quantile'"
267+
with pytest.raises(TypeError, match=msg):
269268
df.groupby("a").quantile(q, numeric_only=numeric_only)
270269

271270

pandas/tests/groupby/test_groupby.py

+42-14
Original file line number | Diff line number | Diff line change
@@ -640,7 +640,7 @@ def test_frame_multi_key_function_list():
640640
tm.assert_frame_equal(agged, expected)
641641

642642

643-
def test_frame_multi_key_function_list_partial_failure():
643+
def test_frame_multi_key_function_list_partial_failure(using_infer_string):
644644
data = DataFrame(
645645
{
646646
"A": [
@@ -691,6 +691,8 @@ def test_frame_multi_key_function_list_partial_failure():
691691
grouped = data.groupby(["A", "B"])
692692
funcs = ["mean", "std"]
693693
msg = re.escape("agg function failed [how->mean,dtype->")
694+
if using_infer_string:
695+
msg = "dtype 'str' does not support operation 'mean'"
694696
with pytest.raises(TypeError, match=msg):
695697
grouped.agg(funcs)
696698

@@ -981,9 +983,11 @@ def test_groupby_multi_corner(df):
981983
tm.assert_frame_equal(agged, expected)
982984

983985

984-
def test_raises_on_nuisance(df):
986+
def test_raises_on_nuisance(df, using_infer_string):
985987
grouped = df.groupby("A")
986988
msg = re.escape("agg function failed [how->mean,dtype->")
989+
if using_infer_string:
990+
msg = "dtype 'str' does not support operation 'mean'"
987991
with pytest.raises(TypeError, match=msg):
988992
grouped.agg("mean")
989993
with pytest.raises(TypeError, match=msg):
@@ -1026,15 +1030,18 @@ def test_keep_nuisance_agg(df, agg_function):
10261030
["sum", "mean", "prod", "std", "var", "sem", "median"],
10271031
)
10281032
@pytest.mark.parametrize("numeric_only", [True, False])
1029-
def test_omit_nuisance_agg(df, agg_function, numeric_only):
1033+
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
10301034
# GH 38774, GH 38815
10311035
grouped = df.groupby("A")
10321036

10331037
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
10341038
if agg_function in no_drop_nuisance and not numeric_only:
10351039
# Added numeric_only as part of GH#46560; these do not drop nuisance
10361040
# columns when numeric_only is False
1037-
if agg_function in ("std", "sem"):
1041+
if using_infer_string:
1042+
msg = f"dtype 'str' does not support operation '{agg_function}'"
1043+
klass = TypeError
1044+
elif agg_function in ("std", "sem"):
10381045
klass = ValueError
10391046
msg = "could not convert string to float: 'one'"
10401047
else:
@@ -1055,16 +1062,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
10551062
tm.assert_frame_equal(result, expected)
10561063

10571064

1058-
def test_raise_on_nuisance_python_single(df):
1065+
def test_raise_on_nuisance_python_single(df, using_infer_string):
10591066
# GH 38815
10601067
grouped = df.groupby("A")
1061-
with pytest.raises(ValueError, match="could not convert"):
1068+
1069+
err = ValueError
1070+
msg = "could not convert"
1071+
if using_infer_string:
1072+
err = TypeError
1073+
msg = "dtype 'str' does not support operation 'skew'"
1074+
with pytest.raises(err, match=msg):
10621075
grouped.skew()
10631076

10641077

1065-
def test_raise_on_nuisance_python_multiple(three_group):
1078+
def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
10661079
grouped = three_group.groupby(["A", "B"])
10671080
msg = re.escape("agg function failed [how->mean,dtype->")
1081+
if using_infer_string:
1082+
msg = "dtype 'str' does not support operation 'mean'"
10681083
with pytest.raises(TypeError, match=msg):
10691084
grouped.agg("mean")
10701085
with pytest.raises(TypeError, match=msg):
@@ -1102,12 +1117,16 @@ def test_nonsense_func():
11021117
df.groupby(lambda x: x + "foo")
11031118

11041119

1105-
def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
1120+
def test_wrap_aggregated_output_multindex(
1121+
multiindex_dataframe_random_data, using_infer_string
1122+
):
11061123
df = multiindex_dataframe_random_data.T
11071124
df["baz", "two"] = "peekaboo"
11081125

11091126
keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
11101127
msg = re.escape("agg function failed [how->mean,dtype->")
1128+
if using_infer_string:
1129+
msg = "dtype 'str' does not support operation 'mean'"
11111130
with pytest.raises(TypeError, match=msg):
11121131
df.groupby(keys).agg("mean")
11131132
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
@@ -1299,8 +1318,10 @@ def test_groupby_with_hier_columns():
12991318

13001319
def test_grouping_ndarray(df):
13011320
grouped = df.groupby(df["A"].values)
1321+
grouped2 = df.groupby(df["A"].rename(None))
1322+
13021323
result = grouped.sum()
1303-
expected = df.groupby(df["A"].rename(None)).sum()
1324+
expected = grouped2.sum()
13041325
tm.assert_frame_equal(result, expected)
13051326

13061327

@@ -1793,8 +1814,8 @@ def test_no_dummy_key_names(df):
17931814
result = df.groupby(df["A"].values).sum()
17941815
assert result.index.name is None
17951816

1796-
result = df.groupby([df["A"].values, df["B"].values]).sum()
1797-
assert result.index.names == (None, None)
1817+
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
1818+
assert result2.index.names == (None, None)
17981819

17991820

18001821
def test_groupby_sort_multiindex_series():
@@ -2099,6 +2120,7 @@ def get_categorical_invalid_expected():
20992120
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
21002121
is_dt64 = df.dtypes.iloc[0].kind == "M"
21012122
is_cat = isinstance(values, Categorical)
2123+
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
21022124

21032125
if (
21042126
isinstance(values, Categorical)
@@ -2123,13 +2145,15 @@ def get_categorical_invalid_expected():
21232145

21242146
if op in ["prod", "sum", "skew"]:
21252147
# ops that require more than just ordered-ness
2126-
if is_dt64 or is_cat or is_per:
2148+
if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
21272149
# GH#41291
21282150
# datetime64 -> prod and sum are invalid
21292151
if is_dt64:
21302152
msg = "datetime64 type does not support"
21312153
elif is_per:
21322154
msg = "Period type does not support"
2155+
elif is_str:
2156+
msg = f"dtype 'str' does not support operation '{op}'"
21332157
else:
21342158
msg = "category type does not support"
21352159
if op == "skew":
@@ -3083,7 +3107,7 @@ def test_obj_with_exclusions_duplicate_columns():
30833107
def test_groupby_numeric_only_std_no_result(numeric_only):
30843108
# GH 51080
30853109
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
3086-
df = DataFrame(dicts_non_numeric)
3110+
df = DataFrame(dicts_non_numeric, dtype=object)
30873111
dfgb = df.groupby("a", as_index=False, sort=False)
30883112

30893113
if numeric_only:
@@ -3142,10 +3166,14 @@ def test_grouping_with_categorical_interval_columns():
31423166
def test_groupby_sum_on_nan_should_return_nan(bug_var):
31433167
# GH 24196
31443168
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
3169+
if isinstance(bug_var, str):
3170+
df = df.astype(object)
31453171
dfgb = df.groupby(lambda x: x)
31463172
result = dfgb.sum(min_count=1)
31473173

3148-
expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
3174+
expected_df = DataFrame(
3175+
[bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
3176+
)
31493177
tm.assert_frame_equal(result, expected_df)
31503178

31513179

pandas/tests/groupby/test_groupby_subclass.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj):
109109

110110
df = obj(
111111
{
112-
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
112+
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
113113
"Quantity": [18, 3, 5, 1, 9, 3],
114114
"Date": [
115115
datetime(2013, 9, 1, 13, 0),

pandas/tests/groupby/test_numeric_only.py

+14-6
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,8 @@ def df(self):
2929
"group": [1, 1, 2],
3030
"int": [1, 2, 3],
3131
"float": [4.0, 5.0, 6.0],
32-
"string": list("abc"),
32+
"string": Series(["a", "b", "c"], dtype="str"),
33+
"object": Series(["a", "b", "c"], dtype=object),
3334
"category_string": Series(list("abc")).astype("category"),
3435
"category_int": [7, 8, 9],
3536
"datetime": date_range("20130101", periods=3),
@@ -41,6 +42,7 @@ def df(self):
4142
"int",
4243
"float",
4344
"string",
45+
"object",
4446
"category_string",
4547
"category_int",
4648
"datetime",
@@ -113,6 +115,7 @@ def test_first_last(self, df, method):
113115
"int",
114116
"float",
115117
"string",
118+
"object",
116119
"category_string",
117120
"category_int",
118121
"datetime",
@@ -160,7 +163,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
160163

161164
# object dtypes for transformations are not implemented in Cython and
162165
# have no Python fallback
163-
exception = NotImplementedError if method.startswith("cum") else TypeError
166+
exception = (
167+
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
168+
)
164169

165170
if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
166171
# The methods default to numeric_only=False and raise TypeError
@@ -171,6 +176,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
171176
re.escape(f"agg function failed [how->{method},dtype->object]"),
172177
# cumsum/cummin/cummax/cumprod
173178
"function is not implemented for this dtype",
179+
f"dtype 'str' does not support operation '{method}'",
174180
]
175181
)
176182
with pytest.raises(exception, match=msg):
@@ -181,7 +187,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
181187
"category type does not support sum operations",
182188
re.escape(f"agg function failed [how->{method},dtype->object]"),
183189
re.escape(f"agg function failed [how->{method},dtype->string]"),
184-
re.escape(f"agg function failed [how->{method},dtype->str]"),
190+
f"dtype 'str' does not support operation '{method}'",
185191
]
186192
)
187193
with pytest.raises(exception, match=msg):
@@ -199,7 +205,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
199205
f"Cannot perform {method} with non-ordered Categorical",
200206
re.escape(f"agg function failed [how->{method},dtype->object]"),
201207
re.escape(f"agg function failed [how->{method},dtype->string]"),
202-
re.escape(f"agg function failed [how->{method},dtype->str]"),
208+
f"dtype 'str' does not support operation '{method}'",
203209
]
204210
)
205211
with pytest.raises(exception, match=msg):
@@ -384,7 +390,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
384390
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
385391
]
386392
)
387-
if kernel == "idxmin":
393+
if kernel == "quantile":
394+
msg = "dtype 'object' does not support operation 'quantile'"
395+
elif kernel == "idxmin":
388396
msg = "'<' not supported between instances of 'type' and 'type'"
389397
elif kernel == "idxmax":
390398
msg = "'>' not supported between instances of 'type' and 'type'"
@@ -458,7 +466,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
458466
# that succeed should not be allowed to fail (without deprecation, at least)
459467
if groupby_func in fails_on_numeric_object and dtype is object:
460468
if groupby_func == "quantile":
461-
msg = "cannot be performed against 'object' dtypes"
469+
msg = "dtype 'object' does not support operation 'quantile'"
462470
else:
463471
msg = "is not supported for object dtype"
464472
warn = FutureWarning if groupby_func == "fillna" else None

0 commit comments

Comments
 (0)