Skip to content

Commit 979fb2e

Browse files
jbrockmendel authored and jorisvandenbossche committed
TST (string dtype): fix groupby xfails with using_infer_string + update error message (pandas-dev#59430)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 71ef0aa commit 979fb2e

File tree

13 files changed: +170 −43 lines changed

pandas/core/arrays/arrow/array.py

+14
Original file line number | Diff line number | Diff line change
@@ -2312,6 +2312,20 @@ def _groupby_op(
23122312
**kwargs,
23132313
):
23142314
if isinstance(self.dtype, StringDtype):
2315+
if how in [
2316+
"prod",
2317+
"mean",
2318+
"median",
2319+
"cumsum",
2320+
"cumprod",
2321+
"std",
2322+
"sem",
2323+
"var",
2324+
"skew",
2325+
]:
2326+
raise TypeError(
2327+
f"dtype '{self.dtype}' does not support operation '{how}'"
2328+
)
23152329
return super()._groupby_op(
23162330
how=how,
23172331
has_dropped_na=has_dropped_na,

pandas/core/arrays/base.py

+14
Original file line number | Diff line number | Diff line change
@@ -2608,6 +2608,20 @@ def _groupby_op(
26082608
# GH#43682
26092609
if isinstance(self.dtype, StringDtype):
26102610
# StringArray
2611+
if op.how in [
2612+
"prod",
2613+
"mean",
2614+
"median",
2615+
"cumsum",
2616+
"cumprod",
2617+
"std",
2618+
"sem",
2619+
"var",
2620+
"skew",
2621+
]:
2622+
raise TypeError(
2623+
f"dtype '{self.dtype}' does not support operation '{how}'"
2624+
)
26112625
if op.how not in ["any", "all"]:
26122626
# Fail early to avoid conversion to object
26132627
op._get_cython_function(op.kind, op.how, np.dtype(object), False)

pandas/core/groupby/groupby.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -4162,9 +4162,9 @@ def quantile(
41624162
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)
41634163

41644164
def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
4165-
if is_object_dtype(vals.dtype):
4165+
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
41664166
raise TypeError(
4167-
"'quantile' cannot be performed against 'object' dtypes!"
4167+
f"dtype '{vals.dtype}' does not support operation 'quantile'"
41684168
)
41694169

41704170
inference: DtypeObj | None = None

pandas/tests/frame/test_stack_unstack.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -2113,7 +2113,7 @@ def test_unstack_period_frame(self):
21132113
@pytest.mark.filterwarnings(
21142114
"ignore:The previous implementation of stack is deprecated"
21152115
)
2116-
def test_stack_multiple_bug(self, future_stack):
2116+
def test_stack_multiple_bug(self, future_stack, using_infer_string):
21172117
# bug when some uniques are not present in the data GH#3170
21182118
id_col = ([1] * 3) + ([2] * 3)
21192119
name = (["a"] * 3) + (["b"] * 3)
@@ -2125,6 +2125,8 @@ def test_stack_multiple_bug(self, future_stack):
21252125
multi.columns.name = "Params"
21262126
unst = multi.unstack("ID")
21272127
msg = re.escape("agg function failed [how->mean,dtype->")
2128+
if using_infer_string:
2129+
msg = "dtype 'str' does not support operation 'mean'"
21282130
with pytest.raises(TypeError, match=msg):
21292131
unst.resample("W-THU").mean()
21302132
down = unst.resample("W-THU").mean(numeric_only=True)

pandas/tests/groupby/aggregate/test_cython.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -148,11 +148,11 @@ def test_cython_agg_return_dict():
148148

149149
def test_cython_fail_agg():
150150
dr = bdate_range("1/1/2000", periods=50)
151-
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
151+
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)
152152

153153
grouped = ts.groupby(lambda x: x.month)
154154
summed = grouped.sum()
155-
expected = grouped.agg(np.sum)
155+
expected = grouped.agg(np.sum).astype(object)
156156
tm.assert_series_equal(summed, expected)
157157

158158

pandas/tests/groupby/methods/test_quantile.py

+4-5
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
162162
def test_quantile_raises():
163163
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
164164

165-
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
165+
msg = "dtype 'object' does not support operation 'quantile'"
166+
with pytest.raises(TypeError, match=msg):
166167
df.groupby("key").quantile()
167168

168169

@@ -241,7 +242,6 @@ def test_groupby_quantile_nullable_array(values, q):
241242
tm.assert_series_equal(result, expected)
242243

243244

244-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
245245
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
246246
@pytest.mark.parametrize("numeric_only", [True, False])
247247
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
@@ -251,9 +251,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
251251
expected = df.groupby("a")[["b"]].quantile(q)
252252
tm.assert_frame_equal(result, expected)
253253
else:
254-
with pytest.raises(
255-
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
256-
):
254+
msg = "dtype '.*' does not support operation 'quantile'"
255+
with pytest.raises(TypeError, match=msg):
257256
df.groupby("a").quantile(q, numeric_only=numeric_only)
258257

259258

pandas/tests/groupby/test_groupby.py

+42-14
Original file line number | Diff line number | Diff line change
@@ -425,7 +425,7 @@ def test_frame_multi_key_function_list():
425425
tm.assert_frame_equal(agged, expected)
426426

427427

428-
def test_frame_multi_key_function_list_partial_failure():
428+
def test_frame_multi_key_function_list_partial_failure(using_infer_string):
429429
data = DataFrame(
430430
{
431431
"A": [
@@ -476,6 +476,8 @@ def test_frame_multi_key_function_list_partial_failure():
476476
grouped = data.groupby(["A", "B"])
477477
funcs = ["mean", "std"]
478478
msg = re.escape("agg function failed [how->mean,dtype->")
479+
if using_infer_string:
480+
msg = "dtype 'str' does not support operation 'mean'"
479481
with pytest.raises(TypeError, match=msg):
480482
grouped.agg(funcs)
481483

@@ -662,9 +664,11 @@ def test_groupby_multi_corner(df):
662664
tm.assert_frame_equal(agged, expected)
663665

664666

665-
def test_raises_on_nuisance(df):
667+
def test_raises_on_nuisance(df, using_infer_string):
666668
grouped = df.groupby("A")
667669
msg = re.escape("agg function failed [how->mean,dtype->")
670+
if using_infer_string:
671+
msg = "dtype 'str' does not support operation 'mean'"
668672
with pytest.raises(TypeError, match=msg):
669673
grouped.agg("mean")
670674
with pytest.raises(TypeError, match=msg):
@@ -699,15 +703,18 @@ def test_keep_nuisance_agg(df, agg_function):
699703
["sum", "mean", "prod", "std", "var", "sem", "median"],
700704
)
701705
@pytest.mark.parametrize("numeric_only", [True, False])
702-
def test_omit_nuisance_agg(df, agg_function, numeric_only):
706+
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
703707
# GH 38774, GH 38815
704708
grouped = df.groupby("A")
705709

706710
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
707711
if agg_function in no_drop_nuisance and not numeric_only:
708712
# Added numeric_only as part of GH#46560; these do not drop nuisance
709713
# columns when numeric_only is False
710-
if agg_function in ("std", "sem"):
714+
if using_infer_string:
715+
msg = f"dtype 'str' does not support operation '{agg_function}'"
716+
klass = TypeError
717+
elif agg_function in ("std", "sem"):
711718
klass = ValueError
712719
msg = "could not convert string to float: 'one'"
713720
else:
@@ -728,16 +735,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
728735
tm.assert_frame_equal(result, expected)
729736

730737

731-
def test_raise_on_nuisance_python_single(df):
738+
def test_raise_on_nuisance_python_single(df, using_infer_string):
732739
# GH 38815
733740
grouped = df.groupby("A")
734-
with pytest.raises(ValueError, match="could not convert"):
741+
742+
err = ValueError
743+
msg = "could not convert"
744+
if using_infer_string:
745+
err = TypeError
746+
msg = "dtype 'str' does not support operation 'skew'"
747+
with pytest.raises(err, match=msg):
735748
grouped.skew()
736749

737750

738-
def test_raise_on_nuisance_python_multiple(three_group):
751+
def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
739752
grouped = three_group.groupby(["A", "B"])
740753
msg = re.escape("agg function failed [how->mean,dtype->")
754+
if using_infer_string:
755+
msg = "dtype 'str' does not support operation 'mean'"
741756
with pytest.raises(TypeError, match=msg):
742757
grouped.agg("mean")
743758
with pytest.raises(TypeError, match=msg):
@@ -775,12 +790,16 @@ def test_nonsense_func():
775790
df.groupby(lambda x: x + "foo")
776791

777792

778-
def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
793+
def test_wrap_aggregated_output_multindex(
794+
multiindex_dataframe_random_data, using_infer_string
795+
):
779796
df = multiindex_dataframe_random_data.T
780797
df["baz", "two"] = "peekaboo"
781798

782799
keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
783800
msg = re.escape("agg function failed [how->mean,dtype->")
801+
if using_infer_string:
802+
msg = "dtype 'str' does not support operation 'mean'"
784803
with pytest.raises(TypeError, match=msg):
785804
df.groupby(keys).agg("mean")
786805
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
@@ -960,8 +979,10 @@ def test_groupby_with_hier_columns():
960979

961980
def test_grouping_ndarray(df):
962981
grouped = df.groupby(df["A"].values)
982+
grouped2 = df.groupby(df["A"].rename(None))
983+
963984
result = grouped.sum()
964-
expected = df.groupby(df["A"].rename(None)).sum()
985+
expected = grouped2.sum()
965986
tm.assert_frame_equal(result, expected)
966987

967988

@@ -1457,8 +1478,8 @@ def test_no_dummy_key_names(df):
14571478
result = df.groupby(df["A"].values).sum()
14581479
assert result.index.name is None
14591480

1460-
result = df.groupby([df["A"].values, df["B"].values]).sum()
1461-
assert result.index.names == (None, None)
1481+
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
1482+
assert result2.index.names == (None, None)
14621483

14631484

14641485
def test_groupby_sort_multiindex_series():
@@ -1761,6 +1782,7 @@ def get_categorical_invalid_expected():
17611782
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
17621783
is_dt64 = df.dtypes.iloc[0].kind == "M"
17631784
is_cat = isinstance(values, Categorical)
1785+
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
17641786

17651787
if (
17661788
isinstance(values, Categorical)
@@ -1785,13 +1807,15 @@ def get_categorical_invalid_expected():
17851807

17861808
if op in ["prod", "sum", "skew"]:
17871809
# ops that require more than just ordered-ness
1788-
if is_dt64 or is_cat or is_per:
1810+
if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
17891811
# GH#41291
17901812
# datetime64 -> prod and sum are invalid
17911813
if is_dt64:
17921814
msg = "datetime64 type does not support"
17931815
elif is_per:
17941816
msg = "Period type does not support"
1817+
elif is_str:
1818+
msg = f"dtype 'str' does not support operation '{op}'"
17951819
else:
17961820
msg = "category type does not support"
17971821
if op == "skew":
@@ -2714,7 +2738,7 @@ def test_obj_with_exclusions_duplicate_columns():
27142738
def test_groupby_numeric_only_std_no_result(numeric_only):
27152739
# GH 51080
27162740
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
2717-
df = DataFrame(dicts_non_numeric)
2741+
df = DataFrame(dicts_non_numeric, dtype=object)
27182742
dfgb = df.groupby("a", as_index=False, sort=False)
27192743

27202744
if numeric_only:
@@ -2773,10 +2797,14 @@ def test_grouping_with_categorical_interval_columns():
27732797
def test_groupby_sum_on_nan_should_return_nan(bug_var):
27742798
# GH 24196
27752799
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
2800+
if isinstance(bug_var, str):
2801+
df = df.astype(object)
27762802
dfgb = df.groupby(lambda x: x)
27772803
result = dfgb.sum(min_count=1)
27782804

2779-
expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
2805+
expected_df = DataFrame(
2806+
[bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
2807+
)
27802808
tm.assert_frame_equal(result, expected_df)
27812809

27822810

pandas/tests/groupby/test_groupby_subclass.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj):
109109

110110
df = obj(
111111
{
112-
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
112+
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
113113
"Quantity": [18, 3, 5, 1, 9, 3],
114114
"Date": [
115115
datetime(2013, 9, 1, 13, 0),

pandas/tests/groupby/test_numeric_only.py

+14-6
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,8 @@ def df(self):
2828
"group": [1, 1, 2],
2929
"int": [1, 2, 3],
3030
"float": [4.0, 5.0, 6.0],
31-
"string": list("abc"),
31+
"string": Series(["a", "b", "c"], dtype="str"),
32+
"object": Series(["a", "b", "c"], dtype=object),
3233
"category_string": Series(list("abc")).astype("category"),
3334
"category_int": [7, 8, 9],
3435
"datetime": date_range("20130101", periods=3),
@@ -40,6 +41,7 @@ def df(self):
4041
"int",
4142
"float",
4243
"string",
44+
"object",
4345
"category_string",
4446
"category_int",
4547
"datetime",
@@ -112,6 +114,7 @@ def test_first_last(self, df, method):
112114
"int",
113115
"float",
114116
"string",
117+
"object",
115118
"category_string",
116119
"category_int",
117120
"datetime",
@@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
159162

160163
# object dtypes for transformations are not implemented in Cython and
161164
# have no Python fallback
162-
exception = NotImplementedError if method.startswith("cum") else TypeError
165+
exception = (
166+
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
167+
)
163168

164169
if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
165170
# The methods default to numeric_only=False and raise TypeError
@@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
170175
re.escape(f"agg function failed [how->{method},dtype->object]"),
171176
# cumsum/cummin/cummax/cumprod
172177
"function is not implemented for this dtype",
178+
f"dtype 'str' does not support operation '{method}'",
173179
]
174180
)
175181
with pytest.raises(exception, match=msg):
@@ -180,7 +186,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
180186
"category type does not support sum operations",
181187
re.escape(f"agg function failed [how->{method},dtype->object]"),
182188
re.escape(f"agg function failed [how->{method},dtype->string]"),
183-
re.escape(f"agg function failed [how->{method},dtype->str]"),
189+
f"dtype 'str' does not support operation '{method}'",
184190
]
185191
)
186192
with pytest.raises(exception, match=msg):
@@ -198,7 +204,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
198204
f"Cannot perform {method} with non-ordered Categorical",
199205
re.escape(f"agg function failed [how->{method},dtype->object]"),
200206
re.escape(f"agg function failed [how->{method},dtype->string]"),
201-
re.escape(f"agg function failed [how->{method},dtype->str]"),
207+
f"dtype 'str' does not support operation '{method}'",
202208
]
203209
)
204210
with pytest.raises(exception, match=msg):
@@ -299,7 +305,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
299305
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
300306
]
301307
)
302-
if kernel == "idxmin":
308+
if kernel == "quantile":
309+
msg = "dtype 'object' does not support operation 'quantile'"
310+
elif kernel == "idxmin":
303311
msg = "'<' not supported between instances of 'type' and 'type'"
304312
elif kernel == "idxmax":
305313
msg = "'>' not supported between instances of 'type' and 'type'"
@@ -379,7 +387,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
379387
# that succeed should not be allowed to fail (without deprecation, at least)
380388
if groupby_func in fails_on_numeric_object and dtype is object:
381389
if groupby_func == "quantile":
382-
msg = "cannot be performed against 'object' dtypes"
390+
msg = "dtype 'object' does not support operation 'quantile'"
383391
else:
384392
msg = "is not supported for object dtype"
385393
with pytest.raises(TypeError, match=msg):

0 commit comments

Comments
 (0)