Skip to content

Commit 280a91e

Browse files
TST (string dtype): resolve all easy xfails in pandas/tests/groupby (pandas-dev#60314)
(cherry picked from commit c4a2026)
1 parent 54b47df commit 280a91e

13 files changed

+30
-53
lines changed

pandas/tests/groupby/aggregate/test_aggregate.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas.errors import SpecificationError
1513

1614
from pandas.core.dtypes.common import is_integer_dtype
@@ -335,12 +333,11 @@ def aggfun_1(ser):
335333
assert len(result) == 0
336334

337335

338-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
339336
def test_wrap_agg_out(three_group):
340337
grouped = three_group.groupby(["A", "B"])
341338

342339
def func(ser):
343-
if ser.dtype == object:
340+
if ser.dtype in (object, "string"):
344341
raise TypeError("Test error message")
345342
return ser.sum()
346343

@@ -1101,7 +1098,6 @@ def test_lambda_named_agg(func):
11011098
tm.assert_frame_equal(result, expected)
11021099

11031100

1104-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
11051101
def test_aggregate_mixed_types():
11061102
# GH 16916
11071103
df = DataFrame(
@@ -1113,7 +1109,7 @@ def test_aggregate_mixed_types():
11131109
expected = DataFrame(
11141110
expected_data,
11151111
index=Index([2, "group 1"], dtype="object", name="grouping"),
1116-
columns=Index(["X", "Y", "Z"], dtype="object"),
1112+
columns=Index(["X", "Y", "Z"]),
11171113
)
11181114
tm.assert_frame_equal(result, expected)
11191115

pandas/tests/groupby/aggregate/test_cython.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
from pandas.core.dtypes.common import (
119
is_float_dtype,
1210
is_integer_dtype,
@@ -95,7 +93,6 @@ def test_cython_agg_boolean():
9593
tm.assert_series_equal(result, expected)
9694

9795

98-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
9996
def test_cython_agg_nothing_to_agg():
10097
frame = DataFrame(
10198
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
@@ -111,7 +108,9 @@ def test_cython_agg_nothing_to_agg():
111108

112109
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
113110
expected = DataFrame(
114-
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
111+
[],
112+
index=frame["a"].sort_values().drop_duplicates(),
113+
columns=Index([], dtype="str"),
115114
)
116115
tm.assert_frame_equal(result, expected)
117116

pandas/tests/groupby/aggregate/test_other.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas.errors import SpecificationError
1412

1513
import pandas as pd
@@ -308,7 +306,6 @@ def test_series_agg_multikey():
308306
tm.assert_series_equal(result, expected)
309307

310308

311-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
312309
def test_series_agg_multi_pure_python():
313310
data = DataFrame(
314311
{
@@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python():
358355
)
359356

360357
def bad(x):
361-
assert len(x.values.base) > 0
358+
if isinstance(x.values, np.ndarray):
359+
assert len(x.values.base) > 0
362360
return "foo"
363361

364362
result = data.groupby(["A", "B"]).agg(bad)

pandas/tests/groupby/methods/test_quantile.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
import pandas as pd
75
from pandas import (
86
DataFrame,
@@ -170,11 +168,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
170168
tm.assert_frame_equal(result, expected)
171169

172170

173-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
174171
def test_quantile_raises():
175172
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
176173

177-
msg = "dtype 'object' does not support operation 'quantile'"
174+
msg = "dtype '(object|str)' does not support operation 'quantile'"
178175
with pytest.raises(TypeError, match=msg):
179176
df.groupby("key").quantile()
180177

pandas/tests/groupby/methods/test_size.py

+2
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ def test_size_series_masked_type_returns_Int64(dtype):
108108
tm.assert_series_equal(result, expected)
109109

110110

111+
# TODO(infer_string) in case the column is object dtype, it should preserve that dtype
112+
# for the result's index
111113
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
112114
def test_size_strings(any_string_dtype):
113115
# GH#55627

pandas/tests/groupby/test_categorical.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
from pandas import (
108
Categorical,
@@ -340,15 +338,18 @@ def test_apply(ordered):
340338
tm.assert_series_equal(result, expected)
341339

342340

343-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
344-
def test_observed(observed):
341+
def test_observed(request, using_infer_string, observed):
345342
# multiple groupers, don't re-expand the output space
346343
# of the grouper
347344
# gh-14942 (implement)
348345
# gh-10132 (back-compat)
349346
# gh-8138 (back-compat)
350347
# gh-8869
351348

349+
if using_infer_string and not observed:
350+
# TODO(infer_string) this fails with filling the string column with 0
351+
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
352+
352353
cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
353354
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
354355
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

pandas/tests/groupby/test_groupby.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -1617,7 +1617,6 @@ def test_groupby_two_group_keys_all_nan():
16171617
assert result == {}
16181618

16191619

1620-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
16211620
def test_groupby_2d_malformed():
16221621
d = DataFrame(index=range(2))
16231622
d["group"] = ["g1", "g2"]
@@ -1626,7 +1625,7 @@ def test_groupby_2d_malformed():
16261625
d["label"] = ["l1", "l2"]
16271626
tmp = d.groupby(["group"]).mean(numeric_only=True)
16281627
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
1629-
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
1628+
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
16301629
tm.assert_numpy_array_equal(tmp.values, res_values)
16311630

16321631

@@ -2711,7 +2710,6 @@ def test_groupby_all_nan_groups_drop():
27112710
tm.assert_series_equal(result, expected)
27122711

27132712

2714-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
27152713
@pytest.mark.parametrize("numeric_only", [True, False])
27162714
def test_groupby_empty_multi_column(as_index, numeric_only):
27172715
# GH 15106 & GH 41998
@@ -2720,15 +2718,14 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
27202718
result = gb.sum(numeric_only=numeric_only)
27212719
if as_index:
27222720
index = MultiIndex([[], []], [[], []], names=["A", "B"])
2723-
columns = ["C"] if not numeric_only else []
2721+
columns = ["C"] if not numeric_only else Index([], dtype="str")
27242722
else:
27252723
index = RangeIndex(0)
27262724
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
27272725
expected = DataFrame([], columns=columns, index=index)
27282726
tm.assert_frame_equal(result, expected)
27292727

27302728

2731-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
27322729
def test_groupby_aggregation_non_numeric_dtype():
27332730
# GH #43108
27342731
df = DataFrame(
@@ -2739,7 +2736,7 @@ def test_groupby_aggregation_non_numeric_dtype():
27392736
{
27402737
"v": [[1, 1], [10, 20]],
27412738
},
2742-
index=Index(["M", "W"], dtype="object", name="MW"),
2739+
index=Index(["M", "W"], name="MW"),
27432740
)
27442741

27452742
gb = df.groupby(by=["MW"])

pandas/tests/groupby/test_groupby_dropna.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas.compat.pyarrow import pa_version_under10p1
75

86
from pandas.core.dtypes.missing import na_value_for_dtype
@@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
9997
tm.assert_frame_equal(grouped, expected)
10098

10199

102-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
103100
@pytest.mark.parametrize(
104101
"dropna, idx, outputs",
105102
[
@@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
126123
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
127124
grouped = df.groupby("a", dropna=dropna).sum()
128125

129-
expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
126+
expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))
130127

131128
tm.assert_frame_equal(grouped, expected)
132129

pandas/tests/groupby/test_grouping.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
import pandas as pd
1513
from pandas import (
1614
CategoricalIndex,
@@ -844,7 +842,6 @@ def test_groupby_empty(self):
844842
expected = ["name"]
845843
assert result == expected
846844

847-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
848845
def test_groupby_level_index_value_all_na(self):
849846
# issue 20519
850847
df = DataFrame(
@@ -854,7 +851,7 @@ def test_groupby_level_index_value_all_na(self):
854851
expected = DataFrame(
855852
data=[],
856853
index=MultiIndex(
857-
levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
854+
levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
858855
codes=[[], []],
859856
names=["A", "B"],
860857
),
@@ -989,12 +986,13 @@ def test_groupby_with_empty(self):
989986
grouped = series.groupby(grouper)
990987
assert next(iter(grouped), None) is None
991988

992-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
993989
def test_groupby_with_single_column(self):
994990
df = DataFrame({"a": list("abssbab")})
995991
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
996992
# GH 13530
997-
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
993+
exp = DataFrame(
994+
index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
995+
)
998996
tm.assert_frame_equal(df.groupby("a").count(), exp)
999997
tm.assert_frame_equal(df.groupby("a").sum(), exp)
1000998

pandas/tests/groupby/test_pipe.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
import numpy as np
2-
import pytest
3-
4-
from pandas._config import using_string_dtype
52

63
import pandas as pd
74
from pandas import (
@@ -11,7 +8,6 @@
118
import pandas._testing as tm
129

1310

14-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1511
def test_pipe():
1612
# Test the pipe method of DataFrameGroupBy.
1713
# Issue #17871
@@ -39,7 +35,7 @@ def square(srs):
3935
# NDFrame.pipe methods
4036
result = df.groupby("A").pipe(f).pipe(square)
4137

42-
index = Index(["bar", "foo"], dtype="object", name="A")
38+
index = Index(["bar", "foo"], name="A")
4339
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
4440

4541
tm.assert_series_equal(expected, result)

pandas/tests/groupby/test_reductions.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
from pandas._libs.tslibs import iNaT
119

1210
from pandas.core.dtypes.common import pandas_dtype
@@ -457,8 +455,7 @@ def test_max_min_non_numeric():
457455
assert "ss" in result
458456

459457

460-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
461-
def test_max_min_object_multiple_columns(using_array_manager):
458+
def test_max_min_object_multiple_columns(using_array_manager, using_infer_string):
462459
# GH#41111 case where the aggregation is valid for some columns but not
463460
# others; we split object blocks column-wise, consistent with
464461
# DataFrame._reduce
@@ -472,7 +469,7 @@ def test_max_min_object_multiple_columns(using_array_manager):
472469
)
473470
df._consolidate_inplace() # should already be consolidate, but double-check
474471
if not using_array_manager:
475-
assert len(df._mgr.blocks) == 2
472+
assert len(df._mgr.blocks) == (3 if using_infer_string else 2)
476473

477474
gb = df.groupby("A")
478475

pandas/tests/groupby/test_timegrouper.py

+2
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
7575

7676

7777
class TestGroupBy:
78+
# TODO(infer_string) resample sum introduces 0's
79+
# https://github.com/pandas-dev/pandas/issues/60229
7880
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
7981
def test_groupby_with_timegrouper(self):
8082
# GH 4161

pandas/tests/groupby/transform/test_transform.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
import numpy as np
33
import pytest
44

5-
from pandas._config import using_string_dtype
6-
75
from pandas._libs import lib
86

97
from pandas.core.dtypes.common import ensure_platform_int
@@ -1229,20 +1227,19 @@ def test_groupby_transform_with_datetimes(func, values):
12291227
tm.assert_series_equal(result, expected)
12301228

12311229

1232-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
12331230
def test_groupby_transform_dtype():
12341231
# GH 22243
12351232
df = DataFrame({"a": [1], "val": [1.35]})
12361233

12371234
result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
1238-
expected1 = Series(["+1.35"], name="val", dtype="object")
1235+
expected1 = Series(["+1.35"], name="val")
12391236
tm.assert_series_equal(result, expected1)
12401237

12411238
result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
12421239
tm.assert_series_equal(result, expected1)
12431240

12441241
result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
1245-
expected2 = Series(["+(1.35)"], name="val", dtype="object")
1242+
expected2 = Series(["+(1.35)"], name="val")
12461243
tm.assert_series_equal(result, expected2)
12471244

12481245
df["val"] = df["val"].astype(object)

0 commit comments

Comments
 (0)