Skip to content

Commit 695b170

Browse files
authored
TST: Clean groupby tests (#58797)
1 parent 1220198 commit 695b170

File tree

2 files changed

+86
-50
lines changed

2 files changed

+86
-50
lines changed

pandas/tests/groupby/test_apply.py

+34 -21
Original file line number | Diff line number | Diff line change
@@ -322,6 +322,8 @@ def test_groupby_as_index_apply():
322322
tm.assert_index_equal(res_as_apply, exp_as_apply)
323323
tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
324324

325+
326+
def test_groupby_as_index_apply_str():
325327
ind = Index(list("abcde"))
326328
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
327329
msg = "DataFrameGroupBy.apply operated on the grouping columns"
@@ -379,8 +381,8 @@ def f(piece):
379381
{"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
380382
)
381383

382-
dr = bdate_range("1/1/2000", periods=100)
383-
ts = Series(np.random.default_rng(2).standard_normal(100), index=dr)
384+
dr = bdate_range("1/1/2000", periods=10)
385+
ts = Series(np.random.default_rng(2).standard_normal(10), index=dr)
384386

385387
grouped = ts.groupby(lambda x: x.month, group_keys=False)
386388
result = grouped.apply(f)
@@ -639,13 +641,13 @@ def reindex_helper(x):
639641
def test_apply_corner_cases():
640642
# #535, can't use sliding iterator
641643

642-
N = 1000
644+
N = 10
643645
labels = np.random.default_rng(2).integers(0, 100, size=N)
644646
df = DataFrame(
645647
{
646648
"key": labels,
647649
"value1": np.random.default_rng(2).standard_normal(N),
648-
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
650+
"value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5),
649651
}
650652
)
651653

@@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime():
680682
result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
681683
tm.assert_series_equal(result["Str"], expected["Str"])
682684

685+
686+
def test_apply_numeric_coercion_when_datetime_getitem():
683687
# GH 15421
684688
df = DataFrame(
685689
{"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
@@ -695,6 +699,8 @@ def get_B(g):
695699
expected.index = df.A
696700
tm.assert_series_equal(result, expected)
697701

702+
703+
def test_apply_numeric_coercion_when_datetime_with_nat():
698704
# GH 14423
699705
def predictions(tool):
700706
out = Series(index=["p1", "p2", "useTime"], dtype=object)
@@ -843,10 +849,24 @@ def test_func(x):
843849
tm.assert_frame_equal(result, expected)
844850

845851

846-
def test_groupby_apply_none_first():
852+
@pytest.mark.parametrize(
853+
"in_data, out_idx, out_data",
854+
[
855+
[
856+
{"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]},
857+
[[1, 1], [0, 2]],
858+
{"groups": [1, 1], "vars": [0, 2]},
859+
],
860+
[
861+
{"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]},
862+
[[2, 2], [1, 3]],
863+
{"groups": [2, 2], "vars": [1, 3]},
864+
],
865+
],
866+
)
867+
def test_groupby_apply_none_first(in_data, out_idx, out_data):
847868
# GH 12824. Tests if apply returns None first.
848-
test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
849-
test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
869+
test_df1 = DataFrame(in_data)
850870

851871
def test_func(x):
852872
if x.shape[0] < 2:
@@ -856,14 +876,9 @@ def test_func(x):
856876
msg = "DataFrameGroupBy.apply operated on the grouping columns"
857877
with tm.assert_produces_warning(DeprecationWarning, match=msg):
858878
result1 = test_df1.groupby("groups").apply(test_func)
859-
with tm.assert_produces_warning(DeprecationWarning, match=msg):
860-
result2 = test_df2.groupby("groups").apply(test_func)
861-
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
862-
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
863-
expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
864-
expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
879+
index1 = MultiIndex.from_arrays(out_idx, names=["groups", None])
880+
expected1 = DataFrame(out_data, index=index1)
865881
tm.assert_frame_equal(result1, expected1)
866-
tm.assert_frame_equal(result2, expected2)
867882

868883

869884
def test_groupby_apply_return_empty_chunk():
@@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk():
883898
tm.assert_series_equal(result, expected)
884899

885900

886-
def test_apply_with_mixed_types():
901+
@pytest.mark.parametrize("meth", ["apply", "transform"])
902+
def test_apply_with_mixed_types(meth):
887903
# gh-20949
888904
df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
889905
g = df.groupby("A", group_keys=False)
890906

891-
result = g.transform(lambda x: x / x.sum())
907+
result = getattr(g, meth)(lambda x: x / x.sum())
892908
expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
893909
tm.assert_frame_equal(result, expected)
894910

895-
result = g.apply(lambda x: x / x.sum())
896-
tm.assert_frame_equal(result, expected)
897-
898911

899912
def test_func_returns_object():
900913
# GH 28652
@@ -1106,7 +1119,7 @@ def test_apply_function_with_indexing_return_column():
11061119

11071120
@pytest.mark.parametrize(
11081121
"udf",
1109-
[(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
1122+
[lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)],
11101123
)
11111124
@pytest.mark.parametrize("group_keys", [True, False])
11121125
def test_apply_result_type(group_keys, udf):
@@ -1214,7 +1227,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
12141227
expected = df.iloc[[0, 2, 3]]
12151228
expected = expected.reset_index()
12161229
expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
1217-
expected = expected.drop(columns="idx")
1230+
expected = expected.drop(columns=["idx"])
12181231

12191232
tm.assert_frame_equal(result, expected)
12201233
for val in result.index.levels[1]:

pandas/tests/groupby/test_categorical.py

+52 -29
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ def get_stats(group):
8282
assert result.index.names[0] == "C"
8383

8484

85-
def test_basic(using_infer_string): # TODO: split this test
85+
def test_basic():
8686
cats = Categorical(
8787
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
8888
categories=["a", "b", "c", "d"],
@@ -95,17 +95,20 @@ def test_basic(using_infer_string): # TODO: split this test
9595
result = data.groupby("b", observed=False).mean()
9696
tm.assert_frame_equal(result, expected)
9797

98+
99+
def test_basic_single_grouper():
98100
cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
99101
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
100102
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
101103

102-
# single grouper
103104
gb = df.groupby("A", observed=False)
104105
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
105106
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
106107
result = gb.sum(numeric_only=True)
107108
tm.assert_frame_equal(result, expected)
108109

110+
111+
def test_basic_string(using_infer_string):
109112
# GH 8623
110113
x = DataFrame(
111114
[[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
@@ -133,8 +136,9 @@ def f(x):
133136
expected["person_name"] = expected["person_name"].astype(dtype)
134137
tm.assert_frame_equal(result, expected)
135138

139+
140+
def test_basic_monotonic():
136141
# GH 9921
137-
# Monotonic
138142
df = DataFrame({"a": [5, 15, 25]})
139143
c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
140144

@@ -165,7 +169,8 @@ def f(x):
165169
tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
166170
tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)
167171

168-
# Non-monotonic
172+
173+
def test_basic_non_monotonic():
169174
df = DataFrame({"a": [5, 15, 25, -5]})
170175
c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
171176

@@ -183,6 +188,8 @@ def f(x):
183188
df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
184189
)
185190

191+
192+
def test_basic_cut_grouping():
186193
# GH 9603
187194
df = DataFrame({"a": [1, 0, 0, 0]})
188195
c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
@@ -193,13 +200,14 @@ def f(x):
193200
expected.index.name = "a"
194201
tm.assert_series_equal(result, expected)
195202

196-
# more basic
203+
204+
def test_more_basic():
197205
levels = ["foo", "bar", "baz", "qux"]
198-
codes = np.random.default_rng(2).integers(0, 4, size=100)
206+
codes = np.random.default_rng(2).integers(0, 4, size=10)
199207

200208
cats = Categorical.from_codes(codes, levels, ordered=True)
201209

202-
data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
210+
data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
203211

204212
result = data.groupby(cats, observed=False).mean()
205213

@@ -225,9 +233,9 @@ def f(x):
225233
# GH 10460
226234
expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
227235
exp = CategoricalIndex(expc)
228-
tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
236+
tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp)
229237
exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
230-
tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
238+
tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp)
231239

232240

233241
def test_level_get_group(observed):
@@ -352,6 +360,8 @@ def test_observed(observed):
352360

353361
tm.assert_frame_equal(result, expected)
354362

363+
364+
def test_observed_single_column(observed):
355365
# https://github.com/pandas-dev/pandas/issues/8138
356366
d = {
357367
"cat": Categorical(
@@ -362,7 +372,6 @@ def test_observed(observed):
362372
}
363373
df = DataFrame(d)
364374

365-
# Grouping on a single column
366375
groups_single_key = df.groupby("cat", observed=observed)
367376
result = groups_single_key.mean()
368377

@@ -378,7 +387,17 @@ def test_observed(observed):
378387

379388
tm.assert_frame_equal(result, expected)
380389

381-
# Grouping on two columns
390+
391+
def test_observed_two_columns(observed):
392+
# https://github.com/pandas-dev/pandas/issues/8138
393+
d = {
394+
"cat": Categorical(
395+
["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
396+
),
397+
"ints": [1, 1, 2, 2],
398+
"val": [10, 20, 30, 40],
399+
}
400+
df = DataFrame(d)
382401
groups_double_key = df.groupby(["cat", "ints"], observed=observed)
383402
result = groups_double_key.agg("mean")
384403
expected = DataFrame(
@@ -404,6 +423,8 @@ def test_observed(observed):
404423
expected = df[(df.cat == c) & (df.ints == i)]
405424
tm.assert_frame_equal(result, expected)
406425

426+
427+
def test_observed_with_as_index(observed):
407428
# gh-8869
408429
# with as_index
409430
d = {
@@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed):
591612

592613

593614
@pytest.mark.parametrize("ordered", [True, False])
594-
@pytest.mark.parametrize("observed", [True, False])
595615
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
596616
# GH 25871: Fix groupby sorting on ordered Categoricals
597617
# GH 25167: Groupby with observed=True doesn't sort
@@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
627647
def test_datetime():
628648
# GH9049: ensure backward compatibility
629649
levels = pd.date_range("2014-01-01", periods=4)
630-
codes = np.random.default_rng(2).integers(0, 4, size=100)
650+
codes = np.random.default_rng(2).integers(0, 4, size=10)
631651

632652
cats = Categorical.from_codes(codes, levels, ordered=True)
633653

634-
data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
654+
data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
635655
result = data.groupby(cats, observed=False).mean()
636656

637657
expected = data.groupby(np.asarray(cats), observed=False).mean()
@@ -832,7 +852,10 @@ def test_preserve_categories():
832852
df.groupby("A", sort=False, observed=False).first().index, nosort_index
833853
)
834854

835-
# ordered=False
855+
856+
def test_preserve_categories_ordered_false():
857+
# GH-13179
858+
categories = list("abc")
836859
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
837860
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
838861
# GH#48749 - don't change order of categories
@@ -846,7 +869,8 @@ def test_preserve_categories():
846869
)
847870

848871

849-
def test_preserve_categorical_dtype():
872+
@pytest.mark.parametrize("col", ["C1", "C2"])
873+
def test_preserve_categorical_dtype(col):
850874
# GH13743, GH13854
851875
df = DataFrame(
852876
{
@@ -865,18 +889,15 @@ def test_preserve_categorical_dtype():
865889
"C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
866890
}
867891
)
868-
for col in ["C1", "C2"]:
869-
result1 = df.groupby(by=col, as_index=False, observed=False).mean(
870-
numeric_only=True
871-
)
872-
result2 = (
873-
df.groupby(by=col, as_index=True, observed=False)
874-
.mean(numeric_only=True)
875-
.reset_index()
876-
)
877-
expected = exp_full.reindex(columns=result1.columns)
878-
tm.assert_frame_equal(result1, expected)
879-
tm.assert_frame_equal(result2, expected)
892+
result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True)
893+
result2 = (
894+
df.groupby(by=col, as_index=True, observed=False)
895+
.mean(numeric_only=True)
896+
.reset_index()
897+
)
898+
expected = exp_full.reindex(columns=result1.columns)
899+
tm.assert_frame_equal(result1, expected)
900+
tm.assert_frame_equal(result2, expected)
880901

881902

882903
@pytest.mark.parametrize(
@@ -931,6 +952,8 @@ def test_categorical_no_compress():
931952
)
932953
tm.assert_series_equal(result, exp)
933954

955+
956+
def test_categorical_no_compress_string():
934957
cats = Categorical(
935958
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
936959
categories=["a", "b", "c", "d"],
@@ -965,7 +988,7 @@ def test_sort():
965988
# has a sorted x axis
966989
# self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
967990

968-
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
991+
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)})
969992
labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
970993
cat_labels = Categorical(labels, labels)
971994

0 commit comments

Comments
 (0)