Skip to content

Commit 03d86d6

Browse files
authored
CLN: groupby test (pandas-dev#58777)
* Clean test_cumulative
* Clean test_counting
* Clean test_filters
* Undo change
1 parent: 6a7b3da; commit: 03d86d6

File tree

3 files changed: 76 additions (+76) and 63 deletions (-63).

pandas/tests/groupby/test_counting.py

+10 -8
Original file line number | Diff line number | Diff line change
@@ -321,31 +321,33 @@ def test_count_object():
321321
expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
322322
tm.assert_series_equal(result, expected)
323323

324+
325+
def test_count_object_nan():
324326
df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
325327
result = df.groupby("c").a.count()
326328
expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
327329
tm.assert_series_equal(result, expected)
328330

329331

330-
def test_count_cross_type():
332+
@pytest.mark.parametrize("typ", ["object", "float32"])
333+
def test_count_cross_type(typ):
331334
# GH8169
332335
# Set float64 dtype to avoid upcast when setting nan below
333336
vals = np.hstack(
334337
(
335-
np.random.default_rng(2).integers(0, 5, (100, 2)),
336-
np.random.default_rng(2).integers(0, 2, (100, 2)),
338+
np.random.default_rng(2).integers(0, 5, (10, 2)),
339+
np.random.default_rng(2).integers(0, 2, (10, 2)),
337340
)
338341
).astype("float64")
339342

340343
df = DataFrame(vals, columns=["a", "b", "c", "d"])
341344
df[df == 2] = np.nan
342345
expected = df.groupby(["c", "d"]).count()
343346

344-
for t in ["float32", "object"]:
345-
df["a"] = df["a"].astype(t)
346-
df["b"] = df["b"].astype(t)
347-
result = df.groupby(["c", "d"]).count()
348-
tm.assert_frame_equal(result, expected)
347+
df["a"] = df["a"].astype(typ)
348+
df["b"] = df["b"].astype(typ)
349+
result = df.groupby(["c", "d"]).count()
350+
tm.assert_frame_equal(result, expected)
349351

350352

351353
def test_lower_int_prec_count():

pandas/tests/groupby/test_cumulative.py

+38 -12
Original file line number | Diff line number | Diff line change
@@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns():
9494

9595
def test_cummin(dtypes_for_minmax):
9696
dtype = dtypes_for_minmax[0]
97-
min_val = dtypes_for_minmax[1]
9897

9998
# GH 15048
10099
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
101100
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
102101

103102
df = base_df.astype(dtype)
104-
105103
expected = DataFrame({"B": expected_mins}).astype(dtype)
106104
result = df.groupby("A").cummin()
107105
tm.assert_frame_equal(result, expected)
108106
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
109107
tm.assert_frame_equal(result, expected)
110108

111-
# Test w/ min value for dtype
109+
110+
def test_cummin_min_value_for_dtype(dtypes_for_minmax):
111+
dtype = dtypes_for_minmax[0]
112+
min_val = dtypes_for_minmax[1]
113+
114+
# GH 15048
115+
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
116+
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
117+
expected = DataFrame({"B": expected_mins}).astype(dtype)
118+
df = base_df.astype(dtype)
112119
df.loc[[2, 6], "B"] = min_val
113120
df.loc[[1, 5], "B"] = min_val + 1
114121
expected.loc[[2, 3, 6, 7], "B"] = min_val
@@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax):
120127
)
121128
tm.assert_frame_equal(result, expected, check_exact=True)
122129

123-
# Test nan in some values
130+
131+
def test_cummin_nan_in_some_values(dtypes_for_minmax):
124132
# Explicit cast to float to avoid implicit cast when setting nan
133+
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
125134
base_df = base_df.astype({"B": "float"})
126135
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
127136
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
@@ -132,13 +141,17 @@ def test_cummin(dtypes_for_minmax):
132141
)
133142
tm.assert_frame_equal(result, expected)
134143

144+
145+
def test_cummin_datetime():
135146
# GH 15561
136147
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
137148
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
138149

139150
result = df.groupby("a")["b"].cummin()
140151
tm.assert_series_equal(expected, result)
141152

153+
154+
def test_cummin_getattr_series():
142155
# GH 15635
143156
df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
144157
result = df.groupby("a").b.cummin()
@@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype):
163176

164177
def test_cummax(dtypes_for_minmax):
165178
dtype = dtypes_for_minmax[0]
166-
max_val = dtypes_for_minmax[2]
167179

168180
# GH 15048
169181
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax):
177189
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
178190
tm.assert_frame_equal(result, expected)
179191

180-
# Test w/ max value for dtype
192+
193+
def test_cummax_min_value_for_dtype(dtypes_for_minmax):
194+
dtype = dtypes_for_minmax[0]
195+
max_val = dtypes_for_minmax[2]
196+
197+
# GH 15048
198+
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
199+
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
200+
201+
df = base_df.astype(dtype)
181202
df.loc[[2, 6], "B"] = max_val
203+
expected = DataFrame({"B": expected_maxs}).astype(dtype)
182204
expected.loc[[2, 3, 6, 7], "B"] = max_val
183205
result = df.groupby("A").cummax()
184206
tm.assert_frame_equal(result, expected)
@@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax):
187209
)
188210
tm.assert_frame_equal(result, expected)
189211

212+
213+
def test_cummax_nan_in_some_values(dtypes_for_minmax):
190214
# Test nan in some values
191215
# Explicit cast to float to avoid implicit cast when setting nan
216+
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
192217
base_df = base_df.astype({"B": "float"})
193218
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
194219
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
@@ -199,13 +224,17 @@ def test_cummax(dtypes_for_minmax):
199224
)
200225
tm.assert_frame_equal(result, expected)
201226

227+
228+
def test_cummax_datetime():
202229
# GH 15561
203230
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
204231
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
205232

206233
result = df.groupby("a")["b"].cummax()
207234
tm.assert_series_equal(expected, result)
208235

236+
237+
def test_cummax_getattr_series():
209238
# GH 15635
210239
df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
211240
result = df.groupby("a").b.cummax()
@@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
292321
tm.assert_frame_equal(result, expected)
293322

294323

295-
def test_cython_api2():
324+
def test_cython_api2(as_index):
296325
# this takes the fast apply path
297326

298327
# cumsum (GH5614)
328+
# GH 5755 - cumsum is a transformer and should ignore as_index
299329
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
300330
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
301-
result = df.groupby("A").cumsum()
302-
tm.assert_frame_equal(result, expected)
303-
304-
# GH 5755 - cumsum is a transformer and should ignore as_index
305-
result = df.groupby("A", as_index=False).cumsum()
331+
result = df.groupby("A", as_index=as_index).cumsum()
306332
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_filters.py

+28 -43
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,9 @@ def test_filter_out_no_groups():
8585
grouped = s.groupby(grouper)
8686
filtered = grouped.filter(lambda x: x.mean() > 0)
8787
tm.assert_series_equal(filtered, s)
88+
89+
90+
def test_filter_out_no_groups_dataframe():
8891
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
8992
grouper = df["A"].apply(lambda x: x % 2)
9093
grouped = df.groupby(grouper)
@@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df():
100103
expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
101104
tm.assert_frame_equal(expected, res)
102105

106+
107+
def test_filter_out_all_groups_in_df_dropna_true():
108+
# GH12768
103109
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
104110
res = df.groupby("a")
105111
res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
@@ -179,7 +185,7 @@ def test_filter_pdna_is_false():
179185

180186
def test_filter_against_workaround_ints():
181187
# Series of ints
182-
s = Series(np.random.default_rng(2).integers(0, 100, 100))
188+
s = Series(np.random.default_rng(2).integers(0, 100, 10))
183189
grouper = s.apply(lambda x: np.round(x, -1))
184190
grouped = s.groupby(grouper)
185191
f = lambda x: x.mean() > 10
@@ -191,7 +197,7 @@ def test_filter_against_workaround_ints():
191197

192198
def test_filter_against_workaround_floats():
193199
# Series of floats
194-
s = 100 * Series(np.random.default_rng(2).random(100))
200+
s = 100 * Series(np.random.default_rng(2).random(10))
195201
grouper = s.apply(lambda x: np.round(x, -1))
196202
grouped = s.groupby(grouper)
197203
f = lambda x: x.mean() > 10
@@ -203,40 +209,40 @@ def test_filter_against_workaround_floats():
203209
def test_filter_against_workaround_dataframe():
204210
# Set up DataFrame of ints, floats, strings.
205211
letters = np.array(list(ascii_lowercase))
206-
N = 100
212+
N = 10
207213
random_letters = letters.take(
208214
np.random.default_rng(2).integers(0, 26, N, dtype=int)
209215
)
210216
df = DataFrame(
211217
{
212-
"ints": Series(np.random.default_rng(2).integers(0, 100, N)),
218+
"ints": Series(np.random.default_rng(2).integers(0, 10, N)),
213219
"floats": N / 10 * Series(np.random.default_rng(2).random(N)),
214220
"letters": Series(random_letters),
215221
}
216222
)
217223

218224
# Group by ints; filter on floats.
219225
grouped = df.groupby("ints")
220-
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
221-
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
226+
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
227+
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
222228
tm.assert_frame_equal(new_way, old_way)
223229

224230
# Group by floats (rounded); filter on strings.
225231
grouper = df.floats.apply(lambda x: np.round(x, -1))
226232
grouped = df.groupby(grouper)
227-
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
228-
new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
233+
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
234+
new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
229235
tm.assert_frame_equal(new_way, old_way)
230236

231237
# Group by strings; filter on ints.
232238
grouped = df.groupby("letters")
233-
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
234-
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
239+
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
240+
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
235241
tm.assert_frame_equal(new_way, old_way)
236242

237243

238244
def test_filter_using_len():
239-
# BUG GH4447
245+
# GH 4447
240246
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
241247
grouped = df.groupby("B")
242248
actual = grouped.filter(lambda x: len(x) > 2)
@@ -250,8 +256,10 @@ def test_filter_using_len():
250256
expected = df.loc[[]]
251257
tm.assert_frame_equal(actual, expected)
252258

253-
# Series have always worked properly, but we'll test anyway.
254-
s = df["B"]
259+
260+
def test_filter_using_len_series():
261+
# GH 4447
262+
s = Series(list("aabbbbcc"), name="B")
255263
grouped = s.groupby(s)
256264
actual = grouped.filter(lambda x: len(x) > 2)
257265
expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
@@ -262,10 +270,14 @@ def test_filter_using_len():
262270
tm.assert_series_equal(actual, expected)
263271

264272

265-
def test_filter_maintains_ordering():
266-
# Simple case: index is sequential. #4621
273+
@pytest.mark.parametrize(
274+
"index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
275+
)
276+
def test_filter_maintains_ordering(index):
277+
# GH 4621
267278
df = DataFrame(
268-
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
279+
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
280+
index=index,
269281
)
270282
s = df["pid"]
271283
grouped = df.groupby("tag")
@@ -278,33 +290,6 @@ def test_filter_maintains_ordering():
278290
expected = s.iloc[[1, 2, 4, 7]]
279291
tm.assert_series_equal(actual, expected)
280292

281-
# Now index is sequentially decreasing.
282-
df.index = np.arange(len(df) - 1, -1, -1)
283-
s = df["pid"]
284-
grouped = df.groupby("tag")
285-
actual = grouped.filter(lambda x: len(x) > 1)
286-
expected = df.iloc[[1, 2, 4, 7]]
287-
tm.assert_frame_equal(actual, expected)
288-
289-
grouped = s.groupby(df["tag"])
290-
actual = grouped.filter(lambda x: len(x) > 1)
291-
expected = s.iloc[[1, 2, 4, 7]]
292-
tm.assert_series_equal(actual, expected)
293-
294-
# Index is shuffled.
295-
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
296-
df.index = df.index[SHUFFLED]
297-
s = df["pid"]
298-
grouped = df.groupby("tag")
299-
actual = grouped.filter(lambda x: len(x) > 1)
300-
expected = df.iloc[[1, 2, 4, 7]]
301-
tm.assert_frame_equal(actual, expected)
302-
303-
grouped = s.groupby(df["tag"])
304-
actual = grouped.filter(lambda x: len(x) > 1)
305-
expected = s.iloc[[1, 2, 4, 7]]
306-
tm.assert_series_equal(actual, expected)
307-
308293

309294
def test_filter_multiple_timestamp():
310295
# GH 10114

0 commit comments

Comments
 (0)