Skip to content

TST: Clean groupby tests #58797

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 commits merged on May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
55 changes: 34 additions & 21 deletions pandas/tests/groupby/test_apply.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -322,6 +322,8 @@ def test_groupby_as_index_apply():
tm.assert_index_equal(res_as_apply, exp_as_apply)
tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)


def test_groupby_as_index_apply_str():
ind = Index(list("abcde"))
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
Expand Down Expand Up @@ -379,8 +381,8 @@ def f(piece):
{"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
)

dr = bdate_range("1/1/2000", periods=100)
ts = Series(np.random.default_rng(2).standard_normal(100), index=dr)
dr = bdate_range("1/1/2000", periods=10)
ts = Series(np.random.default_rng(2).standard_normal(10), index=dr)

grouped = ts.groupby(lambda x: x.month, group_keys=False)
result = grouped.apply(f)
Expand Down Expand Up @@ -639,13 +641,13 @@ def reindex_helper(x):
def test_apply_corner_cases():
# #535, can't use sliding iterator

N = 1000
N = 10
labels = np.random.default_rng(2).integers(0, 100, size=N)
df = DataFrame(
{
"key": labels,
"value1": np.random.default_rng(2).standard_normal(N),
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
"value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5),
}
)

Expand Down Expand Up @@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime():
result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
tm.assert_series_equal(result["Str"], expected["Str"])


def test_apply_numeric_coercion_when_datetime_getitem():
# GH 15421
df = DataFrame(
{"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
Expand All @@ -695,6 +699,8 @@ def get_B(g):
expected.index = df.A
tm.assert_series_equal(result, expected)


def test_apply_numeric_coercion_when_datetime_with_nat():
# GH 14423
def predictions(tool):
out = Series(index=["p1", "p2", "useTime"], dtype=object)
Expand Down Expand Up @@ -843,10 +849,24 @@ def test_func(x):
tm.assert_frame_equal(result, expected)


def test_groupby_apply_none_first():
@pytest.mark.parametrize(
"in_data, out_idx, out_data",
[
[
{"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]},
[[1, 1], [0, 2]],
{"groups": [1, 1], "vars": [0, 2]},
],
[
{"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]},
[[2, 2], [1, 3]],
{"groups": [2, 2], "vars": [1, 3]},
],
],
)
def test_groupby_apply_none_first(in_data, out_idx, out_data):
# GH 12824. Tests if apply returns None first.
test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
test_df1 = DataFrame(in_data)

def test_func(x):
if x.shape[0] < 2:
Expand All @@ -856,14 +876,9 @@ def test_func(x):
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result1 = test_df1.groupby("groups").apply(test_func)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result2 = test_df2.groupby("groups").apply(test_func)
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
index1 = MultiIndex.from_arrays(out_idx, names=["groups", None])
expected1 = DataFrame(out_data, index=index1)
tm.assert_frame_equal(result1, expected1)
tm.assert_frame_equal(result2, expected2)


def test_groupby_apply_return_empty_chunk():
Expand All @@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk():
tm.assert_series_equal(result, expected)


def test_apply_with_mixed_types():
@pytest.mark.parametrize("meth", ["apply", "transform"])
def test_apply_with_mixed_types(meth):
# gh-20949
df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
g = df.groupby("A", group_keys=False)

result = g.transform(lambda x: x / x.sum())
result = getattr(g, meth)(lambda x: x / x.sum())
expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
tm.assert_frame_equal(result, expected)

result = g.apply(lambda x: x / x.sum())
tm.assert_frame_equal(result, expected)


def test_func_returns_object():
# GH 28652
Expand Down Expand Up @@ -1106,7 +1119,7 @@ def test_apply_function_with_indexing_return_column():

@pytest.mark.parametrize(
"udf",
[(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
[lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)],
)
@pytest.mark.parametrize("group_keys", [True, False])
def test_apply_result_type(group_keys, udf):
Expand Down Expand Up @@ -1214,7 +1227,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
expected = df.iloc[[0, 2, 3]]
expected = expected.reset_index()
expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
expected = expected.drop(columns="idx")
expected = expected.drop(columns=["idx"])

tm.assert_frame_equal(result, expected)
for val in result.index.levels[1]:
Expand Down
81 changes: 52 additions & 29 deletions pandas/tests/groupby/test_categorical.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -82,7 +82,7 @@ def get_stats(group):
assert result.index.names[0] == "C"


def test_basic(using_infer_string): # TODO: split this test
def test_basic():
cats = Categorical(
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
categories=["a", "b", "c", "d"],
Expand All @@ -95,17 +95,20 @@ def test_basic(using_infer_string): # TODO: split this test
result = data.groupby("b", observed=False).mean()
tm.assert_frame_equal(result, expected)


def test_basic_single_grouper():
cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

# single grouper
gb = df.groupby("A", observed=False)
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
result = gb.sum(numeric_only=True)
tm.assert_frame_equal(result, expected)


def test_basic_string(using_infer_string):
# GH 8623
x = DataFrame(
[[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
Expand Down Expand Up @@ -133,8 +136,9 @@ def f(x):
expected["person_name"] = expected["person_name"].astype(dtype)
tm.assert_frame_equal(result, expected)


def test_basic_monotonic():
# GH 9921
# Monotonic
df = DataFrame({"a": [5, 15, 25]})
c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

Expand Down Expand Up @@ -165,7 +169,8 @@ def f(x):
tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

# Non-monotonic

def test_basic_non_monotonic():
df = DataFrame({"a": [5, 15, 25, -5]})
c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

Expand All @@ -183,6 +188,8 @@ def f(x):
df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
)


def test_basic_cut_grouping():
# GH 9603
df = DataFrame({"a": [1, 0, 0, 0]})
c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
Expand All @@ -193,13 +200,14 @@ def f(x):
expected.index.name = "a"
tm.assert_series_equal(result, expected)

# more basic

def test_more_basic():
levels = ["foo", "bar", "baz", "qux"]
codes = np.random.default_rng(2).integers(0, 4, size=100)
codes = np.random.default_rng(2).integers(0, 4, size=10)

cats = Categorical.from_codes(codes, levels, ordered=True)

data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))

result = data.groupby(cats, observed=False).mean()

Expand All @@ -225,9 +233,9 @@ def f(x):
# GH 10460
expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
exp = CategoricalIndex(expc)
tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp)
exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp)


def test_level_get_group(observed):
Expand Down Expand Up @@ -352,6 +360,8 @@ def test_observed(observed):

tm.assert_frame_equal(result, expected)


def test_observed_single_column(observed):
# https://github.com/pandas-dev/pandas/issues/8138
d = {
"cat": Categorical(
Expand All @@ -362,7 +372,6 @@ def test_observed(observed):
}
df = DataFrame(d)

# Grouping on a single column
groups_single_key = df.groupby("cat", observed=observed)
result = groups_single_key.mean()

Expand All @@ -378,7 +387,17 @@ def test_observed(observed):

tm.assert_frame_equal(result, expected)

# Grouping on two columns

def test_observed_two_columns(observed):
# https://github.com/pandas-dev/pandas/issues/8138
d = {
"cat": Categorical(
["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
),
"ints": [1, 1, 2, 2],
"val": [10, 20, 30, 40],
}
df = DataFrame(d)
groups_double_key = df.groupby(["cat", "ints"], observed=observed)
result = groups_double_key.agg("mean")
expected = DataFrame(
Expand All @@ -404,6 +423,8 @@ def test_observed(observed):
expected = df[(df.cat == c) & (df.ints == i)]
tm.assert_frame_equal(result, expected)


def test_observed_with_as_index(observed):
# gh-8869
# with as_index
d = {
Expand Down Expand Up @@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed):


@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("observed", [True, False])
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
# GH 25871: Fix groupby sorting on ordered Categoricals
# GH 25167: Groupby with observed=True doesn't sort
Expand Down Expand Up @@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
def test_datetime():
# GH9049: ensure backward compatibility
levels = pd.date_range("2014-01-01", periods=4)
codes = np.random.default_rng(2).integers(0, 4, size=100)
codes = np.random.default_rng(2).integers(0, 4, size=10)

cats = Categorical.from_codes(codes, levels, ordered=True)

data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
result = data.groupby(cats, observed=False).mean()

expected = data.groupby(np.asarray(cats), observed=False).mean()
Expand Down Expand Up @@ -832,7 +852,10 @@ def test_preserve_categories():
df.groupby("A", sort=False, observed=False).first().index, nosort_index
)

# ordered=False

def test_preserve_categories_ordered_false():
# GH-13179
categories = list("abc")
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
# GH#48749 - don't change order of categories
Expand All @@ -846,7 +869,8 @@ def test_preserve_categories():
)


def test_preserve_categorical_dtype():
@pytest.mark.parametrize("col", ["C1", "C2"])
def test_preserve_categorical_dtype(col):
# GH13743, GH13854
df = DataFrame(
{
Expand All @@ -865,18 +889,15 @@ def test_preserve_categorical_dtype():
"C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
}
)
for col in ["C1", "C2"]:
result1 = df.groupby(by=col, as_index=False, observed=False).mean(
numeric_only=True
)
result2 = (
df.groupby(by=col, as_index=True, observed=False)
.mean(numeric_only=True)
.reset_index()
)
expected = exp_full.reindex(columns=result1.columns)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True)
result2 = (
df.groupby(by=col, as_index=True, observed=False)
.mean(numeric_only=True)
.reset_index()
)
expected = exp_full.reindex(columns=result1.columns)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -931,6 +952,8 @@ def test_categorical_no_compress():
)
tm.assert_series_equal(result, exp)


def test_categorical_no_compress_string():
cats = Categorical(
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
categories=["a", "b", "c", "d"],
Expand Down Expand Up @@ -965,7 +988,7 @@ def test_sort():
# has a sorted x axis
# self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)})
labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)

Expand Down