CLN: groupby test #58777


Merged 4 commits on May 20, 2024
18 changes: 10 additions & 8 deletions pandas/tests/groupby/test_counting.py
@@ -321,31 +321,33 @@ def test_count_object():
     expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)

+
+def test_count_object_nan():
     df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
     result = df.groupby("c").a.count()
     expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)


-def test_count_cross_type():
+@pytest.mark.parametrize("typ", ["object", "float32"])
+def test_count_cross_type(typ):
     # GH8169
     # Set float64 dtype to avoid upcast when setting nan below
     vals = np.hstack(
         (
-            np.random.default_rng(2).integers(0, 5, (100, 2)),
-            np.random.default_rng(2).integers(0, 2, (100, 2)),
+            np.random.default_rng(2).integers(0, 5, (10, 2)),
+            np.random.default_rng(2).integers(0, 2, (10, 2)),
         )
     ).astype("float64")

     df = DataFrame(vals, columns=["a", "b", "c", "d"])
     df[df == 2] = np.nan
     expected = df.groupby(["c", "d"]).count()

-    for t in ["float32", "object"]:
-        df["a"] = df["a"].astype(t)
-        df["b"] = df["b"].astype(t)
-        result = df.groupby(["c", "d"]).count()
-        tm.assert_frame_equal(result, expected)
+    df["a"] = df["a"].astype(typ)
+    df["b"] = df["b"].astype(typ)
+    result = df.groupby(["c", "d"]).count()
+    tm.assert_frame_equal(result, expected)


 def test_lower_int_prec_count():
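
Note on the hunk above: the refactor swaps an in-test dtype loop for pytest parametrization, so each dtype runs and is reported as its own test case. A self-contained sketch of the pattern, with an illustrative test name and data that are not part of this PR:

import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm

@pytest.mark.parametrize("typ", ["object", "float32"])
def test_count_is_dtype_agnostic(typ):
    # Baseline counts on float64, recomputed after casting two columns to
    # the parametrized dtype -- count() should give the same answer.
    vals = np.random.default_rng(2).integers(0, 5, (10, 4)).astype("float64")
    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()
    df["a"] = df["a"].astype(typ)
    df["b"] = df["b"].astype(typ)
    result = df.groupby(["c", "d"]).count()
    tm.assert_frame_equal(result, expected)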
50 changes: 38 additions & 12 deletions pandas/tests/groupby/test_cumulative.py
@@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns():

 def test_cummin(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    min_val = dtypes_for_minmax[1]

     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

     df = base_df.astype(dtype)

     expected = DataFrame({"B": expected_mins}).astype(dtype)
     result = df.groupby("A").cummin()
     tm.assert_frame_equal(result, expected)
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
     tm.assert_frame_equal(result, expected)

-    # Test w/ min value for dtype
+
+def test_cummin_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    min_val = dtypes_for_minmax[1]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+    expected = DataFrame({"B": expected_mins}).astype(dtype)
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = min_val
     df.loc[[1, 5], "B"] = min_val + 1
     expected.loc[[2, 3, 6, 7], "B"] = min_val
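
These cummin/cummax tests consume the dtypes_for_minmax fixture from pandas/tests/groupby/conftest.py, which yields a (dtype, min_val, max_val) triple for each parametrized dtype. For readers outside the pandas tree, a rough local stand-in (an approximation; the real fixture covers more dtypes):

import numpy as np
import pytest

@pytest.fixture(params=[np.int32, np.int64, np.float32, np.float64])
def dtypes_for_minmax(request):
    # Returns (dtype, smallest value, largest value) for the dtype, which
    # the tests use to probe cummin/cummax at the representable extremes.
    dtype = np.dtype(request.param)
    info = np.iinfo(dtype) if np.issubdtype(dtype, np.integer) else np.finfo(dtype)
    return dtype, info.min, info.max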
@@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected, check_exact=True)

-    # Test nan in some values
+
+def test_cummin_nan_in_some_values(dtypes_for_minmax):
+    # Explicit cast to float to avoid implicit cast when setting nan
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
@@ -132,13 +141,17 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)

+
+def test_cummin_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")

     result = df.groupby("a")["b"].cummin()
     tm.assert_series_equal(expected, result)

+
+def test_cummin_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
     result = df.groupby("a").b.cummin()
@@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype):

 def test_cummax(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    max_val = dtypes_for_minmax[2]

     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax):
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
     tm.assert_frame_equal(result, expected)

-    # Test w/ max value for dtype
+
+def test_cummax_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    max_val = dtypes_for_minmax[2]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = max_val
+    expected = DataFrame({"B": expected_maxs}).astype(dtype)
     expected.loc[[2, 3, 6, 7], "B"] = max_val
     result = df.groupby("A").cummax()
     tm.assert_frame_equal(result, expected)
@@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)

+
+def test_cummax_nan_in_some_values(dtypes_for_minmax):
     # Test nan in some values
+    # Explicit cast to float to avoid implicit cast when setting nan
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
@@ -199,13 +224,17 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)

+
+def test_cummax_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")

     result = df.groupby("a")["b"].cummax()
     tm.assert_series_equal(expected, result)

+
+def test_cummax_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
     result = df.groupby("a").b.cummax()
@@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
     tm.assert_frame_equal(result, expected)


-def test_cython_api2():
+def test_cython_api2(as_index):
     # this takes the fast apply path

     # cumsum (GH5614)
+    # GH 5755 - cumsum is a transformer and should ignore as_index
     df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
     expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
-    result = df.groupby("A").cumsum()
-    tm.assert_frame_equal(result, expected)
-
-    # GH 5755 - cumsum is a transformer and should ignore as_index
-    result = df.groupby("A", as_index=False).cumsum()
+    result = df.groupby("A", as_index=as_index).cumsum()
     tm.assert_frame_equal(result, expected)
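
In the last hunk, as_index is a pytest fixture (the groupby conftest parametrizes it over True and False), so the rewritten test covers both settings in separate cases instead of hard-coding as_index=False. A standalone sketch of the same check, with a local stand-in for that fixture:

import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm

@pytest.fixture(params=[True, False])
def as_index(request):
    # Local stand-in for the fixture the pandas test suite provides.
    return request.param

def test_cumsum_ignores_as_index(as_index):
    # cumsum is a transformation, so as_index must not change the result.
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A", as_index=as_index).cumsum()
    tm.assert_frame_equal(result, expected)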
71 changes: 28 additions & 43 deletions pandas/tests/groupby/test_filters.py
@@ -85,6 +85,9 @@ def test_filter_out_no_groups():
     grouped = s.groupby(grouper)
     filtered = grouped.filter(lambda x: x.mean() > 0)
     tm.assert_series_equal(filtered, s)
+
+
+def test_filter_out_no_groups_dataframe():
     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
     grouper = df["A"].apply(lambda x: x % 2)
     grouped = df.groupby(grouper)
@@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df():
     expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
     tm.assert_frame_equal(expected, res)
+
+
+def test_filter_out_all_groups_in_df_dropna_true():
     # GH12768
     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
     res = df.groupby("a")
     res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
@@ -179,7 +185,7 @@ def test_filter_pdna_is_false():

 def test_filter_against_workaround_ints():
     # Series of ints
-    s = Series(np.random.default_rng(2).integers(0, 100, 100))
+    s = Series(np.random.default_rng(2).integers(0, 100, 10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -191,7 +197,7 @@ def test_filter_against_workaround_ints():

 def test_filter_against_workaround_floats():
     # Series of floats
-    s = 100 * Series(np.random.default_rng(2).random(100))
+    s = 100 * Series(np.random.default_rng(2).random(10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -203,40 +209,40 @@ def test_filter_against_workaround_floats():
 def test_filter_against_workaround_dataframe():
     # Set up DataFrame of ints, floats, strings.
     letters = np.array(list(ascii_lowercase))
-    N = 100
+    N = 10
     random_letters = letters.take(
         np.random.default_rng(2).integers(0, 26, N, dtype=int)
     )
     df = DataFrame(
         {
-            "ints": Series(np.random.default_rng(2).integers(0, 100, N)),
+            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
             "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
             "letters": Series(random_letters),
         }
     )

     # Group by ints; filter on floats.
     grouped = df.groupby("ints")
-    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
+    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)

     # Group by floats (rounded); filter on strings.
     grouper = df.floats.apply(lambda x: np.round(x, -1))
     grouped = df.groupby(grouper)
-    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
-    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
+    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
     tm.assert_frame_equal(new_way, old_way)

     # Group by strings; filter on ints.
     grouped = df.groupby("letters")
-    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
+    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)


 def test_filter_using_len():
-    # BUG GH4447
+    # GH 4447
     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
     grouped = df.groupby("B")
     actual = grouped.filter(lambda x: len(x) > 2)
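
Two things change in the hunk above: N shrinks from 100 to 10, and the cutoffs move from N / 20 and N / 10 to N / 2 so the predicates stay sensible at the smaller size. The assertions themselves only require that both sides apply the same predicate, because the test checks that the transform-mask workaround and GroupBy.filter agree. A self-contained sketch of that equivalence, with illustrative data:

from pandas import DataFrame
import pandas._testing as tm

df = DataFrame({"key": [1, 1, 2, 2, 2], "val": [1.0, 2.0, 3.0, 4.0, 5.0]})
grouped = df.groupby("key")
# Old way: broadcast a per-group boolean with transform, then mask the rows.
old_way = df[grouped.val.transform(lambda x: x.mean() > 2).astype("bool")]
# New way: let GroupBy.filter drop whole groups directly.
new_way = grouped.filter(lambda x: x["val"].mean() > 2)
tm.assert_frame_equal(new_way, old_way)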
@@ -250,8 +256,10 @@ def test_filter_using_len():
     expected = df.loc[[]]
     tm.assert_frame_equal(actual, expected)

-    # Series have always worked properly, but we'll test anyway.
-    s = df["B"]
+
+def test_filter_using_len_series():
+    # GH 4447
+    s = Series(list("aabbbbcc"), name="B")
     grouped = s.groupby(s)
     actual = grouped.filter(lambda x: len(x) > 2)
     expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
@@ -262,10 +270,14 @@ def test_filter_using_len():
     tm.assert_series_equal(actual, expected)


-def test_filter_maintains_ordering():
-    # Simple case: index is sequential. #4621
+@pytest.mark.parametrize(
+    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
+)
+def test_filter_maintains_ordering(index):
+    # GH 4621
     df = DataFrame(
-        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
     )
     s = df["pid"]
     grouped = df.groupby("tag")
@@ -278,33 +290,6 @@ def test_filter_maintains_ordering(index):
     expected = s.iloc[[1, 2, 4, 7]]
     tm.assert_series_equal(actual, expected)

-    # Now index is sequentially decreasing.
-    df.index = np.arange(len(df) - 1, -1, -1)
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-
-    # Index is shuffled.
-    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
-    df.index = df.index[SHUFFLED]
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-

 def test_filter_multiple_timestamp():
     # GH 10114
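
The last hunk collapses three copies of the ordering check (sequential, decreasing, and shuffled index) into one parametrized test: GroupBy.filter should return the surviving rows in frame order whatever the index is. A condensed sketch mirroring the new test, trimmed to the DataFrame case:

import pytest
from pandas import DataFrame
import pandas._testing as tm

@pytest.mark.parametrize(
    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
)
def test_filter_keeps_row_order(index):
    # Only tags 45 and 62 form groups larger than one row, so positions
    # 1, 2, 4 and 7 survive regardless of the index layout.
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    actual = df.groupby("tag").filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)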