diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2622895f9f8d2..47ad18c9ad2c8 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -321,19 +321,22 @@ def test_count_object(): expected = Series([3, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) + +def test_count_object_nan(): df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() expected = Series([1, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) -def test_count_cross_type(): +@pytest.mark.parametrize("typ", ["object", "float32"]) +def test_count_cross_type(typ): # GH8169 # Set float64 dtype to avoid upcast when setting nan below vals = np.hstack( ( - np.random.default_rng(2).integers(0, 5, (100, 2)), - np.random.default_rng(2).integers(0, 2, (100, 2)), + np.random.default_rng(2).integers(0, 5, (10, 2)), + np.random.default_rng(2).integers(0, 2, (10, 2)), ) ).astype("float64") @@ -341,11 +344,10 @@ def test_count_cross_type(): df[df == 2] = np.nan expected = df.groupby(["c", "d"]).count() - for t in ["float32", "object"]: - df["a"] = df["a"].astype(t) - df["b"] = df["b"].astype(t) - result = df.groupby(["c", "d"]).count() - tm.assert_frame_equal(result, expected) + df["a"] = df["a"].astype(typ) + df["b"] = df["b"].astype(typ) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 28dcb38d173f2..b0a0414c1feb2 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns(): def test_cummin(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 
2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] df = base_df.astype(dtype) - expected = DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ min value for dtype + +def test_cummin_min_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected = DataFrame({"B": expected_mins}).astype(dtype) + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = min_val df.loc[[1, 5], "B"] = min_val + 1 expected.loc[[2, 3, 6, 7], "B"] = min_val @@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected, check_exact=True) - # Test nan in some values + +def test_cummin_nan_in_some_values(dtypes_for_minmax): # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) @@ -132,6 +141,8 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummin_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -139,6 +150,8 @@ def test_cummin(dtypes_for_minmax): result = df.groupby("a")["b"].cummin() tm.assert_series_equal(expected, result) + +def test_cummin_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) result = df.groupby("a").b.cummin() @@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype): def 
test_cummax(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ max value for dtype + +def test_cummax_max_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = max_val + expected = DataFrame({"B": expected_maxs}).astype(dtype) expected.loc[[2, 3, 6, 7], "B"] = max_val result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) @@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_nan_in_some_values(dtypes_for_minmax): # Test nan in some values # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) @@ -199,6 +224,8 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -206,6 +233,8 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("a")["b"].cummax() tm.assert_series_equal(expected, result) + +def test_cummax_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) result = df.groupby("a").b.cummax() @@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method,
dtype, val): tm.assert_frame_equal(result, expected) -def test_cython_api2(): +def test_cython_api2(as_index): # this takes the fast apply path # cumsum (GH5614) + # GH 5755 - cumsum is a transformer and should ignore as_index df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) - result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() + result = df.groupby("A", as_index=as_index).cumsum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index a34170e9b55db..04883b3ef6b78 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -85,6 +85,9 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) + + +def test_filter_out_no_groups_dataframe(): df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) @@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df(): expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) + +def test_filter_out_all_groups_in_df_dropna_true(): + # GH12768 df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) @@ -179,7 +185,7 @@ def test_filter_pdna_is_false(): def test_filter_against_workaround_ints(): # Series of ints - s = Series(np.random.default_rng(2).integers(0, 100, 100)) + s = Series(np.random.default_rng(2).integers(0, 100, 10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -191,7 +197,7 @@ def test_filter_against_workaround_ints(): 
def test_filter_against_workaround_floats(): # Series of floats - s = 100 * Series(np.random.default_rng(2).random(100)) + s = 100 * Series(np.random.default_rng(2).random(10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -203,13 +209,13 @@ def test_filter_against_workaround_floats(): def test_filter_against_workaround_dataframe(): # Set up DataFrame of ints, floats, strings. letters = np.array(list(ascii_lowercase)) - N = 100 + N = 10 random_letters = letters.take( np.random.default_rng(2).integers(0, 26, N, dtype=int) ) df = DataFrame( { - "ints": Series(np.random.default_rng(2).integers(0, 100, N)), + "ints": Series(np.random.default_rng(2).integers(0, 10, N)), "floats": N / 10 * Series(np.random.default_rng(2).random(N)), "letters": Series(random_letters), } @@ -217,26 +223,26 @@ def test_filter_against_workaround_dataframe(): # Group by ints; filter on floats. grouped = df.groupby("ints") - old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")] + new_way = grouped.filter(lambda x: len(x.letters) < N / 2) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. 
grouped = df.groupby("letters") - old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): - # BUG GH4447 + # GH 4447 df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) @@ -250,8 +256,10 @@ def test_filter_using_len(): expected = df.loc[[]] tm.assert_frame_equal(actual, expected) - # Series have always worked properly, but we'll test anyway. - s = df["B"] + +def test_filter_using_len_series(): + # GH 4447 + s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") @@ -262,10 +270,14 @@ def test_filter_using_len(): tm.assert_series_equal(actual, expected) -def test_filter_maintains_ordering(): - # Simple case: index is sequential. #4621 +@pytest.mark.parametrize( + "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]] +) +def test_filter_maintains_ordering(index): + # GH 4621 df = DataFrame( - {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, ) s = df["pid"] grouped = df.groupby("tag") @@ -278,33 +290,6 @@ def test_filter_maintains_ordering(): expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) - # Now index is sequentially decreasing. 
- df.index = np.arange(len(df) - 1, -1, -1) - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - def test_filter_multiple_timestamp(): # GH 10114