|
| 1 | +import numpy as np |
| 2 | +import pytest |
| 3 | + |
| 4 | +from pandas._libs.tslibs import iNaT |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +from pandas import ( |
| 8 | + DataFrame, |
| 9 | + Index, |
| 10 | + Series, |
| 11 | +) |
| 12 | +import pandas._testing as tm |
| 13 | + |
| 14 | + |
def test_max_min_non_numeric():
    # GH#2700: groupby max/min on a frame containing a string column must
    # keep that column in the result instead of silently dropping it,
    # both with the default arguments and with numeric_only=False.
    df = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
    gb = df.groupby("nn")

    for kwargs in ({}, {"numeric_only": False}):
        assert "ss" in gb.max(**kwargs)
        assert "ss" in gb.min(**kwargs)
| 30 | + |
| 31 | + |
def test_max_min_object_multiple_columns(using_array_manager):
    # GH#41111: when the aggregation is valid for some columns but not
    # others ("B" mixes ints, strings and a bool), object blocks are split
    # column-wise, consistent with DataFrame._reduce; the invalid column is
    # dropped with a deprecation warning.
    df = DataFrame(
        {
            "A": [1, 1, 2, 2, 3],
            "B": [1, "foo", 2, "bar", False],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    df._consolidate_inplace()  # should already be consolidated; double-check
    if not using_array_manager:
        assert len(df._mgr.blocks) == 2

    gb = df.groupby("A")
    expected_index = Index([1, 2, 3], name="A")

    # "max" is valid for column "C" but not for "B"
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.max(numeric_only=False)
    expected = DataFrame({"C": ["b", "d", "e"]}, index=expected_index)
    tm.assert_frame_equal(result, expected)

    # "min" is valid for column "C" but not for "B"
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.min(numeric_only=False)
    expected = DataFrame({"C": ["a", "c", "e"]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
| 63 | + |
| 64 | + |
def test_min_date_with_nans():
    # GH#26321: min over a column of datetime.date objects must work even
    # when a sibling column contains NaNs.
    dates = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
    ).dt.date
    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})

    expected = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
    ).dt.date

    result = df.groupby("b", as_index=False)["c"].min()["c"]
    tm.assert_series_equal(result, expected)

    expected.index.name = "b"
    result = df.groupby("b")["c"].min()
    tm.assert_series_equal(result, expected)
| 81 | + |
| 82 | + |
def test_max_inat():
    # GH#40767: iNaT stored in an int64 Series is an ordinary integer and
    # must not be interpreted as NaN by groupby max/min.
    ser = Series([1, iNaT])
    gb = ser.groupby([1, 1])

    # both entries count toward min_count, so max is the plain integer max
    expected = Series({1: 1}, dtype=np.int64)
    tm.assert_series_equal(gb.max(min_count=2), expected, check_exact=True)

    # ...and iNaT itself is a valid minimum
    expected = Series({1: iNaT}, dtype=np.int64)
    tm.assert_series_equal(gb.min(min_count=2), expected, check_exact=True)

    # not enough entries -> gets masked to NaN
    expected = Series({1: np.nan})
    tm.assert_series_equal(gb.min(min_count=3), expected, check_exact=True)
| 100 | + |
| 101 | + |
def test_max_inat_not_all_na():
    # GH#40767: iNaT must not be interpreted as NaN, and iNaT + 1 must not
    # be rounded down to iNaT.
    ser = Series([1, iNaT, 2, iNaT + 1])
    gb = ser.groupby([1, 2, 3, 3])
    result = gb.min(min_count=2)

    # groups 1 and 2 hold a single entry each, so min_count=2 masks them
    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
    tm.assert_series_equal(result, expected, check_exact=True)
| 113 | + |
| 114 | + |
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_column(func):
    # GH 31471: min/max on a Period column must preserve the Period dtype.
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a")["b"], func)()
    # Int64Index was deprecated in pandas 1.4 and removed in 2.0; a plain
    # Index with an explicit int64 dtype is the equivalent, forward-compatible
    # spelling.
    idx = Index([1, 2], dtype="int64", name="a")
    expected = Series(periods, index=idx, name="b")

    tm.assert_series_equal(result, expected)
| 127 | + |
| 128 | + |
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_frame(func):
    # GH 31471: frame-level min/max must also preserve the Period dtype.
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a"), func)()
    # Int64Index was deprecated in pandas 1.4 and removed in 2.0; a plain
    # Index with an explicit int64 dtype is the equivalent, forward-compatible
    # spelling.
    idx = Index([1, 2], dtype="int64", name="a")
    expected = DataFrame({"b": periods}, index=idx)

    tm.assert_frame_equal(result, expected)
| 141 | + |
| 142 | + |
| 143 | +def test_aggregate_numeric_object_dtype(): |
| 144 | + # https://github.com/pandas-dev/pandas/issues/39329 |
| 145 | + # simplified case: multiple object columns where one is all-NaN |
| 146 | + # -> gets split as the all-NaN is inferred as float |
| 147 | + df = DataFrame( |
| 148 | + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, |
| 149 | + ).astype(object) |
| 150 | + result = df.groupby("key").min() |
| 151 | + expected = DataFrame( |
| 152 | + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} |
| 153 | + ).set_index("key") |
| 154 | + tm.assert_frame_equal(result, expected) |
| 155 | + |
| 156 | + # same but with numbers |
| 157 | + df = DataFrame( |
| 158 | + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, |
| 159 | + ).astype(object) |
| 160 | + result = df.groupby("key").min() |
| 161 | + expected = DataFrame( |
| 162 | + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} |
| 163 | + ).set_index("key") |
| 164 | + tm.assert_frame_equal(result, expected) |
| 165 | + |
| 166 | + |
@pytest.mark.parametrize("func", ["min", "max"])
def test_aggregate_categorical_lost_index(func: str):
    # GH: 28641 groupby used to drop the index when aggregating min/max
    # over a categorical column
    cat = Series(["b"], dtype="category").cat.as_ordered()
    df = DataFrame({"A": [1997], "B": cat})

    result = df.groupby("A").agg({"B": func})

    expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))
    # the ordered categorical dtype should survive the aggregation
    expected["B"] = expected["B"].astype(cat.dtype)

    tm.assert_frame_equal(result, expected)
0 commit comments