From 9d5de5bdfff5edb5f3122278ecca18160ad7de7a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 8 May 2020 11:41:05 -0500 Subject: [PATCH 1/3] TST/CLN: Move groupby tests --- pandas/tests/groupby/test_counting.py | 135 +++++- pandas/tests/groupby/test_function.py | 659 -------------------------- pandas/tests/groupby/test_nunique.py | 164 +++++++ pandas/tests/groupby/test_quantile.py | 336 +++++++++++++ pandas/tests/groupby/test_size.py | 38 ++ 5 files changed, 672 insertions(+), 660 deletions(-) create mode 100644 pandas/tests/groupby/test_nunique.py create mode 100644 pandas/tests/groupby/test_quantile.py create mode 100644 pandas/tests/groupby/test_size.py diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 56a18757da6e7..997d9b006c802 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -1,9 +1,20 @@ from itertools import product +from string import ascii_lowercase import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Period, + Series, + Timedelta, + Timestamp, + date_range, +) import pandas._testing as tm @@ -229,3 +240,125 @@ def test_count_groupby_column_with_nan_in_groupby_column(self): index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} ) tm.assert_frame_equal(expected, res) + + +def test_groupby_timedelta_cython_count(): + df = DataFrame( + {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")} + ) + expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt") + result = df.groupby("g").delt.count() + tm.assert_series_equal(expected, result) + + +def test_count(): + n = 1 << 15 + dr = date_range("2015-08-30", periods=n // 10, freq="T") + + df = DataFrame( + { + "1st": np.random.choice(list(ascii_lowercase), n), + "2nd": np.random.randint(0, 5, n), + "3rd": np.random.randn(n).round(3), + "4th": np.random.randint(-10, 10, n), + "5th": np.random.choice(dr, n), + "6th": np.random.randn(n).round(3), + "7th": np.random.randn(n).round(3), + "8th": np.random.choice(dr, n) - np.random.choice(dr, 1), + "9th": np.random.choice(list(ascii_lowercase), n), + } + ) + + for col in df.columns.drop(["1st", "2nd", "4th"]): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df["9th"] = df["9th"].astype("category") + + for key in ["1st", "2nd", ["1st", "2nd"]]: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + tm.assert_frame_equal(left, right) + + +def test_count_non_nulls(): + # GH#5610 + # count counts non-nulls + df = pd.DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], + columns=["A", "B", "C"], + ) + + count_as = df.groupby("A").count() + count_not_as = df.groupby("A", as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) + expected.index.name = "A" + tm.assert_frame_equal(count_not_as, expected.reset_index()) + tm.assert_frame_equal(count_as, expected) + + count_B = df.groupby("A")["B"].count() + tm.assert_series_equal(count_B, expected["B"]) + + +def test_count_object(): + df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") + tm.assert_series_equal(result, expected) + + +def test_count_cross_type(): + # GH8169 + vals = np.hstack( + (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) + ) + + df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) + df[df == 2] = np.nan + expected = df.groupby(["c", "d"]).count() + + for t in ["float32", "object"]: + df["a"] = df["a"].astype(t) + df["b"] = df["b"].astype(t) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) + + +def test_lower_int_prec_count(): + df = DataFrame( + { + "a": np.array([0, 1, 2, 100], np.int8), + "b": np.array([1, 2, 3, 6], np.uint32), + "c": np.array([4, 5, 6, 8], np.int16), + "grp": list("ab" * 2), + } + ) + result = df.groupby("grp").count() + expected = DataFrame( + {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp") + ) + tm.assert_frame_equal(result, expected) + + +def test_count_uses_size_on_exception(): + class RaisingObjectException(Exception): + pass + + class RaisingObject: + def __init__(self, msg="I will raise inside Cython"): + super().__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) + result = df.groupby("grp").count() + expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 93dd1bf23c308..11673532681a4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,7 +1,5 @@ import builtins -import datetime as dt from io import StringIO -from string import ascii_lowercase import numpy as np import pytest @@ -13,7 +11,6 @@ DataFrame, Index, MultiIndex, - NaT, Series, Timestamp, date_range, @@ -983,659 +980,3 @@ def test_frame_describe_unstacked_format(): columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) - - -# nunique -# -------------------------------- - - -@pytest.mark.parametrize("n", 10 ** np.arange(2, 6)) -@pytest.mark.parametrize("m", [10, 100, 1000]) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("dropna", [False, True]) -def test_series_groupby_nunique(n, m, sort, dropna): - def check_nunique(df, keys, as_index=True): - original_df = df.copy() - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr["julie"].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr["julie"].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - tm.assert_series_equal(left, right, check_names=False) - tm.assert_frame_equal(df, original_df) - - days = date_range("2015-08-23", periods=10) - - frame = DataFrame( - { - "jim": np.random.choice(list(ascii_lowercase), n), - "joe": np.random.choice(days, n), - "julie": np.random.randint(0, m, n), - } - ) - - check_nunique(frame, ["jim"]) - check_nunique(frame, ["jim", "joe"]) - - frame.loc[1::17, "jim"] = None - frame.loc[3::37, "joe"] = None - frame.loc[7::19, "julie"] = None - frame.loc[8::19, "julie"] = None - frame.loc[9::19, "julie"] = None - - check_nunique(frame, ["jim"]) - check_nunique(frame, ["jim", "joe"]) - check_nunique(frame, ["jim"], as_index=False) - check_nunique(frame, ["jim", "joe"], as_index=False) - - -def test_nunique(): - df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) - - expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) - result = df.groupby("A", as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list("abc") - expected.index.name = "A" - result = df.groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({"x": None}).groupby("A").nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) - expected.index.name = "A" - result = df.replace({"x": None}).groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - -def test_nunique_with_object(): - # GH 11077 - data = pd.DataFrame( - [ - [100, 1, "Alice"], - [200, 2, "Bob"], - [300, 3, "Charlie"], - [-400, 4, "Dan"], - [500, 5, "Edith"], - ], - columns=["amount", "id", "name"], - ) - - result = data.groupby(["id", "amount"])["name"].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name="name", index=index) - tm.assert_series_equal(result, expected) - - -def test_nunique_with_empty_series(): - # GH 12553 - data = pd.Series(name="name", dtype=object) - result = data.groupby(level=0).nunique() - expected = pd.Series(name="name", dtype="int64") - tm.assert_series_equal(result, expected) - - -def test_nunique_with_timegrouper(): - # GH 13453 - test = pd.DataFrame( - { - "time": [ - Timestamp("2016-06-28 09:35:35"), - Timestamp("2016-06-28 16:09:30"), - Timestamp("2016-06-28 16:46:28"), - ], - "data": ["1", "2", "3"], - } - ).set_index("time") - result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() - expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "key, data, dropna, expected", - [ - ( - ["x", "x", "x"], - [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "y", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "x", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ], -) -def test_nunique_with_NaT(key, data, dropna, expected): - # GH 27951 - df = pd.DataFrame({"key": key, "data": data}) - result = df.groupby(["key"])["data"].nunique(dropna=dropna) - tm.assert_series_equal(result, expected) - - -def test_nunique_preserves_column_level_names(): - # GH 23222 - test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) - result = test.groupby([0, 0, 0]).nunique() - expected = pd.DataFrame([2], columns=test.columns) - tm.assert_frame_equal(result, expected) - - -# count -# -------------------------------- - - -def test_groupby_timedelta_cython_count(): - df = DataFrame( - {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")} - ) - expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt") - result = df.groupby("g").delt.count() - tm.assert_series_equal(expected, result) - - -def test_count(): - n = 1 << 15 - dr = date_range("2015-08-30", periods=n // 10, freq="T") - - df = DataFrame( - { - "1st": np.random.choice(list(ascii_lowercase), n), - "2nd": np.random.randint(0, 5, n), - "3rd": np.random.randn(n).round(3), - "4th": np.random.randint(-10, 10, n), - "5th": np.random.choice(dr, n), - "6th": np.random.randn(n).round(3), - "7th": np.random.randn(n).round(3), - "8th": np.random.choice(dr, n) - np.random.choice(dr, 1), - "9th": np.random.choice(list(ascii_lowercase), n), - } - ) - - for col in df.columns.drop(["1st", "2nd", "4th"]): - df.loc[np.random.choice(n, n // 10), col] = np.nan - - df["9th"] = df["9th"].astype("category") - - for key in ["1st", "2nd", ["1st", "2nd"]]: - left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) - tm.assert_frame_equal(left, right) - - -def test_count_non_nulls(): - # GH#5610 - # count counts non-nulls - df = pd.DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], - columns=["A", "B", "C"], - ) - - count_as = df.groupby("A").count() - count_not_as = df.groupby("A", as_index=False).count() - - expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) - expected.index.name = "A" - tm.assert_frame_equal(count_not_as, expected.reset_index()) - tm.assert_frame_equal(count_as, expected) - - count_B = df.groupby("A")["B"].count() - tm.assert_series_equal(count_B, expected["B"]) - - -def test_count_object(): - df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) - result = df.groupby("c").a.count() - expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) - result = df.groupby("c").a.count() - expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") - tm.assert_series_equal(result, expected) - - -def test_count_cross_type(): - # GH8169 - vals = np.hstack( - (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) - ) - - df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) - df[df == 2] = np.nan - expected = df.groupby(["c", "d"]).count() - - for t in ["float32", "object"]: - df["a"] = df["a"].astype(t) - df["b"] = df["b"].astype(t) - result = df.groupby(["c", "d"]).count() - tm.assert_frame_equal(result, expected) - - -def test_lower_int_prec_count(): - df = DataFrame( - { - "a": np.array([0, 1, 2, 100], np.int8), - "b": np.array([1, 2, 3, 6], np.uint32), - "c": np.array([4, 5, 6, 8], np.int16), - "grp": list("ab" * 2), - } - ) - result = df.groupby("grp").count() - expected = DataFrame( - {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp") - ) - tm.assert_frame_equal(result, expected) - - -def test_count_uses_size_on_exception(): - class RaisingObjectException(Exception): - pass - - class RaisingObject: - def __init__(self, msg="I will raise inside Cython"): - super().__init__() - self.msg = msg - - def __eq__(self, other): - # gets called in Cython to check that raising calls the method - raise RaisingObjectException(self.msg) - - df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) - result = df.groupby("grp").count() - expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp")) - tm.assert_frame_equal(result, expected) - - -# size -# -------------------------------- - - -@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) -def test_size(df, by): - grouped = df.groupby(by=by) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - -@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) -@pytest.mark.parametrize("sort", [True, False]) -def test_size_sort(df, sort, by): - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC")) - left = df.groupby(by=by, sort=sort).size() - right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) - tm.assert_series_equal(left, right, check_names=False) - - -def test_size_series_dataframe(): - # https://github.com/pandas-dev/pandas/issues/11699 - df = DataFrame(columns=["A", "B"]) - out = Series(dtype="int64", index=Index([], name="A")) - tm.assert_series_equal(df.groupby("A").size(), out) - - -def test_size_groupby_all_null(): - # https://github.com/pandas-dev/pandas/issues/23050 - # Assert no 'Value Error : Length of passed values is 2, index implies 0' - df = DataFrame({"A": [None, None]}) # all-null groups - result = df.groupby("A").size() - expected = Series(dtype="int64", index=Index([], name="A")) - tm.assert_series_equal(result, expected) - - -# quantile -# -------------------------------- - - -@pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] -) -@pytest.mark.parametrize( - "a_vals,b_vals", - [ - # Ints - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), - ([1, 2, 3, 4], [4, 3, 2, 1]), - ([1, 2, 3, 4, 5], [4, 3, 2, 1]), - # Floats - ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), - # Missing data - ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), - ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), - # Timestamps - ( - list(pd.date_range("1/1/18", freq="D", periods=5)), - list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], - ), - # All NA - ([np.nan] * 5, [np.nan] * 5), - ], -) -@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) -def test_quantile(interpolation, a_vals, b_vals, q): - if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: - pytest.skip( - "Unclear numpy expectation for nearest result with equidistant data" - ) - - a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) - b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) - - df = DataFrame( - {"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals} - ) - - expected = DataFrame( - [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") - ) - result = df.groupby("key").quantile(q, interpolation=interpolation) - - tm.assert_frame_equal(result, expected) - - -def test_quantile_array(): - # https://github.com/pandas-dev/pandas/issues/27526 - df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) - result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) - - index = pd.MultiIndex.from_product([[0, 1], [0.25]]) - expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) - index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) - - result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) - expected = pd.DataFrame( - {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array2(): - # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 - df = pd.DataFrame( - np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") - ) - result = df.groupby("A").quantile([0.3, 0.7]) - expected = pd.DataFrame( - { - "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], - "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], - }, - index=pd.MultiIndex.from_product( - [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] - ), - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array_no_sort(): - df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) - result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) - expected = pd.DataFrame( - {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, - index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), - ) - tm.assert_frame_equal(result, expected) - - result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) - expected = pd.DataFrame( - {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, - index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array_multiple_levels(): - df = pd.DataFrame( - {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} - ) - result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) - index = pd.MultiIndex.from_tuples( - [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], - names=["c", "d", None], - ) - expected = pd.DataFrame( - {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) -@pytest.mark.parametrize("groupby", [[0], [0, 1]]) -@pytest.mark.parametrize("q", [[0.5, 0.6]]) -def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): - # GH30289 - nrow, ncol = frame_size - df = pd.DataFrame( - np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) - ) - - idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] - idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ - list(range(len(q))) * min(nrow, 4) - ] - expected_index = pd.MultiIndex( - levels=idx_levels, codes=idx_codes, names=groupby + [None] - ) - expected_values = [ - [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q - ] - expected_columns = [x for x in range(ncol) if x not in groupby] - expected = pd.DataFrame( - expected_values, index=expected_index, columns=expected_columns - ) - result = df.groupby(groupby).quantile(q) - - tm.assert_frame_equal(result, expected) - - -def test_quantile_raises(): - df = pd.DataFrame( - [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] - ) - - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): - df.groupby("key").quantile() - - -def test_quantile_out_of_bounds_q_raises(): - # https://github.com/pandas-dev/pandas/issues/27470 - df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) - g = df.groupby([0, 0, 0, 1, 1, 1]) - with pytest.raises(ValueError, match="Got '50.0' instead"): - g.quantile(50) - - with pytest.raises(ValueError, match="Got '-1.0' instead"): - g.quantile(-1) - - -def test_quantile_missing_group_values_no_segfaults(): - # GH 28662 - data = np.array([1.0, np.nan, 1.0]) - df = pd.DataFrame(dict(key=data, val=range(3))) - - # Random segfaults; would have been guaranteed in loop - grp = df.groupby("key") - for _ in range(100): - grp.quantile() - - -def test_quantile_missing_group_values_correct_results(): - # GH 28662 - data = np.array([1.0, np.nan, 3.0, np.nan]) - df = pd.DataFrame(dict(key=data, val=range(4))) - - result = df.groupby("key").quantile() - expected = pd.DataFrame( - [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - pd.array([1, 0, None] * 2, dtype="Int64"), - pd.array([True, False, None] * 2, dtype="boolean"), - ], -) -@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) -def test_groupby_quantile_nullable_array(values, q): - # https://github.com/pandas-dev/pandas/issues/33136 - df = pd.DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) - result = df.groupby("a")["b"].quantile(q) - - if isinstance(q, list): - idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) - true_quantiles = [0.0, 0.5, 1.0] - else: - idx = pd.Index(["x", "y"], name="a") - true_quantiles = [0.5] - - expected = pd.Series(true_quantiles * 2, index=idx, name="b") - tm.assert_series_equal(result, expected) - - -# pipe -# -------------------------------- - - -def test_pipe(): - # Test the pipe method of DataFrameGroupBy. - # Issue #17871 - - random_state = np.random.RandomState(1234567890) - - df = DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": random_state.randn(8), - "C": random_state.randn(8), - } - ) - - def f(dfgb): - return dfgb.B.max() - dfgb.C.min().min() - - def square(srs): - return srs ** 2 - - # Note that the transformations are - # GroupBy -> Series - # Series -> Series - # This then chains the GroupBy.pipe and the - # NDFrame.pipe methods - result = df.groupby("A").pipe(f).pipe(square) - - index = Index(["bar", "foo"], dtype="object", name="A") - expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) - - tm.assert_series_equal(expected, result) - - -def test_pipe_args(): - # Test passing args to the pipe method of DataFrameGroupBy. - # Issue #17871 - - df = pd.DataFrame( - { - "group": ["A", "A", "B", "B", "C"], - "x": [1.0, 2.0, 3.0, 2.0, 5.0], - "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], - } - ) - - def f(dfgb, arg1): - return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( - dfgb.grouper - ) - - def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 - - def h(df, arg3): - return df.x + df.y - arg3 - - result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) - - # Assert the results here - index = pd.Index(["A", "B", "C"], name="group") - expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) - - tm.assert_series_equal(expected, result) - - # test SeriesGroupby.pipe - ser = pd.Series([1, 1, 2, 2, 3, 3]) - result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) - - expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) - - tm.assert_series_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = pd.DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - -@pytest.mark.parametrize( - "values", - [ - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], - }, - {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, - ], -) -@pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): - # https://github.com/pandas-dev/pandas/issues/32219 - output = 0.5 if function == "var" else 1.5 - arr = np.array([output] * 3, dtype=float) - idx = pd.Index([1, 2, 3], dtype=object, name="a") - expected = pd.DataFrame({"b": arr}, index=idx) - - groups = pd.DataFrame(values, dtype="Int64").groupby("a") - - result = getattr(groups, function)() - tm.assert_frame_equal(result, expected) - - result = groups.agg(function) - tm.assert_frame_equal(result, expected) - - result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py new file mode 100644 index 0000000000000..427a8accf7e7a --- /dev/null +++ b/pandas/tests/groupby/test_nunique.py @@ -0,0 +1,164 @@ +import datetime as dt +from string import ascii_lowercase + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, NaT, Series, Timestamp, date_range +import pandas._testing as tm + + +@pytest.mark.parametrize("n", 10 ** np.arange(2, 6)) +@pytest.mark.parametrize("m", [10, 100, 1000]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +def test_series_groupby_nunique(n, m, sort, dropna): + def check_nunique(df, keys, as_index=True): + original_df = df.copy() + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr["julie"].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr["julie"].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + tm.assert_series_equal(left, right, check_names=False) + tm.assert_frame_equal(df, original_df) + + days = date_range("2015-08-23", periods=10) + + frame = DataFrame( + { + "jim": np.random.choice(list(ascii_lowercase), n), + "joe": np.random.choice(days, n), + "julie": np.random.randint(0, m, n), + } + ) + + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + + frame.loc[1::17, "jim"] = None + frame.loc[3::37, "joe"] = None + frame.loc[7::19, "julie"] = None + frame.loc[8::19, "julie"] = None + frame.loc[9::19, "julie"] = None + + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + check_nunique(frame, ["jim"], as_index=False) + check_nunique(frame, ["jim", "joe"], as_index=False) + + +def test_nunique(): + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list("abc") + expected.index.name = "A" + result = df.groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = pd.DataFrame( + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], + ) + + result = data.groupby(["id", "amount"])["name"].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = pd.Series([1] * 5, name="name", index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = pd.Series(name="name", dtype=object) + result = data.groupby(level=0).nunique() + expected = pd.Series(name="name", dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = pd.DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = pd.DataFrame([2], columns=test.columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py new file mode 100644 index 0000000000000..69ffdba06cbca --- /dev/null +++ b/pandas/tests/groupby/test_quantile.py @@ -0,0 +1,336 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + list(pd.date_range("1/1/18", freq="D", periods=5)), + list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q): + if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip( + "Unclear numpy expectation for nearest result with equidistant data" + ) + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame( + {"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals} + ) + + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + result = df.groupby("key").quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) + expected = pd.DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) + expected = pd.DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = pd.DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = pd.DataFrame( + np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) + ) + + idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = pd.DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = pd.DataFrame( + [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] + ) + + with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + df.groupby("key").quantile() + + +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1.0, np.nan, 1.0]) + df = pd.DataFrame(dict(key=data, val=range(3))) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +def test_quantile_missing_group_values_correct_results(): + # GH 28662 + data = np.array([1.0, np.nan, 3.0, np.nan]) + df = pd.DataFrame(dict(key=data, val=range(4))) + + result = df.groupby("key").quantile() + expected = pd.DataFrame( + [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 0, None] * 2, dtype="Int64"), + pd.array([True, False, None] * 2, dtype="boolean"), + ], +) +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def test_groupby_quantile_nullable_array(values, q): + # https://github.com/pandas-dev/pandas/issues/33136 + df = pd.DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) + result = df.groupby("a")["b"].quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + true_quantiles = [0.0, 0.5, 1.0] + else: + idx = pd.Index(["x", "y"], name="a") + true_quantiles = [0.5] + + expected = pd.Series(true_quantiles * 2, index=idx, name="b") + tm.assert_series_equal(result, expected) + + +# pipe +# -------------------------------- + + +def test_pipe(): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": random_state.randn(8), + "C": random_state.randn(8), + } + ) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby("A").pipe(f).pipe(square) + + index = Index(["bar", "foo"], dtype="object", name="A") + expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) + + tm.assert_series_equal(expected, result) + + +def test_pipe_args(): + # Test passing args to the pipe method of DataFrameGroupBy. + # Issue #17871 + + df = pd.DataFrame( + { + "group": ["A", "A", "B", "B", "C"], + "x": [1.0, 2.0, 3.0, 2.0, 5.0], + "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], + } + ) + + def f(dfgb, arg1): + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( + dfgb.grouper + ) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) + + # Assert the results here + index = pd.Index(["A", "B", "C"], name="group") + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) + + tm.assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + tm.assert_series_equal(result, expected) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], dtype=object, name="a") + expected = pd.DataFrame({"b": arr}, index=idx) + + groups = pd.DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py new file mode 100644 index 0000000000000..346e6ae6cb9cb --- /dev/null +++ b/pandas/tests/groupby/test_size.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +def test_size(df, by): + grouped = df.groupby(by=by) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +@pytest.mark.parametrize("sort", [True, False]) +def test_size_sort(df, sort, by): + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC")) + left = df.groupby(by=by, sort=sort).size() + right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + +def test_size_series_dataframe(): + # https://github.com/pandas-dev/pandas/issues/11699 + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) + + +def test_size_groupby_all_null(): + # https://github.com/pandas-dev/pandas/issues/23050 + # Assert no 'Value Error : Length of passed values is 2, index implies 0' + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(result, expected) From 8deed5f5ff5e1d9b567f01078340eda0d2261361 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 8 May 2020 11:46:30 -0500 Subject: [PATCH 2/3] Fixup --- pandas/tests/groupby/test_function.py | 42 +++++++++ pandas/tests/groupby/test_pipe.py | 78 +++++++++++++++++ pandas/tests/groupby/test_quantile.py | 119 -------------------------- 3 files changed, 120 insertions(+), 119 deletions(-) create mode 100644 pandas/tests/groupby/test_pipe.py diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 11673532681a4..840976a455f79 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -980,3 +980,45 @@ def test_frame_describe_unstacked_format(): columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], dtype=object, name="a") + expected = pd.DataFrame({"b": arr}, index=idx) + + groups = pd.DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py new file mode 100644 index 0000000000000..d2ab016f608fa --- /dev/null +++ b/pandas/tests/groupby/test_pipe.py @@ -0,0 +1,78 @@ +import numpy as np + +import pandas as pd +from pandas import DataFrame, Index +import pandas._testing as tm + + +def test_pipe(): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": random_state.randn(8), + "C": random_state.randn(8), + } + ) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby("A").pipe(f).pipe(square) + + index = Index(["bar", "foo"], dtype="object", name="A") + expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) + + tm.assert_series_equal(expected, result) + + +def test_pipe_args(): + # Test passing args to the pipe method of DataFrameGroupBy. + # Issue #17871 + + df = pd.DataFrame( + { + "group": ["A", "A", "B", "B", "C"], + "x": [1.0, 2.0, 3.0, 2.0, 5.0], + "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], + } + ) + + def f(dfgb, arg1): + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( + dfgb.grouper + ) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) + + # Assert the results here + index = pd.Index(["A", "B", "C"], name="group") + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) + + tm.assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 69ffdba06cbca..6cab1f5d540e9 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -215,122 +215,3 @@ def test_groupby_quantile_nullable_array(values, q): expected = pd.Series(true_quantiles * 2, index=idx, name="b") tm.assert_series_equal(result, expected) - - -# pipe -# -------------------------------- - - -def test_pipe(): - # Test the pipe method of DataFrameGroupBy. - # Issue #17871 - - random_state = np.random.RandomState(1234567890) - - df = DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": random_state.randn(8), - "C": random_state.randn(8), - } - ) - - def f(dfgb): - return dfgb.B.max() - dfgb.C.min().min() - - def square(srs): - return srs ** 2 - - # Note that the transformations are - # GroupBy -> Series - # Series -> Series - # This then chains the GroupBy.pipe and the - # NDFrame.pipe methods - result = df.groupby("A").pipe(f).pipe(square) - - index = Index(["bar", "foo"], dtype="object", name="A") - expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) - - tm.assert_series_equal(expected, result) - - -def test_pipe_args(): - # Test passing args to the pipe method of DataFrameGroupBy. - # Issue #17871 - - df = pd.DataFrame( - { - "group": ["A", "A", "B", "B", "C"], - "x": [1.0, 2.0, 3.0, 2.0, 5.0], - "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], - } - ) - - def f(dfgb, arg1): - return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( - dfgb.grouper - ) - - def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 - - def h(df, arg3): - return df.x + df.y - arg3 - - result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) - - # Assert the results here - index = pd.Index(["A", "B", "C"], name="group") - expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) - - tm.assert_series_equal(expected, result) - - # test SeriesGroupby.pipe - ser = pd.Series([1, 1, 2, 2, 3, 3]) - result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) - - expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) - - tm.assert_series_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = pd.DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - -@pytest.mark.parametrize( - "values", - [ - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], - }, - {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, - ], -) -@pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): - # https://github.com/pandas-dev/pandas/issues/32219 - output = 0.5 if function == "var" else 1.5 - arr = np.array([output] * 3, dtype=float) - idx = pd.Index([1, 2, 3], dtype=object, name="a") - expected = pd.DataFrame({"b": arr}, index=idx) - - groups = pd.DataFrame(values, dtype="Int64").groupby("a") - - result = getattr(groups, function)() - tm.assert_frame_equal(result, expected) - - result = groups.agg(function) - tm.assert_frame_equal(result, expected) - - result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) - tm.assert_frame_equal(result, expected) From a920597752cd7f7be5f24f3b8bdb9ec96fa6ecf3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 8 May 2020 12:15:31 -0500 Subject: [PATCH 3/3] Lint --- pandas/tests/groupby/test_function.py | 10 +--------- pandas/tests/groupby/test_quantile.py | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 840976a455f79..e3862b92faf9f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -7,15 +7,7 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - Timestamp, - date_range, - isna, -) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna import pandas._testing as tm import pandas.core.nanops as nanops from pandas.util import _test_decorators as td diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 6cab1f5d540e9..87347fe1293ef 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex +from pandas import DataFrame, Index import pandas._testing as tm