Skip to content

BUG: GroupBy.apply() returns different results if a different GroupBy method is called first #35314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 42 commits into from
Aug 7, 2020
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e84a15d
GroupBy.apply() calls self._reset_group_selection at the start. Erran…
smithto1 Jul 14, 2020
2f764db
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Jul 15, 2020
e122809
gb.apply() now resets group selection so it always returns grouping c…
smithto1 Jul 15, 2020
27b2694
test uses .drop() instead of selection
smithto1 Jul 15, 2020
0cca6df
wrote new tests
smithto1 Jul 15, 2020
2786eb5
rewrote test
smithto1 Jul 16, 2020
33cdf65
whatsnew
smithto1 Jul 16, 2020
6a65e2f
restore if-stat in test_transform
smithto1 Jul 16, 2020
9948d2f
amended test
smithto1 Jul 16, 2020
e4a132e
restored test
smithto1 Jul 16, 2020
4170ca6
restored test
smithto1 Jul 16, 2020
0ab1c8a
amended test
smithto1 Jul 16, 2020
f2a32f4
cleanup
smithto1 Jul 16, 2020
f5b674b
trailing comma
smithto1 Jul 16, 2020
7028756
fixed test_to_latex
smithto1 Jul 17, 2020
45abe63
added test to ensure .describe() keeps non-nuisance groupin columns
smithto1 Jul 17, 2020
063f0ea
minimized changes to exsiting tests
smithto1 Jul 17, 2020
7f0d192
add .describe() to whatsnew
smithto1 Jul 17, 2020
974da63
parametrize test over as_index=T/F
smithto1 Jul 17, 2020
b395e39
restored .describe to old behaviour
smithto1 Jul 18, 2020
2405504
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Jul 18, 2020
682c93a
Merge remote-tracking branch 'upstream/master' into issue34656-temp
smithto1 Jul 18, 2020
67e9744
restoring test_function.py to master
smithto1 Jul 18, 2020
8f1b9c9
added comment
smithto1 Jul 18, 2020
804b0e0
Merge branch 'issue34656-temp' into issue34656
smithto1 Jul 18, 2020
f422b7d
fixed describe to work with duplicate cols
smithto1 Jul 26, 2020
b07a290
merge fixed
smithto1 Jul 26, 2020
6bec040
update comment
smithto1 Jul 26, 2020
8cdd4cd
context manager in agg_general
smithto1 Jul 27, 2020
abe8be3
remove hashed out line
smithto1 Jul 27, 2020
7112cf8
limited context manager in _make_wrapper
smithto1 Jul 27, 2020
0c8b144
removed unrelated test
smithto1 Jul 27, 2020
755c8f0
update comment
smithto1 Jul 27, 2020
8951a73
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Aug 3, 2020
b61695b
whatsnew on v1.1.1
smithto1 Aug 3, 2020
673a35b
comment typo
smithto1 Aug 3, 2020
42f53dd
amend comment to restart tests
smithto1 Aug 3, 2020
18634a6
whatsnew to 1.2.0
smithto1 Aug 5, 2020
95553a1
resolve merge
smithto1 Aug 5, 2020
1a0aa44
remove line that can't be tiggered by test
smithto1 Aug 5, 2020
b09e41e
restart tests
smithto1 Aug 5, 2020
a7e264f
Merge remote-tracking branch 'upstream/master' into issue34656
smithto1 Aug 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1084,6 +1084,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
- Bug in :meth:`DataFrameGroupBy.apply` :meth:`DataFrameGroupBy.describe` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)

Reshaping
^^^^^^^^^
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,8 @@ def __iter__(self):
)
def apply(self, func, *args, **kwargs):

self._reset_group_selection()

func = self._is_builtin_func(func)

# this is needed so we don't try and wrap strings. If we could
Expand Down Expand Up @@ -1623,11 +1625,10 @@ def ohlc(self) -> DataFrame:

@doc(DataFrame.describe)
def describe(self, **kwargs):
with _group_selection_context(self):
result = self.apply(lambda x: x.describe(**kwargs))
if self.axis == 1:
return result.T
return result.unstack()
result = self.apply(lambda x: x.describe(**kwargs))
if self.axis == 1:
return result.T
return result.unstack()

def resample(self, rule, *args, **kwargs):
"""
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


def test_sum_uint64_overflow():
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,3 +1010,32 @@ def test_apply_with_timezones_aware():
result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())

tm.assert_frame_equal(result1, result2)


def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
# GH 34656
# GH 34271
df = DataFrame(
{
"a": [99, 99, 99, 88, 88, 88],
"b": [1, 2, 3, 4, 5, 6],
"c": [10, 20, 30, 40, 50, 60],
}
)

expected = pd.DataFrame(
{"a": [264, 297], "b": [15, 6], "c": [150, 60]},
index=pd.Index([88, 99], name="a"),
)

# Check output wehn no other methods are called before .apply()
grp = df.groupby(by="a")
result = grp.apply(sum)
tm.assert_frame_equal(result, expected)

# Check output when another method is called before .apply()
grp = df.groupby(by="a")
args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
_ = getattr(grp, reduction_func)(*args)
result = grp.apply(sum)
tm.assert_frame_equal(result, expected)
87 changes: 82 additions & 5 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,14 +286,47 @@ def test_non_cython_api():

# describe
expected_index = pd.Index([1, 3], name="A")
expected_col = pd.MultiIndex(
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
codes=[[0] * 8, list(range(8))],
expected_col = pd.MultiIndex.from_product(
[["A", "B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
)
expected = pd.DataFrame(
[
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[
2.0,
1.0,
0.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
2.0,
np.nan,
2.0,
2.0,
2.0,
2.0,
2.0,
],
[
1.0,
3.0,
np.nan,
3.0,
3.0,
3.0,
3.0,
3.0,
0.0,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
],
],
index=expected_index,
columns=expected_col,
Expand Down Expand Up @@ -974,6 +1007,50 @@ def test_frame_describe_unstacked_format():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("by_col_dtype", [int, float, str])
def test_describe_results_includes_non_nuisance_columns(by_col_dtype):
# GH 34656
# GH 34271
df = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 3, 3], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
df = df.astype({"a": by_col_dtype})

expected = (
DataFrame.from_records(
[
("a", "count", 3.0, 3.0, 3.0),
("a", "mean", 1.0, 2.0, 3.0),
("a", "std", 0.0, 0.0, 0.0),
("a", "min", 1.0, 2.0, 3.0),
("a", "25%", 1.0, 2.0, 3.0),
("a", "50%", 1.0, 2.0, 3.0),
("a", "75%", 1.0, 2.0, 3.0),
("a", "max", 1.0, 2.0, 3.0),
("b", "count", 3.0, 3.0, 3.0),
("b", "mean", 2.0, 5.0, 8.0),
("b", "std", 1.0, 1.0, 1.0),
("b", "min", 1.0, 4.0, 7.0),
("b", "25%", 1.5, 4.5, 7.5),
("b", "50%", 2.0, 5.0, 8.0),
("b", "75%", 2.5, 5.5, 8.5),
("b", "max", 3.0, 6.0, 9.0),
],
columns=["col", "func", 1, 2, 3],
)
.set_index(["col", "func"])
.T
)
expected.columns.names = [None, None]
expected.index = pd.Index(expected.index.astype(by_col_dtype), name="a")

if by_col_dtype is str:
# If the grouping column is a nuisance column (i.e. can't apply the
# std() or quantile() to it) then it does not appear in the output
expected = expected.drop(columns="a")

result = df.groupby("a").describe()
tm.assert_frame_equal(result, expected)


def test_groupby_mean_no_overflow():
# Regression test for (#22487)
df = pd.DataFrame(
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,13 +191,15 @@ def test_grouper_creation_bug(self):
result = g.sum()
tm.assert_frame_equal(result, expected)

result = g.apply(lambda x: x.sum())
tm.assert_frame_equal(result, expected)

g = df.groupby(pd.Grouper(key="A", axis=0))
result = g.sum()
tm.assert_frame_equal(result, expected)

result = g.apply(lambda x: x.sum())
expected["A"] = [0, 2, 4]
expected = expected.loc[:, ["A", "B"]]
tm.assert_frame_equal(result, expected)

# GH14334
# pd.Grouper(key=...) may be passed in a list
df = DataFrame(
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,9 @@ def test_cython_transform_frame(op, args, targop):
else:
expected = gb.apply(targop)

if op == "shift" and type(gb_target.get("by")) is str:
expected = expected.drop(columns=gb_target.get("by"))

expected = expected.sort_index(axis=1)
tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1))
tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_to_latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def test_to_latex_multiindex(self):

assert result == expected

result = df.groupby("a").describe().to_latex()
result = df.groupby("a").describe().drop(columns="a").to_latex()
expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
Expand Down