-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[BUG] Fixed behavior of DataFrameGroupBy.apply to respect _group_selection_context #29131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
78de38c
63677e7
1d99d9c
98bc673
fbf3202
947a5bd
8c3efb0
fa21e29
a0a9aa5
7070169
76815f1
6c49a16
8a4c1f8
ccf940d
b7d056d
cfacfc1
c384c09
91d1931
83be029
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -174,6 +174,43 @@ Backwards incompatible API changes | |
|
||
pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) | ||
|
||
.. _whatsnew_1000.api_breaking.GroupBy.apply: | ||
|
||
``GroupBy.apply`` behaves consistently with `as_index` | ||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
- The result of :meth:`GroupBy.apply` sometimes contained the grouper column(s), | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
in both the index, and in the `DataFrame`. From pandas 1.0, :meth:`GroupBy.apply` | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
will respect the `as_index` parameter, and only return the grouper column(s) in | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
the result if `as_index` is set to `False`. | ||
|
||
*pandas 0.25.x* | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
.. code-block:: ipython | ||
|
||
In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. show df here |
||
In [2]: df.groupby("a").apply(lambda x: x.sum()) | ||
Out[2]: | ||
a b | ||
a | ||
1 2 3 | ||
2 4 7 | ||
3 6 11 | ||
|
||
*pandas 1.0.0* | ||
|
||
.. code-block:: ipython | ||
|
||
In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) | ||
|
||
In [2]: df.groupby("a").apply(lambda x: x.sum()) | ||
Out[2]: | ||
b | ||
a | ||
1 3 | ||
2 7 | ||
3 11 | ||
|
||
.. _whatsnew_1000.api.other: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,9 +95,16 @@ def f(x): | |
return x.drop_duplicates("person_name").iloc[0] | ||
|
||
result = g.apply(f) | ||
expected = x.iloc[[0, 1]].copy() | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so if the tests change a lot like this, make a new test |
||
# GH 28549 | ||
# grouper key should not be present after apply | ||
# with as_index=True. | ||
# TODO split this into multiple tests | ||
dropped = x.drop("person_id", 1) | ||
|
||
expected = dropped.iloc[[0, 1]].copy() | ||
expected.index = Index([1, 2], name="person_id") | ||
expected["person_name"] = expected["person_name"].astype("object") | ||
expected["person_name"] = expected["person_name"] | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 9921 | ||
|
@@ -1218,7 +1225,10 @@ def test_get_nonexistent_category(): | |
# Accessing a Category that is not in the dataframe | ||
df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) | ||
with pytest.raises(KeyError, match="'vau'"): | ||
df.groupby("var").apply( | ||
# GH 28549 This needs to use as_index=False so that | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# var is still present when grouping or else another key error | ||
# will raise about var. | ||
df.groupby("var", as_index=False).apply( | ||
lambda rows: pd.DataFrame( | ||
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,10 +87,6 @@ def test_intercept_builtin_sum(): | |
tm.assert_series_equal(result2, expected) | ||
|
||
|
||
# @pytest.mark.parametrize("f", [max, min, sum]) | ||
# def test_builtins_apply(f): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the kind of thing that can be done in a separate PR |
||
|
||
|
||
@pytest.mark.parametrize("f", [max, min, sum]) | ||
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key | ||
def test_builtins_apply(keys, f): | ||
|
@@ -102,21 +98,33 @@ def test_builtins_apply(keys, f): | |
result = df.groupby(keys).apply(f) | ||
ngroups = len(df.drop_duplicates(subset=keys)) | ||
|
||
assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( | ||
result.shape, ngroups | ||
# GH 28549 | ||
# grouping keys should not be included in output | ||
if isinstance(keys, list): | ||
result_shape = len(df.columns) - len(keys) | ||
else: | ||
result_shape = len(df.columns) - 1 | ||
|
||
assert_msg = "invalid frame shape: {} (expected ({}, {}))".format( | ||
result.shape, ngroups, result_shape | ||
) | ||
assert result.shape == (ngroups, 3), assert_msg | ||
assert result.shape == (ngroups, result_shape), assert_msg | ||
|
||
tm.assert_frame_equal( | ||
result, # numpy's equivalent function | ||
df.groupby(keys).apply(getattr(np, fname)), | ||
) | ||
|
||
if f != sum: | ||
expected = df.groupby(keys).agg(fname).reset_index() | ||
expected.set_index(keys, inplace=True, drop=False) | ||
# GH 28549 | ||
# No longer need to reset/set index here | ||
expected = df.groupby(keys).agg(fname) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you try not to use groupby here and just construct the expected result. the comment is also not very useful as a future reader won't have context. can you break this out to a new test |
||
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
# GH 28549 | ||
# grouping keys should not be in output | ||
df = df.drop(keys, 1) | ||
|
||
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) | ||
|
||
|
||
|
@@ -341,10 +349,13 @@ def test_cython_api2(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 13994 | ||
result = df.groupby("A").cumsum(axis=1) | ||
# GH 28549 | ||
# Good representation of when as_index=False is now behaving | ||
# as expected | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. split to a separate tests and use as_index fixture There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was unsure what to do about this test. There is a comment in the original test that says "cumsum is a transformer and should ignore as_index", but with the updated behavior, it respects Previous Behavior In [21]: df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
In [22]: df.groupby("A").cumsum(axis=1)
Out[22]:
A B C
0 1.0 3.0 NaN
1 1.0 NaN 10.0
2 3.0 7.0 16.0 Behavior after change to In [4]: df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
In [5]: df.groupby("A").cumsum(axis=1)
Out[5]:
B C
0 2.0 NaN
1 NaN 9.0
2 4.0 13.0 |
||
result = df.groupby("A", as_index=False).cumsum(axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you test as_indexer=True as well. this seems like a big api change here |
||
expected = df.cumsum(axis=1) | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").cumprod(axis=1) | ||
result = df.groupby("A", as_index=False).cumprod(axis=1) | ||
expected = df.cumprod(axis=1) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
@@ -1107,7 +1118,10 @@ def test_count(): | |
|
||
for key in ["1st", "2nd", ["1st", "2nd"]]: | ||
left = df.groupby(key).count() | ||
right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) | ||
|
||
# GH 28549 | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# don't need to drop key here anymore | ||
right = df.groupby(key).apply(DataFrame.count) | ||
tm.assert_frame_equal(left, right) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need to move to 1.1