From 78de38cbbab6fa2a055fdb38bb9663eec133f26e Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 21 Oct 2019 10:25:58 -0500 Subject: [PATCH 01/12] Modifed tests and fixed bug in groupby/apply --- pandas/core/groupby/groupby.py | 8 +-- pandas/tests/groupby/test_apply.py | 61 ++++++++++++++----- pandas/tests/groupby/test_categorical.py | 16 ++++- pandas/tests/groupby/test_function.py | 38 ++++++++---- pandas/tests/groupby/test_groupby.py | 53 ++++++++++++---- pandas/tests/groupby/test_transform.py | 16 +++-- .../tests/resample/test_resampler_grouper.py | 6 +- 7 files changed, 144 insertions(+), 54 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f622480cfe4b7..c184e8d05b77f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -721,7 +721,9 @@ def f(g): f = func # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None): + with option_context( + "mode.chained_assignment", None + ) as _, _group_selection_context(self) as _: try: result = self._python_apply_general(f) except TypeError: @@ -732,9 +734,7 @@ def f(g): # except if the udf is trying an operation that # fails on *some* columns, e.g. a numeric operation # on a string grouper column - - with _group_selection_context(self): - return self._python_apply_general(f) + return self._python_apply_general(f) return result diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1af4768b7381e..b5f664332cf82 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -363,11 +363,19 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -def test_apply_chunk_view(): +@pytest.mark.parametrize("as_index", [False, True]) +def test_apply_chunk_view(as_index): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) + result = df.groupby("key", group_keys=False, as_index=as_index).apply( + lambda x: x[:2] + ) + # GH 28549 + # key no longer included in reduction output + if as_index: + df.pop("key") + expected = df.take([0, 1, 3, 4, 6, 7]) tm.assert_frame_equal(result, expected) @@ -386,7 +394,8 @@ def test_apply_no_name_column_conflict(): grouped.apply(lambda x: x.sort_values("value", inplace=True)) -def test_apply_typecast_fail(): +@pytest.mark.parametrize("as_index", [True, False]) +def test_apply_typecast_fail(as_index): df = DataFrame( { "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], @@ -400,7 +409,12 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d").apply(f) + result = df.groupby("d", as_index=as_index).apply(f) + + # GH 28549 + # key no longer included in reduction output + if as_index: + df.pop("d") expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -426,6 +440,10 @@ def f(group): result = df.groupby("d").apply(f) + # GH 28549 + # key no longer included in reduction output + df.pop("d") + expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -638,24 +656,37 @@ def test_func(x): tm.assert_frame_equal(result, expected) -def test_groupby_apply_none_first(): +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "groups, vars_, expected_vars, expected_groups", + [ + ([1, 1, 1, 2], [0, 1, 2, 3], [0, 2], [1, 1]), + ([1, 2, 2, 2], [0, 1, 2, 3], [1, 3], [2, 2]), + ], +) +def test_groupby_apply_none_first( + groups, vars_, expected_vars, expected_groups, as_index +): # GH 12824. Tests if apply returns None first. - test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) - test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) + test_df = DataFrame({"groups": groups, "vars": vars_}) def test_func(x): if x.shape[0] < 2: return None return x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) - expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) - expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) - tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) + result = test_df.groupby("groups", as_index=as_index).apply(test_func) + + # GH 28549 "groups" should not be in output of apply + # unless as_index=True + if not as_index: + expected = DataFrame( + {"groups": expected_groups, "vars": expected_vars}, index=result.index + ) + else: + expected = DataFrame({"vars": expected_vars}, index=result.index) + + tm.assert_frame_equal(result, expected) def test_groupby_apply_return_empty_chunk(): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5391cb5ce821f..75c6810d314ff 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -95,9 +95,16 @@ def f(x): return x.drop_duplicates("person_name").iloc[0] result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + + # GH 28549 + # grouper key should not be present after apply + # with as_index=True. + # TODO split this into multiple tests + dropped = x.drop("person_id", 1) + + expected = dropped.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + expected["person_name"] = expected["person_name"] tm.assert_frame_equal(result, expected) # GH 9921 @@ -1218,7 +1225,10 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( + # GH2849 This needs to use as_index=False so that + # var is still present when grouping or else another key error + # will raise about var. + df.groupby("var", as_index=False).apply( lambda rows: pd.DataFrame( {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 571e710ba8928..be5ac6e2a5745 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -87,10 +87,6 @@ def test_intercept_builtin_sum(): tm.assert_series_equal(result2, expected) -# @pytest.mark.parametrize("f", [max, min, sum]) -# def test_builtins_apply(f): - - @pytest.mark.parametrize("f", [max, min, sum]) @pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key def test_builtins_apply(keys, f): @@ -102,10 +98,17 @@ def test_builtins_apply(keys, f): result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( - result.shape, ngroups + # GH 28549 + # grouping keys should not be included in output + if isinstance(keys, list): + result_shape = len(df.columns) - len(keys) + else: + result_shape = len(df.columns) - 1 + + assert_msg = "invalid frame shape: {} (expected ({}, {}))".format( + result.shape, ngroups, result_shape ) - assert result.shape == (ngroups, 3), assert_msg + assert result.shape == (ngroups, result_shape), assert_msg tm.assert_frame_equal( result, # numpy's equivalent function @@ -113,10 +116,15 @@ def test_builtins_apply(keys, f): ) if f != sum: - expected = df.groupby(keys).agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) + # GH 28549 + # No longer need to reset/set index here + expected = df.groupby(keys).agg(fname) tm.assert_frame_equal(result, expected, check_dtype=False) + # GH 28549 + # grouping keys should not be in output + df = df.drop(keys, 1) + tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) @@ -341,10 +349,13 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) # GH 13994 - result = df.groupby("A").cumsum(axis=1) + # GH 28549 + # Good represention of when as_index=False is now behaving + # as expected + result = df.groupby("A", as_index=False).cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) - result = df.groupby("A").cumprod(axis=1) + result = df.groupby("A", as_index=False).cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) @@ -1107,7 +1118,10 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + + # GH 28549 + # don't need to drop key here anymore + right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dff5baa9b5984..d61defca5684d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -89,9 +89,12 @@ def max_value(group): applied = df.groupby("A").apply(max_value) result = applied.dtypes + + # GH 28549 + # "A" should not be in output anymore expected = Series( - [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], - index=["A", "B", "C", "D", "value"], + [np.dtype("object")] + [np.dtype("float64")] * 2 + [np.dtype("int64")], + index=["B", "C", "D", "value"], ) assert_series_equal(result, expected) @@ -944,16 +947,29 @@ def f_no_copy(x): assert_series_equal(grpby_copy, grpby_no_copy) -def test_no_mutate_but_looks_like(): +@pytest.mark.parametrize("as_index", [True, False]) +def test_no_mutate_but_looks_like(as_index): # GH 8467 # first show's mutation indicator # second does not, but should yield the same results df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - assert_series_equal(result1, result2) + def run_test(df, as_index): + result1 = df.groupby("key", group_keys=True, as_index=as_index).apply( + lambda x: x[:].key + ) + result2 = df.groupby("key", group_keys=True, as_index=as_index).apply( + lambda x: x.key + ) + return result1, result2 + + if as_index: + with pytest.raises(AttributeError): + run_test(df, as_index) + else: + result1, result2 = run_test(df, as_index) + assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): @@ -1076,7 +1092,8 @@ def test_consistency_name(): assert_series_equal(result, expected) -def test_groupby_name_propagation(df): +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_name_propagation(df, as_index): # GH 6124 def summarize(df, name=None): return Series({"count": 1, "mean": 2, "omissions": 3}, name=name) @@ -1087,12 +1104,14 @@ def summarize_random_name(df): # inconsistent. return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A", as_index=as_index).apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A", as_index=as_index).apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) - assert metrics.columns.name is None + + if not as_index: + metrics = df.groupby("A", as_index=as_index).apply(summarize_random_name) + assert metrics.columns.name is None def test_groupby_nonstring_columns(): @@ -1341,12 +1360,20 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): _check_groupby(df, result, ["a", "b"], "d") -def test_dont_clobber_name_column(): +@pytest.mark.parametrize("as_index", [True, False]) +def test_dont_clobber_name_column(as_index): df = DataFrame( {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key").apply(lambda x: x) + result = df.groupby("key", as_index=as_index).apply(lambda x: x) + + # GH 28549 + # test both True and False for as index to ensure + # proper reduction + if as_index: + df.pop("key") + assert_frame_equal(result, df) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index d3972e6ba9008..cc9e28153d83e 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -455,7 +455,8 @@ def test_groupby_transform_with_nan_group(): assert_series_equal(result, expected) -def test_transform_mixed_type(): +@pytest.mark.parametrize("as_index", [True, False]) +def test_transform_mixed_type(as_index): index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) df = DataFrame( { @@ -470,16 +471,23 @@ def f(group): group["g"] = group["d"] * 2 return group[:1] - grouped = df.groupby("c") + grouped = df.groupby("c", as_index=as_index) result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! with pd.option_context("mode.chained_assignment", None): - for key, group in grouped: + for index, (key, group) in enumerate(grouped): res = f(group) - assert_frame_equal(res, result.loc[key]) + # GH 28549 + # if as_index need to drop column from res + if as_index: + res = res.drop("c", 1) + + k = key if as_index else index + + assert_frame_equal(res, result.loc[k]) def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 9053a7ebfea2b..07f5f7e1e379e 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -241,9 +241,9 @@ def test_resample_groupby_with_label(): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + # GH 28549 col0 should not be included in the output since + # it is a grouper key + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) assert_frame_equal(result, expected) From 63677e71b785d096e1c596765f24eb86b974aa7e Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 21 Oct 2019 11:40:07 -0500 Subject: [PATCH 02/12] fixed resample docstring --- pandas/core/groupby/groupby.py | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c184e8d05b77f..5e3559815079f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1465,7 +1465,7 @@ def resample(self, rule, *args, **kwargs): ... columns=['a', 'b']) >>> df.iloc[2, 0] = 5 >>> df - a b + a b 2000-01-01 00:00:00 0 1 2000-01-01 00:01:00 0 1 2000-01-01 00:02:00 5 1 @@ -1475,63 +1475,63 @@ def resample(self, rule, *args, **kwargs): the timestamps falling into a bin. >>> df.groupby('a').resample('3T').sum() - a b + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. >>> df.groupby('a').resample('30S').sum() - a b + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. Values are assigned to the month of the period. >>> df.groupby('a').resample('M').sum() - a b + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. >>> df.groupby('a').resample('3T', closed='right').sum() - a b + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. >>> df.groupby('a').resample('3T', closed='right', label='right').sum() - a b + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 Add an offset of twenty seconds. >>> df.groupby('a').resample('3T', loffset='20s').sum() - a b + b a - 0 2000-01-01 00:00:20 0 2 - 2000-01-01 00:03:20 0 1 - 5 2000-01-01 00:00:20 5 1 + 0 2000-01-01 00:00:20 2 + 2000-01-01 00:03:20 1 + 5 2000-01-01 00:00:20 1 """ from pandas.core.resample import get_resampler_for_grouping From 1d99d9cf6fbca26012530df7ed6658b0d4457d13 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 21 Oct 2019 12:32:39 -0500 Subject: [PATCH 03/12] Added fixture and cleaned up tests --- pandas/core/groupby/groupby.py | 6 +++--- pandas/tests/groupby/conftest.py | 5 +++++ pandas/tests/groupby/test_apply.py | 3 --- pandas/tests/groupby/test_groupby.py | 3 --- pandas/tests/groupby/test_transform.py | 1 - 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5e3559815079f..17f532dfbd713 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -721,9 +721,9 @@ def f(g): f = func # ignore SettingWithCopy here in case the user mutates - with option_context( - "mode.chained_assignment", None - ) as _, _group_selection_context(self) as _: + with option_context("mode.chained_assignment", None), _group_selection_context( + self + ): try: result = self._python_apply_general(f) except TypeError: diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 72e60c5099304..a1aec599a053b 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -110,3 +110,8 @@ def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. """ return request.param + + +@pytest.fixture +def as_index(): + return [True, False] diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index b5f664332cf82..90663c4b6fd0d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -363,7 +363,6 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -@pytest.mark.parametrize("as_index", [False, True]) def test_apply_chunk_view(as_index): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) @@ -394,7 +393,6 @@ def test_apply_no_name_column_conflict(): grouped.apply(lambda x: x.sort_values("value", inplace=True)) -@pytest.mark.parametrize("as_index", [True, False]) def test_apply_typecast_fail(as_index): df = DataFrame( { @@ -656,7 +654,6 @@ def test_func(x): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "groups, vars_, expected_vars, expected_groups", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d61defca5684d..baf8b84bc5eb1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -947,7 +947,6 @@ def f_no_copy(x): assert_series_equal(grpby_copy, grpby_no_copy) -@pytest.mark.parametrize("as_index", [True, False]) def test_no_mutate_but_looks_like(as_index): # GH 8467 @@ -1092,7 +1091,6 @@ def test_consistency_name(): assert_series_equal(result, expected) -@pytest.mark.parametrize("as_index", [True, False]) def test_groupby_name_propagation(df, as_index): # GH 6124 def summarize(df, name=None): @@ -1360,7 +1358,6 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): _check_groupby(df, result, ["a", "b"], "d") -@pytest.mark.parametrize("as_index", [True, False]) def test_dont_clobber_name_column(as_index): df = DataFrame( {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index cc9e28153d83e..255c719573f6a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -455,7 +455,6 @@ def test_groupby_transform_with_nan_group(): assert_series_equal(result, expected) -@pytest.mark.parametrize("as_index", [True, False]) def test_transform_mixed_type(as_index): index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) df = DataFrame( From 98bc6733c08e77701c4f5f0b49e4fb514349c6a7 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Tue, 22 Oct 2019 20:39:47 -0500 Subject: [PATCH 04/12] Whatsnew and added match to test --- doc/source/whatsnew/v1.0.0.rst | 37 ++++++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 48c1173a372a7..5a13fb226dd6d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -174,6 +174,43 @@ Backwards incompatible API changes pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +.. _whatsnew_1000.api_breaking.GroupBy.apply: + +``GroupBy.apply`` behaves consistently with `as_index` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- The result of :meth:`GroupBy.apply` sometimes contained the grouper column(s), + in both the index, and in the `DataFrame`. From Pandas 1.0, :meth:`GroupBy.apply` + will respect the `as_index` parameter, and only return the grouper column(s) in + the result if `as_index` is set to `False`. + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + + In [2]: df.groupby("a").apply(lambda x: x.sum()) + Out[2]: + a b + a + 1 2 3 + 2 4 7 + 3 6 11 + +*pandas 1.0.0* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + + In [2]: df.groupby("a").apply(lambda x: x.sum()) + Out[2]: + b + a + 1 3 + 2 7 + 3 11 .. _whatsnew_1000.api.other: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index baf8b84bc5eb1..7457521c016f4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -964,7 +964,7 @@ def run_test(df, as_index): return result1, result2 if as_index: - with pytest.raises(AttributeError): + with pytest.raises(AttributeError, match="'key'$"): run_test(df, as_index) else: result1, result2 = run_test(df, as_index) From fbf320267de4d3599939a76b2437b57892a22be3 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Wed, 23 Oct 2019 15:23:14 -0500 Subject: [PATCH 05/12] conftest docstring and whatsnew example --- doc/source/whatsnew/v1.0.0.rst | 28 ++++++++++++++++++++++++++++ pandas/tests/groupby/conftest.py | 2 ++ 2 files changed, 30 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5a13fb226dd6d..14abb8aa4aec5 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -212,6 +212,34 @@ Backwards incompatible API changes 2 7 3 11 +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + + In [2]: df.groupby("a").apply(lambda x: x.iloc[0]) + Out[2]: + a b + a + 1 1 1 + 2 2 3 + 3 3 5 + +*pandas 1.0.0* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + + In [2]: df.groupby("a").apply(lambda x: x.iloc[0]) + Out[2]: + b + a + 1 1 + 2 3 + 3 5 + .. _whatsnew_1000.api.other: Other API changes diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index a1aec599a053b..597d3bbf764e3 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -114,4 +114,6 @@ def reduction_func(request): @pytest.fixture def as_index(): + """yields all possible options for the `as_index` parameter, one at a time. + """ return [True, False] From 947a5bdb9e4a032311ef14f8788e72da5d67e45f Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 28 Oct 2019 10:53:24 -0500 Subject: [PATCH 06/12] Changes to tests and whatsnew --- doc/source/whatsnew/v1.0.0.rst | 80 +++++++++++++++--------- pandas/core/groupby/groupby.py | 27 ++++---- pandas/tests/groupby/conftest.py | 7 +++ pandas/tests/groupby/test_apply.py | 6 +- pandas/tests/groupby/test_categorical.py | 15 +++-- pandas/tests/groupby/test_function.py | 6 -- pandas/tests/groupby/test_groupby.py | 28 ++------- pandas/tests/groupby/test_transform.py | 7 +-- 8 files changed, 93 insertions(+), 83 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 14abb8aa4aec5..0c85ca5c8d403 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -179,14 +179,15 @@ Backwards incompatible API changes ``GroupBy.apply`` behaves consistently with `as_index` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- The result of :meth:`GroupBy.apply` sometimes contained the grouper column(s), - in both the index, and in the `DataFrame`. From Pandas 1.0, :meth:`GroupBy.apply` - will respect the `as_index` parameter, and only return the grouper column(s) in - the result if `as_index` is set to `False`. +- Previously, the result of :meth:`GroupBy.apply` sometimes contained the grouper column(s), + in both the index, and in the `DataFrame`. :meth:`GroupBy.apply` + now respects the ``as_index`` parameter, and only returns the grouper column(s) in + the result if ``as_index`` is set to `False`. Other methods such as :meth:`GroupBy.resample` + exhibited similar behavior and now also respect the ``as_index`` parameter. -*pandas 0.25.x* +*Previous Behavior* -.. code-block:: ipython +.. ipython:: python In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) @@ -198,9 +199,34 @@ Backwards incompatible API changes 2 4 7 3 6 11 -*pandas 1.0.0* + In [3]: df.groupby("a").apply(lambda x: x.iloc[0]) + Out[3]: + a b + a + 1 1 1 + 2 2 3 + 3 3 5 -.. code-block:: ipython + In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T') + + In [5]: df = pd.DataFrame(data=4 * [range(2)], + ...: index=idx, + ...: columns=['a', 'b']) + + In [6]: df.iloc[2, 0] = 5 + + In [7]: df.groupby('a').resample('M').sum() + Out[7]: + a b + a + 0 2000-01-31 0 3 + 5 2000-01-31 5 1 + + + +*Current Behavior* + +.. ipython:: python In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) @@ -212,33 +238,29 @@ Backwards incompatible API changes 2 7 3 11 -*pandas 0.25.x* - -.. code-block:: ipython - - In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) - - In [2]: df.groupby("a").apply(lambda x: x.iloc[0]) - Out[2]: - a b + In [3]: df.groupby("a").apply(lambda x: x.iloc[0]) + Out[3]: + b a - 1 1 1 - 2 2 3 - 3 3 5 + 1 1 + 2 3 + 3 5 -*pandas 1.0.0* + In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T') -.. code-block:: ipython + In [5]: df = pd.DataFrame(data=4 * [range(2)], + ...: index=idx, + ...: columns=['a', 'b']) - In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + In [6]: df.iloc[2, 0] = 5 - In [2]: df.groupby("a").apply(lambda x: x.iloc[0]) - Out[2]: - b + In [7]: df.groupby('a').resample('M').sum() + Out[7]: + b a - 1 1 - 2 3 - 3 5 + 0 2000-01-31 3 + 5 2000-01-31 1 + .. _whatsnew_1000.api.other: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 17f532dfbd713..9cbc31d13019d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -721,20 +721,19 @@ def f(g): f = func # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None), _group_selection_context( - self - ): - try: - result = self._python_apply_general(f) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - return self._python_apply_general(f) + with option_context("mode.chained_assignment", None): + with _group_selection_context(self): + try: + result = self._python_apply_general(f) + except TypeError: + # gh-20949 + # try again, with .apply acting as a filtering + # operation, by excluding the grouping column + # This would normally not be triggered + # except if the udf is trying an operation that + # fails on *some* columns, e.g. a numeric operation + # on a string grouper column + return self._python_apply_general(f) return result diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 597d3bbf764e3..51dccc730451b 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -117,3 +117,10 @@ def as_index(): """yields all possible options for the `as_index` parameter, one at a time. """ return [True, False] + + +@pytest.fixture +def group_keys(): + """yields all possible options for the `group_keys` parameter, one at a time. + """ + return [True, False] diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 90663c4b6fd0d..aa56b15d9c856 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -363,15 +363,13 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -def test_apply_chunk_view(as_index): +def test_apply_chunk_view(as_index, group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=False, as_index=as_index).apply( + result = df.groupby("key", group_keys=group_keys, as_index=as_index).apply( lambda x: x[:2] ) - # GH 28549 - # key no longer included in reduction output if as_index: df.pop("key") diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 75c6810d314ff..afdf6870c4fa1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1225,10 +1225,17 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - # GH2849 This needs to use as_index=False so that - # var is still present when grouping or else another key error - # will raise about var. - df.groupby("var", as_index=False).apply( + df.groupby("var").apply( + lambda rows: pd.DataFrame({"val": [rows.iloc[-1]["vau"]]}) + ) + + +def test_category_as_grouper_keys(as_index): + # Accessing a key that is not in the dataframe + df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) + bad_key = "'var'" if as_index else "'vau'" + with pytest.raises(KeyError, match=bad_key): + df.groupby("var").apply( lambda rows: pd.DataFrame( {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index be5ac6e2a5745..e02cfebc96b3d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -349,9 +349,6 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) # GH 13994 - # GH 28549 - # Good represention of when as_index=False is now behaving - # as expected result = df.groupby("A", as_index=False).cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) @@ -1118,9 +1115,6 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - - # GH 28549 - # don't need to drop key here anymore right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7457521c016f4..4e3ee8eba6032 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -90,8 +90,6 @@ def max_value(group): applied = df.groupby("A").apply(max_value) result = applied.dtypes - # GH 28549 - # "A" should not be in output anymore expected = Series( [np.dtype("object")] + [np.dtype("float64")] * 2 + [np.dtype("int64")], index=["B", "C", "D", "value"], @@ -947,28 +945,17 @@ def f_no_copy(x): assert_series_equal(grpby_copy, grpby_no_copy) -def test_no_mutate_but_looks_like(as_index): - +def test_no_mutate_but_looks_like(): # GH 8467 # first show's mutation indicator # second does not, but should yield the same results df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - def run_test(df, as_index): - result1 = df.groupby("key", group_keys=True, as_index=as_index).apply( - lambda x: x[:].key - ) - result2 = df.groupby("key", group_keys=True, as_index=as_index).apply( - lambda x: x.key - ) - return result1, result2 - - if as_index: - with pytest.raises(AttributeError, match="'key'$"): - run_test(df, as_index) - else: - result1, result2 = run_test(df, as_index) - assert_series_equal(result1, result2) + result1 = df.groupby("key", group_keys=True, as_index=False).apply( + lambda x: x[:].key + ) + result2 = df.groupby("key", group_keys=True, as_index=False).apply(lambda x: x.key) + assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): @@ -1365,9 +1352,6 @@ def test_dont_clobber_name_column(as_index): result = df.groupby("key", as_index=as_index).apply(lambda x: x) - # GH 28549 - # test both True and False for as index to ensure - # proper reduction if as_index: df.pop("key") diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 255c719573f6a..0e478c361f1fa 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -479,12 +479,11 @@ def f(group): with pd.option_context("mode.chained_assignment", None): for index, (key, group) in enumerate(grouped): res = f(group) - # GH 28549 - # if as_index need to drop column from res if as_index: res = res.drop("c", 1) - - k = key if as_index else index + k = key + else: + k = index assert_frame_equal(res, result.loc[k]) From fa21e29878a27be45baa48cd4feb75efe92c194d Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 28 Oct 2019 12:31:54 -0500 Subject: [PATCH 07/12] Had to parameterize the test because of group keys --- pandas/tests/groupby/test_apply.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index aa56b15d9c856..9501cd9fb643b 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -363,17 +363,33 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -def test_apply_chunk_view(as_index, group_keys): +@pytest.mark.parametrize( + "as_index, group_keys, expected_index", + [ + ( + True, + True, + MultiIndex.from_tuples( + [(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None] + ), + ), + (True, False, [0, 1, 2, 3]), + (False, True, MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2), (1, 3)])), + (False, False, [0, 1, 2, 3]), + ], +) +def test_apply_chunk_view(as_index, group_keys, expected_index): # Low level tinkering could be unsafe, make sure not - df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) + df = DataFrame({"key": [1, 1, 2, 2], "value": range(4)}) - result = df.groupby("key", group_keys=group_keys, as_index=as_index).apply( + result = df.groupby("key", as_index=as_index, group_keys=group_keys).apply( lambda x: x[:2] ) if as_index: df.pop("key") - expected = df.take([0, 1, 3, 4, 6, 7]) + expected = df.copy() + expected.index = expected_index tm.assert_frame_equal(result, expected) From 76815f1b5bba6611083c212822716e7f15779f0c Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 31 Oct 2019 20:27:43 -0500 Subject: [PATCH 08/12] Update test_transform.py --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6f6d1f00de6d4..0324b92170647 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -484,7 +484,7 @@ def f(group): else: k = index - assert_frame_equal(res, result.loc[k]) + tm.assert_frame_equal(res, result.loc[k]) def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): From 6c49a16cf834d671fcc071e83c5738e887728f21 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 31 Oct 2019 20:28:21 -0500 Subject: [PATCH 09/12] Update test_groupby.py --- pandas/tests/groupby/test_groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bbae153b6177c..e7188e8eb7429 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -952,7 +952,7 @@ def test_no_mutate_but_looks_like(): lambda x: x[:].key ) result2 = df.groupby("key", group_keys=True, as_index=False).apply(lambda x: x.key) - assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): @@ -1352,7 +1352,7 @@ def test_dont_clobber_name_column(as_index): if as_index: df.pop("key") - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def test_skip_group_keys(): From 8a4c1f82db15cf35becefa67f56991ec5a9871b1 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Fri, 1 Nov 2019 09:37:50 -0500 Subject: [PATCH 10/12] Updated syntax in ipython block --- doc/source/whatsnew/v1.0.0.rst | 83 ++++++++-------------------------- 1 file changed, 18 insertions(+), 65 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index db1ad92951dc1..16735ba0ebe82 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -192,77 +192,30 @@ Backwards incompatible API changes .. ipython:: python - In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) - - In [2]: df.groupby("a").apply(lambda x: x.sum()) - Out[2]: - a b - a - 1 2 3 - 2 4 7 - 3 6 11 - - In [3]: df.groupby("a").apply(lambda x: x.iloc[0]) - Out[3]: - a b - a - 1 1 1 - 2 2 3 - 3 3 5 - - In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T') - - In [5]: df = pd.DataFrame(data=4 * [range(2)], - ...: index=idx, - ...: columns=['a', 'b']) - - In [6]: df.iloc[2, 0] = 5 - - In [7]: df.groupby('a').resample('M').sum() - Out[7]: - a b - a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 - + df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + df.groupby("a").apply(lambda x: x.sum()) + df.groupby("a").apply(lambda x: x.iloc[0]) + idx = pd.date_range('1/1/2000', periods=4, freq='T') + df = pd.DataFrame(data=4 * [range(2)], + index=idx, + columns=['a', 'b']) + df.iloc[2, 0] = 5 + df.groupby('a').resample('M').sum() *Current Behavior* .. ipython:: python - In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) - - In [2]: df.groupby("a").apply(lambda x: x.sum()) - Out[2]: - b - a - 1 3 - 2 7 - 3 11 - - In [3]: df.groupby("a").apply(lambda x: x.iloc[0]) - Out[3]: - b - a - 1 1 - 2 3 - 3 5 - - In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T') - - In [5]: df = pd.DataFrame(data=4 * [range(2)], - ...: index=idx, - ...: columns=['a', 'b']) - - In [6]: df.iloc[2, 0] = 5 - - In [7]: df.groupby('a').resample('M').sum() - Out[7]: - b - a - 0 2000-01-31 3 - 5 2000-01-31 1 + df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) + df.groupby("a").apply(lambda x: x.sum()) + df.groupby("a").apply(lambda x: x.iloc[0]) + idx = pd.date_range('1/1/2000', periods=4, freq='T') + df = pd.DataFrame(data=4 * [range(2)], + index=idx, + columns=['a', 'b']) + df.iloc[2, 0] = 5 + df.groupby('a').resample('M').sum() .. _whatsnew_1000.api.other: From cfacfc1532cb1b3dde607cf7a3b03ee011a3bf9f Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Wed, 20 Nov 2019 08:59:39 -0600 Subject: [PATCH 11/12] Indent --- doc/source/whatsnew/v1.0.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2613410a8790e..68f8ca428a031 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -230,9 +230,9 @@ New repr for :class:`pandas.core.arrays.IntervalArray` In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T') - In [5]: df = pd.DataFrame(data=4 * [range(2)], - ...: index=idx, - ...: columns=['a', 'b']) + In [5]: df = pd.DataFrame(data=4 * [range(2)], + ...: index=idx, + ...: columns=['a', 'b']) In [6]: df.iloc[2, 0] = 5 From 83be0290257a5e4321fd10e2710ec783412c81fb Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Tue, 4 Feb 2020 08:18:43 -0600 Subject: [PATCH 12/12] More tests changed with bad apply behavior --- pandas/tests/groupby/test_apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d81ff49e37b45..19fc58a275a73 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -512,7 +512,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y").apply(lambda x: x) + result = df.groupby("Y", as_index=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -847,7 +847,7 @@ def test_groupby_apply_datetime_result_dtypes(): ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + result = data.groupby("color", as_index=False).apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), np.object, np.object, np.int64, np.object], index=["observation", "color", "mood", "intensity", "score"], @@ -867,7 +867,7 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group").apply(lambda x: x) + result = df.groupby("group", as_index=False).apply(lambda x: x) tm.assert_frame_equal(result, df)