From b0a9de9b5181f19317b431e6c054341267963315 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 May 2021 15:42:50 -0700 Subject: [PATCH 1/2] BUG: resample.apply with non-unique columns --- pandas/core/groupby/generic.py | 21 +++++++++++++------- pandas/tests/groupby/test_groupby.py | 20 +++++++++++++++++++ pandas/tests/resample/test_datetime_index.py | 8 ++++++-- pandas/tests/resample/test_timedelta.py | 8 +++++++- 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cb5b54ca0c598..71d19cdd877a6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1106,6 +1106,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: result: dict[Hashable, NDFrame | np.ndarray] = {} if axis != obj._info_axis_number: + # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns for name, data in self: fres = func(data, *args, **kwargs) result[name] = fres @@ -1119,18 +1120,23 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 + # tests that get here with non-unique cols: + # test_resample_with_timedelta_yields_no_empty_groups, + # test_resample_apply_product obj = self._obj_with_exclusions result: dict[int | str, NDFrame] = {} - for item in obj: - data = obj[item] - colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) - - result[item] = colg.aggregate(func, *args, **kwargs) + for i, item in enumerate(obj): + ser = obj.iloc[:, i] + colg = SeriesGroupBy( + ser, selection=item, grouper=self.grouper, exclusions=self.exclusions + ) - result_columns = obj.columns + result[i] = colg.aggregate(func, *args, **kwargs) - return self.obj._constructor(result, columns=result_columns) + res_df = self.obj._constructor(result) + res_df.columns = obj.columns + return res_df def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -1401,6 +1407,7 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: # iterate through columns, see test_transform_exclude_nuisance + # gets here with non-unique columns output = {} inds = [] for i, col in enumerate(obj): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4368e57a7da4d..83aeb29ec53df 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -248,6 +248,26 @@ def f(x, q=None, axis=0): tm.assert_frame_equal(apply_result, expected, check_names=False) +@pytest.mark.parametrize("as_index", [True, False]) +def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): + # go through _aggregate_frame with self.axis == 0 and duplicate columns + tsframe.columns = ["A", "B", "A", "C"] + gb = tsframe.groupby(lambda x: x.month, as_index=as_index) + + res = gb.agg(np.percentile, 80, axis=0) + + ex_data = { + 1: tsframe[tsframe.index.month == 1].quantile(0.8), + 2: tsframe[tsframe.index.month == 2].quantile(0.8), + } + expected = DataFrame(ex_data).T + if not as_index: + # TODO: try to get this more consistent? + expected.index = Index(range(2)) + + tm.assert_frame_equal(res, expected) + + def test_len(): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 66cb2f2291e98..1c7aa5c444da9 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1748,19 +1748,23 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last): assert result == expected -def test_resample_apply_product(): +@pytest.mark.parametrize("duplicates", [True, False]) +def test_resample_apply_product(duplicates): # GH 5586 index = date_range(start="2012-01-31", freq="M", periods=12) ts = Series(range(12), index=index) df = DataFrame({"A": ts, "B": ts + 2}) + if duplicates: + df.columns = ["A", "A"] + result = df.resample("Q").apply(np.product) expected = DataFrame( np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), index=DatetimeIndex( ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC" ), - columns=["A", "B"], + columns=df.columns, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index b1560623cd871..e127f69b12674 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -153,18 +153,24 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): assert not np.isnan(result[-1]) -def test_resample_with_timedelta_yields_no_empty_groups(): +@pytest.mark.parametrize("duplicates", [True, False]) +def test_resample_with_timedelta_yields_no_empty_groups(duplicates): # GH 10603 df = DataFrame( np.random.normal(size=(10000, 4)), index=timedelta_range(start="0s", periods=10000, freq="3906250n"), ) + if duplicates: + # case with non-unique columns + df.columns = ["A", "B", "A", "C"] + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) expected = DataFrame( [[768] * 4] * 12 + [[528] * 4], index=timedelta_range(start="1s", periods=13, freq="3s"), ) + expected.columns = df.columns tm.assert_frame_equal(result, expected) From 30841cfd86954024f5a18188a42e9950d2c7c692 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 May 2021 09:23:40 -0700 Subject: [PATCH 2/2] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 84f9dae8a0850..793818419c910 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -901,6 +901,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`) - Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`) - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) +- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) Reshaping ^^^^^^^^^