Commit 069b65a

BUG: resample.apply with non-unique columns (pandas-dev#41445)
1 parent 25e7ed9 commit 069b65a

File tree

5 files changed: +48 -10

  doc/source/whatsnew/v1.3.0.rst
  pandas/core/groupby/generic.py
  pandas/tests/groupby/test_groupby.py
  pandas/tests/resample/test_datetime_index.py
  pandas/tests/resample/test_timedelta.py

doc/source/whatsnew/v1.3.0.rst (+1)

@@ -901,6 +901,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`)
 - Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
 - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
+- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
 
 Reshaping
 ^^^^^^^^^
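
To illustrate the user-facing change described in the whatsnew entry: before this fix, Resampler.apply on a frame with duplicated column labels silently dropped the repeated columns; afterwards all columns survive. A minimal sketch, adapted from test_resample_apply_product further below (the data values here are illustrative only):

import numpy as np
import pandas as pd

# Two columns that share the label "A"
idx = pd.date_range(start="2012-01-31", freq="M", periods=12)
ts = pd.Series(range(12), index=idx)
df = pd.DataFrame({"A": ts, "B": ts + 2})
df.columns = ["A", "A"]

# With this fix, the resampled result keeps both duplicated columns;
# previously one of them was dropped.
result = df.resample("Q").apply(np.prod)
print(list(result.columns))  # ['A', 'A']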

pandas/core/groupby/generic.py (+14 -7)

@@ -1106,6 +1106,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
 
         result: dict[Hashable, NDFrame | np.ndarray] = {}
         if axis != obj._info_axis_number:
+            # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns
             for name, data in self:
                 fres = func(data, *args, **kwargs)
                 result[name] = fres
@@ -1119,18 +1120,23 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
 
     def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
         # only for axis==0
+        # tests that get here with non-unique cols:
+        #  test_resample_with_timedelta_yields_no_empty_groups,
+        #  test_resample_apply_product
 
         obj = self._obj_with_exclusions
         result: dict[int | str, NDFrame] = {}
-        for item in obj:
-            data = obj[item]
-            colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
-
-            result[item] = colg.aggregate(func, *args, **kwargs)
+        for i, item in enumerate(obj):
+            ser = obj.iloc[:, i]
+            colg = SeriesGroupBy(
+                ser, selection=item, grouper=self.grouper, exclusions=self.exclusions
+            )
 
-        result_columns = obj.columns
+            result[i] = colg.aggregate(func, *args, **kwargs)
 
-        return self.obj._constructor(result, columns=result_columns)
+        res_df = self.obj._constructor(result)
+        res_df.columns = obj.columns
+        return res_df
 
     def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
         if len(keys) == 0:
@@ -1401,6 +1407,7 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
 
     def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
         # iterate through columns, see test_transform_exclude_nuisance
+        # gets here with non-unique columns
        output = {}
        inds = []
        for i, col in enumerate(obj):
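
The key change in _aggregate_item_by_item above is iterating columns by position (obj.iloc[:, i]) and keying the intermediate result dict by integer position rather than by label. With non-unique columns, label-based selection of a repeated name returns a DataFrame instead of a single Series, and a dict keyed by label collapses the duplicates; positional keys avoid both problems, and the original labels are reattached at the end. A rough standalone sketch of that distinction (plain pandas, not the groupby internals):

import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=["A", "A", "C"])

# Label-based selection of a duplicated name yields a DataFrame, not a Series.
print(type(df["A"]).__name__)         # DataFrame
# Positional selection always yields exactly one column.
print(type(df.iloc[:, 0]).__name__)   # Series

# Keying by label collapses the two "A" columns into a single entry ...
by_label = {col: df[col] for col in df.columns}
print(len(by_label))                  # 2

# ... while keying by position keeps every column; the labels are restored
# afterwards, mirroring the patched method's res_df.columns = obj.columns.
by_pos = {i: df.iloc[:, i] for i in range(df.shape[1])}
res = pd.DataFrame(by_pos)
res.columns = df.columns
print(list(res.columns))              # ['A', 'A', 'C']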

pandas/tests/groupby/test_groupby.py (+20)

@@ -248,6 +248,26 @@ def f(x, q=None, axis=0):
     tm.assert_frame_equal(apply_result, expected, check_names=False)
 
 
+@pytest.mark.parametrize("as_index", [True, False])
+def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
+    # go through _aggregate_frame with self.axis == 0 and duplicate columns
+    tsframe.columns = ["A", "B", "A", "C"]
+    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
+
+    res = gb.agg(np.percentile, 80, axis=0)
+
+    ex_data = {
+        1: tsframe[tsframe.index.month == 1].quantile(0.8),
+        2: tsframe[tsframe.index.month == 2].quantile(0.8),
+    }
+    expected = DataFrame(ex_data).T
+    if not as_index:
+        # TODO: try to get this more consistent?
+        expected.index = Index(range(2))
+
+    tm.assert_frame_equal(res, expected)
+
+
 def test_len():
     df = tm.makeTimeDataFrame()
     grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])

pandas/tests/resample/test_datetime_index.py (+6 -2)

@@ -1748,19 +1748,23 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last):
     assert result == expected
 
 
-def test_resample_apply_product():
+@pytest.mark.parametrize("duplicates", [True, False])
+def test_resample_apply_product(duplicates):
     # GH 5586
     index = date_range(start="2012-01-31", freq="M", periods=12)
 
     ts = Series(range(12), index=index)
     df = DataFrame({"A": ts, "B": ts + 2})
+    if duplicates:
+        df.columns = ["A", "A"]
+
     result = df.resample("Q").apply(np.product)
     expected = DataFrame(
         np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64),
         index=DatetimeIndex(
             ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC"
         ),
-        columns=["A", "B"],
+        columns=df.columns,
     )
     tm.assert_frame_equal(result, expected)

pandas/tests/resample/test_timedelta.py (+7 -1)

@@ -153,18 +153,24 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
     assert not np.isnan(result[-1])
 
 
-def test_resample_with_timedelta_yields_no_empty_groups():
+@pytest.mark.parametrize("duplicates", [True, False])
+def test_resample_with_timedelta_yields_no_empty_groups(duplicates):
     # GH 10603
     df = DataFrame(
         np.random.normal(size=(10000, 4)),
         index=timedelta_range(start="0s", periods=10000, freq="3906250n"),
     )
+    if duplicates:
+        # case with non-unique columns
+        df.columns = ["A", "B", "A", "C"]
+
     result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x))
 
     expected = DataFrame(
         [[768] * 4] * 12 + [[528] * 4],
         index=timedelta_range(start="1s", periods=13, freq="3s"),
     )
+    expected.columns = df.columns
     tm.assert_frame_equal(result, expected)