Skip to content

Commit 4526ea7

Browse files
undermyumbrella1, Kei, and mroeschke
authored
Update compute_dict_like to get all columns (#58452)
* Update compute_dict_like to get all columns * Add tests * Update rst * Remove newline from rst * Project the columns before converting to series group by * retrigger doc build * Account for 1d/series projection result * Declare var before assignment * Remove if condition * Add test to test agg list funcs --------- Co-authored-by: Kei <[email protected]> Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 728cfcb commit 4526ea7

File tree

3 files changed

+149
-5
lines changed

3 files changed

+149
-5
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ Groupby/resample/rolling
535535
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
536536
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
537537
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
538+
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
538539
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
539540
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
540541
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)

pandas/core/apply.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -471,8 +471,30 @@ def compute_dict_like(
471471

472472
keys += [key] * len(key_data)
473473
results += key_data
474-
else:
474+
elif is_groupby:
475475
# key used for column selection and output
476+
477+
df = selected_obj
478+
results, keys = [], []
479+
for key, how in func.items():
480+
cols = df[key]
481+
482+
if cols.ndim == 1:
483+
series_list = [obj._gotitem(key, ndim=1, subset=cols)]
484+
else:
485+
series_list = []
486+
for index in range(cols.shape[1]):
487+
col = cols.iloc[:, index]
488+
489+
series = obj._gotitem(key, ndim=1, subset=col)
490+
series_list.append(series)
491+
492+
for series in series_list:
493+
result = getattr(series, op_name)(how, **kwargs)
494+
results.append(result)
495+
keys.append(key)
496+
497+
else:
476498
results = [
477499
getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
478500
for key, how in func.items()
@@ -496,11 +518,14 @@ def wrap_results_dict_like(
496518
is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]
497519

498520
if all(is_ndframe):
499-
results = dict(zip(result_index, result_data))
521+
results = [result for result in result_data if not result.empty]
500522
keys_to_use: Iterable[Hashable]
501-
keys_to_use = [k for k in result_index if not results[k].empty]
523+
keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty]
502524
# Have to check, if at least one DataFrame is not empty.
503-
keys_to_use = keys_to_use if keys_to_use != [] else result_index
525+
if keys_to_use == []:
526+
keys_to_use = result_index
527+
results = result_data
528+
504529
if selected_obj.ndim == 2:
505530
# keys are columns, so we can preserve names
506531
ktu = Index(keys_to_use)
@@ -509,7 +534,7 @@ def wrap_results_dict_like(
509534

510535
axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
511536
result = concat(
512-
{k: results[k] for k in keys_to_use},
537+
results,
513538
axis=axis,
514539
keys=keys_to_use,
515540
)

pandas/tests/groupby/aggregate/test_aggregate.py

+118
Original file line numberDiff line numberDiff line change
@@ -1662,3 +1662,121 @@ def func(x):
16621662
msg = "length must not be 0"
16631663
with pytest.raises(ValueError, match=msg):
16641664
df.groupby("A", observed=False).agg(func)
1665+
1666+
1667+
def test_groupby_aggregation_duplicate_columns_single_dict_value():
1668+
# GH#55041
1669+
df = DataFrame(
1670+
[[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
1671+
columns=["a", "b", "c", "c"],
1672+
)
1673+
gb = df.groupby("a")
1674+
result = gb.agg({"c": "sum"})
1675+
1676+
expected = DataFrame(
1677+
[[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a")
1678+
)
1679+
tm.assert_frame_equal(result, expected)
1680+
1681+
1682+
def test_groupby_aggregation_duplicate_columns_multiple_dict_values():
1683+
# GH#55041
1684+
df = DataFrame(
1685+
[[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
1686+
columns=["a", "b", "c", "c"],
1687+
)
1688+
gb = df.groupby("a")
1689+
result = gb.agg({"c": ["sum", "min", "max", "min"]})
1690+
1691+
expected = DataFrame(
1692+
[[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]],
1693+
columns=MultiIndex(
1694+
levels=[["c"], ["sum", "min", "max"]],
1695+
codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]],
1696+
),
1697+
index=Index([1, 2], name="a"),
1698+
)
1699+
tm.assert_frame_equal(result, expected)
1700+
1701+
1702+
def test_groupby_aggregation_duplicate_columns_some_empty_result():
1703+
# GH#55041
1704+
df = DataFrame(
1705+
[
1706+
[1, 9843, 43, 54, 7867],
1707+
[2, 940, 9, -34, 44],
1708+
[1, -34, -546, -549358, 0],
1709+
[2, 244, -33, -100, 44],
1710+
],
1711+
columns=["a", "b", "b", "c", "c"],
1712+
)
1713+
gb = df.groupby("a")
1714+
result = gb.agg({"b": [], "c": ["var"]})
1715+
1716+
expected = DataFrame(
1717+
[[1.509268e11, 30944844.5], [2.178000e03, 0.0]],
1718+
columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]),
1719+
index=Index([1, 2], name="a"),
1720+
)
1721+
tm.assert_frame_equal(result, expected)
1722+
1723+
1724+
def test_groupby_aggregation_multi_index_duplicate_columns():
1725+
# GH#55041
1726+
df = DataFrame(
1727+
[
1728+
[1, -9843, 43, 54, 7867],
1729+
[2, 940, 9, -34, 44],
1730+
[1, -34, 546, -549358, 0],
1731+
[2, 244, -33, -100, 44],
1732+
],
1733+
columns=MultiIndex(
1734+
levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
1735+
codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
1736+
),
1737+
index=MultiIndex(
1738+
levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
1739+
codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
1740+
),
1741+
)
1742+
gb = df.groupby(level=0)
1743+
result = gb.agg({("level1.1", "level2.2"): "min"})
1744+
1745+
expected = DataFrame(
1746+
[[-9843, 9], [244, -33]],
1747+
columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]),
1748+
index=Index(["level1.1", "level1.2"]),
1749+
)
1750+
tm.assert_frame_equal(result, expected)
1751+
1752+
1753+
def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
1754+
# GH#55041
1755+
df = DataFrame(
1756+
[
1757+
[1, -9843, 43, 54, 7867],
1758+
[2, 940, 9, -34, 44],
1759+
[1, -34, 546, -549358, 0],
1760+
[2, 244, -33, -100, 44],
1761+
],
1762+
columns=MultiIndex(
1763+
levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
1764+
codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
1765+
),
1766+
index=MultiIndex(
1767+
levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
1768+
codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
1769+
),
1770+
)
1771+
gb = df.groupby(level=0)
1772+
result = gb.agg({("level1.1", "level2.2"): ["min", "max"]})
1773+
1774+
expected = DataFrame(
1775+
[[-9843, 940, 9, 546], [244, 244, -33, -33]],
1776+
columns=MultiIndex(
1777+
levels=[["level1.1"], ["level2.2"], ["min", "max"]],
1778+
codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]],
1779+
),
1780+
index=Index(["level1.1", "level1.2"]),
1781+
)
1782+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)