Skip to content

Commit 50d288e

Browse files
authored
BUG: groupby.describe on a frame with duplicate column names (#50846)
* REF: groupby Series selection with as_index=False
* GH#
* type-hinting fixes
* WIP
* WIP
* WIP
* BUG: groupby.describe on a frame with duplicate column names
* cleanup
* test fixup
* Fix type-hint for _group_selection
* Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhshadrach/pandas into groupby_select_obj_dup_cols
  # Conflicts:
  #   pandas/core/groupby/groupby.py
* Speedup
* refinement
* cleanup, faster implementation
* Make group_selection a Boolean flag
* Avoid resetting cache
* Improve test
* Rework test
1 parent 5645847 commit 50d288e

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,7 @@ Groupby/resample/rolling
12631263
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
12641264
- Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
12651265
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
1266+
- Bug in :meth:`.DataFrameGroupBy.describe` produced incorrect results when data had duplicate columns (:issue:`50806`)
12661267
-
12671268

12681269
Reshaping

pandas/tests/groupby/test_function.py

+40
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,27 @@ def test_describe_with_duplicate_output_column_names(as_index, keys):
12561256
tm.assert_frame_equal(result, expected)
12571257

12581258

1259+
def test_describe_duplicate_columns():
    """Check ``describe`` on a grouped frame whose column labels repeat.

    The frame has columns ``[0, 1, 2, 0]`` and is grouped by column ``1``.
    The expected result carries one block of statistics for each of the
    three remaining columns, in their original order (labels 0, 2, 0).
    """
    # GH#50806
    frame = DataFrame([[0, 1, 2, 3]])
    frame.columns = [0, 1, 2, 0]
    result = frame.groupby(frame[1]).describe(percentiles=[])

    stat_names = ["count", "mean", "std", "min", "50%", "max"]
    n_stats = len(stat_names)

    # One single-row block per remaining column; with a single observation
    # the expected std is NaN and every other statistic equals the value.
    pieces = []
    for val in (0.0, 2.0, 3.0):
        row = [1.0, val, np.nan, val, val, val]
        pieces.append(DataFrame([row], index=[1], columns=stat_names))
    expected = pd.concat(pieces, axis=1)

    # Outer level repeats label 0 for the first and last block.
    expected.columns = MultiIndex(
        levels=[[0, 2], stat_names],
        codes=[
            [0] * n_stats + [1] * n_stats + [0] * n_stats,
            list(range(n_stats)) * 3,
        ],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)
1278+
1279+
12591280
def test_groupby_mean_no_overflow():
12601281
# Regression test for (#22487)
12611282
df = DataFrame(
@@ -1596,3 +1617,22 @@ def test_multiindex_group_all_columns_when_empty(groupby_func):
15961617
result = method(*args).index
15971618
expected = df.index
15981619
tm.assert_index_equal(result, expected)
1620+
1621+
1622+
def test_duplicate_columns(request, groupby_func, as_index):
    """Compare a groupby method on duplicate labels against unique labels.

    The method named by ``groupby_func`` is run on a frame with columns
    ``a, b, b`` and on the same data relabelled ``a, b, c``. The two results
    must be equal once ``c`` is renamed back to ``b`` (the rename is skipped
    for ``size``, ``ngroup`` and ``cumcount``).
    """
    # GH#50806
    if groupby_func == "corrwith":
        reason = "GH#50845 - corrwith fails when there are duplicate columns"
        request.node.add_marker(pytest.mark.xfail(reason=reason))

    def _apply(frame):
        # Build the method's arguments for this frame, group on "a", call it.
        method_args = get_groupby_method_args(groupby_func, frame)
        grouped = frame.groupby("a", as_index=as_index)
        return getattr(grouped, groupby_func)(*method_args)

    dup_frame = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
    result = _apply(dup_frame)

    unique_frame = dup_frame.set_axis(["a", "b", "c"], axis=1)
    expected = _apply(unique_frame)
    if groupby_func not in ("size", "ngroup", "cumcount"):
        # Restore the duplicate label so the column axes line up.
        expected = expected.rename(columns={"c": "b"})
    tm.assert_equal(result, expected)

pandas/tests/groupby/test_groupby.py

+10
Original file line numberDiff line numberDiff line change
@@ -2828,3 +2828,13 @@ def test_groupby_reduce_period():
28282828
expected = ser[:10]
28292829
expected.index = Index(range(10), dtype=np.int_)
28302830
tm.assert_series_equal(res, expected)
2831+
2832+
2833+
def test_obj_with_exclusions_duplicate_columns():
    """Check ``_obj_with_exclusions`` when column labels repeat.

    The frame has columns ``[0, 1, 2, 0]`` and is grouped by column ``1``.
    The expected object is the frame restricted to positions 0, 2 and 3,
    i.e. both columns labelled ``0`` are kept.
    """
    # GH#50806
    frame = DataFrame([[0, 1, 2, 3]])
    frame.columns = [0, 1, 2, 0]
    grouped = frame.groupby(frame[1])

    # Every position except 1, the column used as the grouper.
    kept_positions = [0, 2, 3]
    tm.assert_frame_equal(
        grouped._obj_with_exclusions,
        frame.take(kept_positions, axis=1),
    )

0 commit comments

Comments
 (0)