# Review notes (annotation only; everything from the first "diff --git" line
# onward is the patch itself).
#
# pandas/core/groupby/generic.py -- hunk inside value_counts():
#   * The size Series is now held in `result_series`; `result` is declared as
#     `Series | DataFrame` and assigned exactly once at the end.
#   * as_index=True: the Series is returned as before.
#   * as_index=False: the frame is no longer built with
#     `reset_index(name=...)`. Instead:
#       1. the final column labels are taken from
#          `com.fill_missing_names(index.names)`;
#       2. a ValueError ("Column label '<name>' is duplicate of result
#          column") is raised if "count"/"proportion" is already one of
#          those labels;
#       3. the index levels are temporarily renamed to 0..n-1 before
#          `reset_index()`, and the real labels plus the result label are
#          assigned to `.columns` afterwards.
#     NOTE(review): step 3 appears intended to let duplicate input labels
#     through -- the removed test expected a "cannot insert" ValueError in
#     that case, the new test expects a DataFrame. Confirm against GH 44992.
#     NOTE(review): `com.fill_missing_names` is not shown here; the updated
#     test expects the unnamed second grouping to come back as "level_1".
#
# pandas/tests/groupby/test_frame_value_counts.py:
#   * test_column_name_clashes is replaced by test_column_label_duplicates,
#     parametrized over the input column labels; for as_index=False it now
#     compares against an expected DataFrame with a trailing "count" column.
#   * test_result_label_duplicates is new: it checks that the ValueError above
#     is raised for both normalize=False ("count") and normalize=True
#     ("proportion").
#
# NOTE(review): the patch was received with its line breaks collapsed. The
# line structure below was restored so that each hunk matches its @@ counts
# (32 -> 48 and 33 -> 52 lines). Leading indentation was reconstructed and
# should be verified against the target files before applying.
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 81c5e74957c62..29411b9c722a9 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1731,32 +1731,48 @@ def value_counts(
                 observed=self.observed,
                 dropna=self.dropna,
             )
-            result = cast(Series, gb.size())
+            result_series = cast(Series, gb.size())
 
             if normalize:
                 # Normalize the results by dividing by the original group sizes.
                 # We are guaranteed to have the first N levels be the
                 # user-requested grouping.
-                levels = list(range(len(self.grouper.groupings), result.index.nlevels))
-                indexed_group_size = result.groupby(
-                    result.index.droplevel(levels),
+                levels = list(
+                    range(len(self.grouper.groupings), result_series.index.nlevels)
+                )
+                indexed_group_size = result_series.groupby(
+                    result_series.index.droplevel(levels),
                     sort=self.sort,
                     observed=self.observed,
                     dropna=self.dropna,
                 ).transform("sum")
 
-                result /= indexed_group_size
+                result_series /= indexed_group_size
 
             if sort:
                 # Sort the values and then resort by the main grouping
                 index_level = range(len(self.grouper.groupings))
-                result = result.sort_values(ascending=ascending).sort_index(
-                    level=index_level, sort_remaining=False
-                )
+                result_series = result_series.sort_values(
+                    ascending=ascending
+                ).sort_index(level=index_level, sort_remaining=False)
 
-            if not self.as_index:
+            result: Series | DataFrame
+            if self.as_index:
+                result = result_series
+            else:
                 # Convert to frame
-                result = result.reset_index(name="proportion" if normalize else "count")
+                name = "proportion" if normalize else "count"
+                index = result_series.index
+                columns = com.fill_missing_names(index.names)
+                if name in columns:
+                    raise ValueError(
+                        f"Column label '{name}' is duplicate of result column"
+                    )
+                result_series.name = name
+                result_series.index = index.set_names(range(len(columns)))
+                result_frame = result_series.reset_index()
+                result_frame.columns = columns + [name]
+                result = result_frame
             return result.__finalize__(self.obj, method="value_counts")
 
 
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 79ef46db8e95e..affef05ba4ed3 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -406,33 +406,52 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
 
 
 @pytest.mark.parametrize(
-    "test, expected_names",
+    "test, columns, expected_names",
     [
-        ("repeat", ["a", None, "d", "b", "b", "e"]),
-        ("level", ["a", None, "d", "b", "c", "level_1"]),
+        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
+        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
     ],
 )
 @pytest.mark.parametrize("as_index", [False, True])
-def test_column_name_clashes(test, expected_names, as_index):
-    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
-    if test == "repeat":
-        df.columns = list("abbde")
-    else:
-        df.columns = list("abcd") + ["level_1"]
-
+def test_column_label_duplicates(test, columns, expected_names, as_index):
+    # GH 44992
+    # Test for duplicate input column labels and generated duplicate labels
+    df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
+    expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
+    result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
     if as_index:
-        result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
         expected = Series(
             data=(1, 1),
             index=MultiIndex.from_tuples(
-                [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
+                expected_data,
                 names=expected_names,
             ),
         )
         tm.assert_series_equal(result, expected)
     else:
-        with pytest.raises(ValueError, match="cannot insert"):
-            df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
+        expected_data = [list(row) + [1] for row in expected_data]
+        expected_columns = list(expected_names)
+        expected_columns[1] = "level_1"
+        expected_columns.append("count")
+        expected = DataFrame(expected_data, columns=expected_columns)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "normalize, expected_label",
+    [
+        (False, "count"),
+        (True, "proportion"),
+    ],
+)
+def test_result_label_duplicates(normalize, expected_label):
+    # Test for result column label duplicating an input column label
+    gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
+        "a", as_index=False
+    )
+    msg = f"Column label '{expected_label}' is duplicate of result column"
+    with pytest.raises(ValueError, match=msg):
+        gb.value_counts(normalize=normalize)
 
 
 def test_ambiguous_grouping():