Skip to content

Commit 6b03989

Browse files
committed
Implement value_counts with duplicates and add test
1 parent 696130b commit 6b03989

File tree

2 files changed

+55
-11
lines changed

2 files changed

+55
-11
lines changed

pandas/core/groupby/generic.py

+27-5
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,10 @@
2626

2727
import numpy as np
2828

29-
from pandas._libs import reduction as libreduction
29+
from pandas._libs import (
30+
lib,
31+
reduction as libreduction,
32+
)
3033
from pandas._typing import (
3134
ArrayLike,
3235
Manager,
@@ -1730,7 +1733,7 @@ def value_counts(
17301733
observed=self.observed,
17311734
dropna=self.dropna,
17321735
)
1733-
result = cast(Series, gb.size())
1736+
result = gb.size()
17341737

17351738
if normalize:
17361739
# Normalize the results by dividing by the original group sizes.
@@ -1749,13 +1752,32 @@ def value_counts(
17491752
if sort:
17501753
# Sort the values and then resort by the main grouping
17511754
index_level = range(len(self.grouper.groupings))
1752-
result = result.sort_values(ascending=ascending).sort_index(
1753-
level=index_level, sort_remaining=False
1755+
result = (
1756+
cast(Series, result)
1757+
.sort_values(ascending=ascending)
1758+
.sort_index(level=index_level, sort_remaining=False)
17541759
)
17551760

17561761
if not self.as_index:
17571762
# Convert to frame
1758-
result = result.reset_index(name="proportion" if normalize else "count")
1763+
name = "proportion" if normalize else "count"
1764+
columns = result.index.names
1765+
if name in columns:
1766+
raise ValueError(
1767+
f"Column label '{name}' is duplicate of result column"
1768+
)
1769+
columns = com.fill_missing_names(columns)
1770+
values = result.values
1771+
result_frame = DataFrame()
1772+
for i, column in enumerate(columns):
1773+
level_values = result.index.get_level_values(i)._values
1774+
if level_values.dtype == np.object_:
1775+
level_values = lib.maybe_convert_objects(
1776+
cast(np.ndarray, level_values)
1777+
)
1778+
result_frame.insert(i, column, level_values, allow_duplicates=True)
1779+
result = result_frame.assign(**{name: values})
1780+
17591781
return result.__finalize__(self.obj, method="value_counts")
17601782

17611783

pandas/tests/groupby/test_frame_value_counts.py

+28-6
Original file line number | Diff line number | Diff line change
@@ -413,22 +413,44 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
413413
],
414414
)
415415
@pytest.mark.parametrize("as_index", [False, True])
416-
def test_column_name_clashes(test, columns, expected_names, as_index):
416+
def test_column_label_duplicates(test, columns, expected_names, as_index):
417+
# Test for duplicate input column labels and generated duplicate labels
417418
df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
418-
419+
expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
420+
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
419421
if as_index:
420-
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
421422
expected = Series(
422423
data=(1, 1),
423424
index=MultiIndex.from_tuples(
424-
[(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
425+
expected_data,
425426
names=expected_names,
426427
),
427428
)
428429
tm.assert_series_equal(result, expected)
429430
else:
430-
with pytest.raises(ValueError, match="cannot insert"):
431-
df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
431+
expected_data = [list(row) + [1] for row in expected_data]
432+
expected_columns = list(expected_names)
433+
expected_columns[1] = "level_1"
434+
expected_columns.append("count")
435+
expected = DataFrame(expected_data, columns=expected_columns)
436+
tm.assert_frame_equal(result, expected)
437+
438+
439+
@pytest.mark.parametrize(
440+
"normalize, expected_label",
441+
[
442+
(False, "count"),
443+
(True, "proportion"),
444+
],
445+
)
446+
def test_result_label_duplicates(normalize, expected_label):
447+
# Test for result column label duplicating an input column label
448+
gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
449+
"a", as_index=False
450+
)
451+
msg = f"Column label '{expected_label}' is duplicate of result column"
452+
with pytest.raises(ValueError, match=msg):
453+
gb.value_counts(normalize=normalize)
432454

433455

434456
def test_ambiguous_grouping():

0 commit comments

Comments
 (0)