Skip to content

BUG: DataFrameGroupBy.value_counts() fails if as_index=False and there are duplicate column labels #45160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
696130b
Update test_frame_value_counts.py
johnzangwill Dec 31, 2021
6b03989
Implement value_counts with duplicates and add test
johnzangwill Jan 1, 2022
db2f38a
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 1, 2022
9093374
redo using private methods
johnzangwill Jan 2, 2022
4f65829
Update test_frame_value_counts.py
johnzangwill Jan 3, 2022
85cf095
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 3, 2022
68ae88b
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 4, 2022
faa17e5
Revert "redo using private methods"
johnzangwill Jan 4, 2022
44ff075
Merge branch 'value_counts-with-duplicate-labels' of https://github.c…
johnzangwill Jan 4, 2022
c097e5d
Update generic.py
johnzangwill Jan 4, 2022
7532cc0
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 4, 2022
d490187
Update generic.py
johnzangwill Jan 4, 2022
837c850
Merge branch 'value_counts-with-duplicate-labels' of https://github.c…
johnzangwill Jan 4, 2022
89c90c4
Update generic.py
johnzangwill Jan 4, 2022
92999fb
Update generic.py
johnzangwill Jan 4, 2022
6e55670
Back to reset_index
johnzangwill Jan 9, 2022
a47bbf7
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 9, 2022
ad164dc
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 10, 2022
0f4b155
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 10, 2022
f6be00d
Merge branch 'pandas-dev:master' into value_counts-with-duplicate-labels
johnzangwill Jan 11, 2022
0b4853c
Merge branch 'main' into value_counts-with-duplicate-labels
johnzangwill Jan 16, 2022
fece32b
Update generic.py
johnzangwill Jan 16, 2022
df127ef
Merge branch 'pandas-dev:main' into value_counts-with-duplicate-labels
johnzangwill Jan 16, 2022
36f2b0d
Trigger CI
johnzangwill Jan 16, 2022
207b55e
Merge branch 'value_counts-with-duplicate-labels' of https://github.c…
johnzangwill Jan 16, 2022
e3b245c
Merge branch 'pandas-dev:main' into value_counts-with-duplicate-labels
johnzangwill Jan 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@

import numpy as np

from pandas._libs import reduction as libreduction
from pandas._libs import (
lib,
reduction as libreduction,
)
from pandas._typing import (
ArrayLike,
Manager,
Expand Down Expand Up @@ -1731,7 +1734,7 @@ def value_counts(
observed=self.observed,
dropna=self.dropna,
)
result = cast(Series, gb.size())
result = gb.size()

if normalize:
# Normalize the results by dividing by the original group sizes.
Expand All @@ -1750,13 +1753,32 @@ def value_counts(
if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result = result.sort_values(ascending=ascending).sort_index(
level=index_level, sort_remaining=False
result = (
cast(Series, result)
.sort_values(ascending=ascending)
.sort_index(level=index_level, sort_remaining=False)
)

if not self.as_index:
# Convert to frame
result = result.reset_index(name="proportion" if normalize else "count")
name = "proportion" if normalize else "count"
columns = result.index.names
if name in columns:
raise ValueError(
f"Column label '{name}' is duplicate of result column"
)
columns = com.fill_missing_names(columns)
values = result.values
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

result is a Series at this point? ._values is generally preferable to .values, as the latter will cast dt64tz to ndarray[object]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!
I decided to do this differently following a suggestion from a while ago (#44755 (comment))
I really wanted these methods to be public, but it is not happening...
Anyway, this way is more DRY and I shouldn't really be writing my own version of reset_index().

result_frame = DataFrame()
for i, column in enumerate(columns):
level_values = result.index.get_level_values(i)._values
if level_values.dtype == np.object_:
level_values = lib.maybe_convert_objects(
cast(np.ndarray, level_values)
)
result_frame.insert(i, column, level_values, allow_duplicates=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this almost O(n^2) inefficient.
so what you need to do is

  • if we don't have duplicates in the columns names, concat and are done
  • if we do, then a) should this really be the answer? why are we not raising?
  • if we actually do allow this then you simply change the column names to a range index, concat, then set them again.

pls don't conflate this issue with reset_index or allow_duplicates, they are completely orthogonal.

this is not going to move forward this keeps re-inventing the wheel.

Copy link
Contributor Author

@johnzangwill johnzangwill Jan 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not O(n^2)! But O(n), and of course I would rather not be itererating through an axis. That is the whole point of Pandas!

concat does not work, as I explained many times. The best I could do was:

result_frame = index.set_names(range(len(columns))).to_frame()
result_frame = concat([result_frame, result], axis=1).reset_index(drop=True).set_axis(columns + [name], axis=1)

which fails 6 of my tests due to bool/object problems in MultiIndex. These will probably be fixed by #45061, but I see that has been deferred until 1.5.

Meanwhile, I employed your column renaming suggestion and there is no more looping. It is now all green (apart from the usual ci problems)

result = result_frame.assign(**{name: values})

return result.__finalize__(self.obj, method="value_counts")


Expand Down
46 changes: 32 additions & 14 deletions pandas/tests/groupby/test_frame_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,33 +406,51 @@ def test_mixed_groupings(normalize, expected_label, expected_values):


@pytest.mark.parametrize(
"test, expected_names",
"test, columns, expected_names",
[
("repeat", ["a", None, "d", "b", "b", "e"]),
("level", ["a", None, "d", "b", "c", "level_1"]),
("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_name_clashes(test, expected_names, as_index):
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
if test == "repeat":
df.columns = list("abbde")
else:
df.columns = list("abcd") + ["level_1"]

def test_column_label_duplicates(test, columns, expected_names, as_index):
# Test for duplicate input column labels and generated duplicate labels
df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
if as_index:
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
expected = Series(
data=(1, 1),
index=MultiIndex.from_tuples(
[(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
expected_data,
names=expected_names,
),
)
tm.assert_series_equal(result, expected)
else:
with pytest.raises(ValueError, match="cannot insert"):
df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
expected_data = [list(row) + [1] for row in expected_data]
expected_columns = list(expected_names)
expected_columns[1] = "level_1"
expected_columns.append("count")
expected = DataFrame(expected_data, columns=expected_columns)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"normalize, expected_label",
[
(False, "count"),
(True, "proportion"),
],
)
def test_result_label_duplicates(normalize, expected_label):
# Test for result column label duplicating an input column label
gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
"a", as_index=False
)
msg = f"Column label '{expected_label}' is duplicate of result column"
with pytest.raises(ValueError, match=msg):
gb.value_counts(normalize=normalize)


def test_ambiguous_grouping():
Expand Down