Skip to content

Commit 2f71a9c

Browse files
Backport PR #48164 on branch 1.4.x (BUG/REGR: Fix subset for DataFrameGroupBy.value_counts) (#48204)
Backport PR #48164: BUG/REGR: Fix subset for DataFrameGroupBy.value_counts Co-authored-by: Matthew Roeschke <[email protected]>
1 parent be8b482 commit 2f71a9c

File tree

3 files changed

+63
-9
lines changed

3 files changed

+63
-9
lines changed

doc/source/whatsnew/v1.4.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Bug fixes
3838
~~~~~~~~~
3939
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
4040
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
41+
- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`44267`)
4142
- Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`)
4243
- Bug in the :meth:`Series.dt.strftime` accessor returning a float instead of object dtype Series for all-NaT input, which also causes a spurious deprecation warning (:issue:`45858`)
4344

pandas/core/groupby/generic.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -1698,21 +1698,31 @@ def value_counts(
16981698
name = self._selected_obj.name
16991699
keys = [] if name in in_axis_names else [self._selected_obj]
17001700
else:
1701+
unique_cols = set(self._selected_obj.columns)
1702+
if subset is not None:
1703+
subsetted = set(subset)
1704+
clashing = subsetted & set(in_axis_names)
1705+
if clashing:
1706+
raise ValueError(
1707+
f"Keys {clashing} in subset cannot be in "
1708+
"the groupby column keys."
1709+
)
1710+
doesnt_exist = subsetted - unique_cols
1711+
if doesnt_exist:
1712+
raise ValueError(
1713+
f"Keys {doesnt_exist} in subset do not "
1714+
f"exist in the DataFrame."
1715+
)
1716+
else:
1717+
subsetted = unique_cols
1718+
17011719
keys = [
17021720
# Can't use .values because the column label needs to be preserved
17031721
self._selected_obj.iloc[:, idx]
17041722
for idx, name in enumerate(self._selected_obj.columns)
1705-
if name not in in_axis_names
1723+
if name not in in_axis_names and name in subsetted
17061724
]
17071725

1708-
if subset is not None:
1709-
clashing = set(subset) & set(in_axis_names)
1710-
if clashing:
1711-
raise ValueError(
1712-
f"Keys {clashing} in subset cannot be in "
1713-
"the groupby column keys"
1714-
)
1715-
17161726
groupings = list(self.grouper.groupings)
17171727
for key in keys:
17181728
grouper, _, _ = get_grouper(

pandas/tests/groupby/test_frame_value_counts.py

+43
Original file line numberDiff line numberDiff line change
@@ -442,3 +442,46 @@ def test_ambiguous_grouping():
442442
result = gb.value_counts()
443443
expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
444444
tm.assert_series_equal(result, expected)
445+
446+
447+
def test_subset_overlaps_gb_key_raises():
448+
# GH 46383
449+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
450+
msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
451+
with pytest.raises(ValueError, match=msg):
452+
df.groupby("c1").value_counts(subset=["c1"])
453+
454+
455+
def test_subset_doesnt_exist_in_frame():
456+
# GH 46383
457+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
458+
msg = "Keys {'c3'} in subset do not exist in the DataFrame."
459+
with pytest.raises(ValueError, match=msg):
460+
df.groupby("c1").value_counts(subset=["c3"])
461+
462+
463+
def test_subset():
464+
# GH 46383
465+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
466+
result = df.groupby(level=0).value_counts(subset=["c2"])
467+
expected = Series(
468+
[1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"])
469+
)
470+
tm.assert_series_equal(result, expected)
471+
472+
473+
def test_subset_duplicate_columns():
474+
# GH 46383
475+
df = DataFrame(
476+
[["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
477+
index=[0, 1, 1],
478+
columns=["c1", "c2", "c2"],
479+
)
480+
result = df.groupby(level=0).value_counts(subset=["c2"])
481+
expected = Series(
482+
[1, 2],
483+
index=MultiIndex.from_arrays(
484+
[[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
485+
),
486+
)
487+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)