Skip to content

Commit 606dd3c

Browse files
authored
BUG/REGR: Fix subset for DataFrameGroupBy.value_counts (#48164)
* BUG/REGR: Fix subset for DataFrameGroupBy.value_counts
* Test subset not in columns; duplicate columns
1 parent 2856ce1 commit 606dd3c

File tree

3 files changed

+63
-9
lines changed

3 files changed

+63
-9
lines changed

doc/source/whatsnew/v1.4.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Bug fixes
3737
~~~~~~~~~
3838
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
3939
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
40+
- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`46383`)
4041
- Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`)
4142
- Bug in the :meth:`Series.dt.strftime` accessor returning a float instead of an object dtype Series for all-NaT input, which also caused a spurious deprecation warning (:issue:`45858`)
4243

pandas/core/groupby/generic.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -1805,21 +1805,31 @@ def value_counts(
18051805
name = self._selected_obj.name
18061806
keys = [] if name in in_axis_names else [self._selected_obj]
18071807
else:
1808+
unique_cols = set(self._selected_obj.columns)
1809+
if subset is not None:
1810+
subsetted = set(subset)
1811+
clashing = subsetted & set(in_axis_names)
1812+
if clashing:
1813+
raise ValueError(
1814+
f"Keys {clashing} in subset cannot be in "
1815+
"the groupby column keys."
1816+
)
1817+
doesnt_exist = subsetted - unique_cols
1818+
if doesnt_exist:
1819+
raise ValueError(
1820+
f"Keys {doesnt_exist} in subset do not "
1821+
f"exist in the DataFrame."
1822+
)
1823+
else:
1824+
subsetted = unique_cols
1825+
18081826
keys = [
18091827
# Can't use .values because the column label needs to be preserved
18101828
self._selected_obj.iloc[:, idx]
18111829
for idx, name in enumerate(self._selected_obj.columns)
1812-
if name not in in_axis_names
1830+
if name not in in_axis_names and name in subsetted
18131831
]
18141832

1815-
if subset is not None:
1816-
clashing = set(subset) & set(in_axis_names)
1817-
if clashing:
1818-
raise ValueError(
1819-
f"Keys {clashing} in subset cannot be in "
1820-
"the groupby column keys"
1821-
)
1822-
18231833
groupings = list(self.grouper.groupings)
18241834
for key in keys:
18251835
grouper, _, _ = get_grouper(

pandas/tests/groupby/test_frame_value_counts.py

+43
Original file line numberDiff line numberDiff line change
@@ -738,3 +738,46 @@ def test_ambiguous_grouping():
738738
result = gb.value_counts()
739739
expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
740740
tm.assert_series_equal(result, expected)
741+
742+
743+
def test_subset_overlaps_gb_key_raises():
744+
# GH 46383
745+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
746+
msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
747+
with pytest.raises(ValueError, match=msg):
748+
df.groupby("c1").value_counts(subset=["c1"])
749+
750+
751+
def test_subset_doesnt_exist_in_frame():
752+
# GH 46383
753+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
754+
msg = "Keys {'c3'} in subset do not exist in the DataFrame."
755+
with pytest.raises(ValueError, match=msg):
756+
df.groupby("c1").value_counts(subset=["c3"])
757+
758+
759+
def test_subset():
760+
# GH 46383
761+
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
762+
result = df.groupby(level=0).value_counts(subset=["c2"])
763+
expected = Series(
764+
[1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"])
765+
)
766+
tm.assert_series_equal(result, expected)
767+
768+
769+
def test_subset_duplicate_columns():
770+
# GH 46383
771+
df = DataFrame(
772+
[["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
773+
index=[0, 1, 1],
774+
columns=["c1", "c2", "c2"],
775+
)
776+
result = df.groupby(level=0).value_counts(subset=["c2"])
777+
expected = Series(
778+
[1, 2],
779+
index=MultiIndex.from_arrays(
780+
[[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
781+
),
782+
)
783+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)