Skip to content

Commit 4f42ecb

Browse files
BUG/PERF: SeriesGroupBy.value_counts sorting bug and categorical performance (pandas-dev#50548)
Co-authored-by: Simon Hawkins <[email protected]>
1 parent 9d8e704 commit 4f42ecb

File tree

4 files changed

+162
-123
lines changed

4 files changed

+162
-123
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line number | Diff line number | Diff line change
@@ -846,6 +846,7 @@ Performance improvements
846846
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
847847
- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
848848
- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`)
849+
- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
849850

850851
.. ---------------------------------------------------------------------------
851852
.. _whatsnew_200.bug_fixes:
@@ -1020,6 +1021,7 @@ Groupby/resample/rolling
10201021
- Bug in :meth:`Resampler.size` caused a wide :class:`DataFrame` to be returned instead of a :class:`Series` with :class:`MultiIndex` (:issue:`46826`)
10211022
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`)
10221023
- Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
1024+
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
10231025
-
10241026

10251027
Reshaping

pandas/core/groupby/generic.py

+7-119
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,6 @@
7575
reconstruct_func,
7676
validate_func_kwargs,
7777
)
78-
from pandas.core.arrays.categorical import Categorical
7978
import pandas.core.common as com
8079
from pandas.core.frame import DataFrame
8180
from pandas.core.groupby import base
@@ -86,14 +85,12 @@
8685
_apply_docs,
8786
_transform_template,
8887
)
89-
from pandas.core.groupby.grouper import get_grouper
9088
from pandas.core.indexes.api import (
9189
Index,
9290
MultiIndex,
9391
all_indexes_same,
9492
default_index,
9593
)
96-
from pandas.core.indexes.category import CategoricalIndex
9794
from pandas.core.series import Series
9895
from pandas.core.util.numba_ import maybe_use_numba
9996

@@ -647,6 +644,12 @@ def value_counts(
647644
bins=None,
648645
dropna: bool = True,
649646
) -> Series:
647+
if bins is None:
648+
result = self._value_counts(
649+
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
650+
)
651+
assert isinstance(result, Series)
652+
return result
650653

651654
from pandas.core.reshape.merge import get_join_indexers
652655
from pandas.core.reshape.tile import cut
@@ -2224,122 +2227,7 @@ def value_counts(
22242227
3 male low US 0.25
22252228
4 male medium FR 0.25
22262229
"""
2227-
if self.axis == 1:
2228-
raise NotImplementedError(
2229-
"DataFrameGroupBy.value_counts only handles axis=0"
2230-
)
2231-
2232-
with self._group_selection_context():
2233-
df = self.obj
2234-
2235-
in_axis_names = {
2236-
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2237-
}
2238-
if isinstance(self._selected_obj, Series):
2239-
name = self._selected_obj.name
2240-
keys = [] if name in in_axis_names else [self._selected_obj]
2241-
else:
2242-
unique_cols = set(self._selected_obj.columns)
2243-
if subset is not None:
2244-
subsetted = set(subset)
2245-
clashing = subsetted & set(in_axis_names)
2246-
if clashing:
2247-
raise ValueError(
2248-
f"Keys {clashing} in subset cannot be in "
2249-
"the groupby column keys."
2250-
)
2251-
doesnt_exist = subsetted - unique_cols
2252-
if doesnt_exist:
2253-
raise ValueError(
2254-
f"Keys {doesnt_exist} in subset do not "
2255-
f"exist in the DataFrame."
2256-
)
2257-
else:
2258-
subsetted = unique_cols
2259-
2260-
keys = [
2261-
# Can't use .values because the column label needs to be preserved
2262-
self._selected_obj.iloc[:, idx]
2263-
for idx, name in enumerate(self._selected_obj.columns)
2264-
if name not in in_axis_names and name in subsetted
2265-
]
2266-
2267-
groupings = list(self.grouper.groupings)
2268-
for key in keys:
2269-
grouper, _, _ = get_grouper(
2270-
df,
2271-
key=key,
2272-
axis=self.axis,
2273-
sort=self.sort,
2274-
observed=False,
2275-
dropna=dropna,
2276-
)
2277-
groupings += list(grouper.groupings)
2278-
2279-
# Take the size of the overall columns
2280-
gb = df.groupby(
2281-
groupings,
2282-
sort=self.sort,
2283-
observed=self.observed,
2284-
dropna=self.dropna,
2285-
)
2286-
result_series = cast(Series, gb.size())
2287-
2288-
# GH-46357 Include non-observed categories
2289-
# of non-grouping columns regardless of `observed`
2290-
if any(
2291-
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2292-
and not grouping._observed
2293-
for grouping in groupings
2294-
):
2295-
levels_list = [ping.result_index for ping in groupings]
2296-
multi_index, _ = MultiIndex.from_product(
2297-
levels_list, names=[ping.name for ping in groupings]
2298-
).sortlevel()
2299-
result_series = result_series.reindex(multi_index, fill_value=0)
2300-
2301-
if normalize:
2302-
# Normalize the results by dividing by the original group sizes.
2303-
# We are guaranteed to have the first N levels be the
2304-
# user-requested grouping.
2305-
levels = list(
2306-
range(len(self.grouper.groupings), result_series.index.nlevels)
2307-
)
2308-
indexed_group_size = result_series.groupby(
2309-
result_series.index.droplevel(levels),
2310-
sort=self.sort,
2311-
dropna=self.dropna,
2312-
).transform("sum")
2313-
result_series /= indexed_group_size
2314-
2315-
# Handle groups of non-observed categories
2316-
result_series = result_series.fillna(0.0)
2317-
2318-
if sort:
2319-
# Sort the values and then resort by the main grouping
2320-
index_level = range(len(self.grouper.groupings))
2321-
result_series = result_series.sort_values(
2322-
ascending=ascending
2323-
).sort_index(level=index_level, sort_remaining=False)
2324-
2325-
result: Series | DataFrame
2326-
if self.as_index:
2327-
result = result_series
2328-
else:
2329-
# Convert to frame
2330-
name = "proportion" if normalize else "count"
2331-
index = result_series.index
2332-
columns = com.fill_missing_names(index.names)
2333-
if name in columns:
2334-
raise ValueError(
2335-
f"Column label '{name}' is duplicate of result column"
2336-
)
2337-
result_series.name = name
2338-
result_series.index = index.set_names(range(len(columns)))
2339-
result_frame = result_series.reset_index()
2340-
result_frame.columns = columns + [name]
2341-
result = result_frame
2342-
return result.__finalize__(self.obj, method="value_counts")
2230+
return self._value_counts(subset, normalize, sort, ascending, dropna)
23432231

23442232
def fillna(
23452233
self,

pandas/core/groupby/groupby.py

+133-4
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,7 @@ class providing the base-class of operations.
113113
numba_,
114114
ops,
115115
)
116+
from pandas.core.groupby.grouper import get_grouper
116117
from pandas.core.groupby.indexing import (
117118
GroupByIndexingMixin,
118119
GroupByNthSelector,
@@ -923,8 +924,6 @@ def __init__(
923924
self.dropna = dropna
924925

925926
if grouper is None:
926-
from pandas.core.groupby.grouper import get_grouper
927-
928927
grouper, exclusions, obj = get_grouper(
929928
obj,
930929
keys,
@@ -2160,6 +2159,138 @@ def var(
21602159
ddof=ddof,
21612160
)
21622161

2162+
@final
2163+
def _value_counts(
2164+
self,
2165+
subset: Sequence[Hashable] | None = None,
2166+
normalize: bool = False,
2167+
sort: bool = True,
2168+
ascending: bool = False,
2169+
dropna: bool = True,
2170+
) -> DataFrame | Series:
2171+
"""
2172+
Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2173+
2174+
SeriesGroupBy additionally supports a bins argument. See the docstring of
2175+
DataFrameGroupBy.value_counts for a description of arguments.
2176+
"""
2177+
if self.axis == 1:
2178+
raise NotImplementedError(
2179+
"DataFrameGroupBy.value_counts only handles axis=0"
2180+
)
2181+
2182+
with self._group_selection_context():
2183+
df = self.obj
2184+
2185+
in_axis_names = {
2186+
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2187+
}
2188+
if isinstance(self._selected_obj, Series):
2189+
name = self._selected_obj.name
2190+
keys = [] if name in in_axis_names else [self._selected_obj]
2191+
else:
2192+
unique_cols = set(self._selected_obj.columns)
2193+
if subset is not None:
2194+
subsetted = set(subset)
2195+
clashing = subsetted & set(in_axis_names)
2196+
if clashing:
2197+
raise ValueError(
2198+
f"Keys {clashing} in subset cannot be in "
2199+
"the groupby column keys."
2200+
)
2201+
doesnt_exist = subsetted - unique_cols
2202+
if doesnt_exist:
2203+
raise ValueError(
2204+
f"Keys {doesnt_exist} in subset do not "
2205+
f"exist in the DataFrame."
2206+
)
2207+
else:
2208+
subsetted = unique_cols
2209+
2210+
keys = [
2211+
# Can't use .values because the column label needs to be preserved
2212+
self._selected_obj.iloc[:, idx]
2213+
for idx, name in enumerate(self._selected_obj.columns)
2214+
if name not in in_axis_names and name in subsetted
2215+
]
2216+
2217+
groupings = list(self.grouper.groupings)
2218+
for key in keys:
2219+
grouper, _, _ = get_grouper(
2220+
df,
2221+
key=key,
2222+
axis=self.axis,
2223+
sort=self.sort,
2224+
observed=False,
2225+
dropna=dropna,
2226+
)
2227+
groupings += list(grouper.groupings)
2228+
2229+
# Take the size of the overall columns
2230+
gb = df.groupby(
2231+
groupings,
2232+
sort=self.sort,
2233+
observed=self.observed,
2234+
dropna=self.dropna,
2235+
)
2236+
result_series = cast(Series, gb.size())
2237+
2238+
# GH-46357 Include non-observed categories
2239+
# of non-grouping columns regardless of `observed`
2240+
if any(
2241+
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2242+
and not grouping._observed
2243+
for grouping in groupings
2244+
):
2245+
levels_list = [ping.result_index for ping in groupings]
2246+
multi_index, _ = MultiIndex.from_product(
2247+
levels_list, names=[ping.name for ping in groupings]
2248+
).sortlevel()
2249+
result_series = result_series.reindex(multi_index, fill_value=0)
2250+
2251+
if normalize:
2252+
# Normalize the results by dividing by the original group sizes.
2253+
# We are guaranteed to have the first N levels be the
2254+
# user-requested grouping.
2255+
levels = list(
2256+
range(len(self.grouper.groupings), result_series.index.nlevels)
2257+
)
2258+
indexed_group_size = result_series.groupby(
2259+
result_series.index.droplevel(levels),
2260+
sort=self.sort,
2261+
dropna=self.dropna,
2262+
).transform("sum")
2263+
result_series /= indexed_group_size
2264+
2265+
# Handle groups of non-observed categories
2266+
result_series = result_series.fillna(0.0)
2267+
2268+
if sort:
2269+
# Sort the values and then resort by the main grouping
2270+
index_level = range(len(self.grouper.groupings))
2271+
result_series = result_series.sort_values(
2272+
ascending=ascending
2273+
).sort_index(level=index_level, sort_remaining=False)
2274+
2275+
result: Series | DataFrame
2276+
if self.as_index:
2277+
result = result_series
2278+
else:
2279+
# Convert to frame
2280+
name = "proportion" if normalize else "count"
2281+
index = result_series.index
2282+
columns = com.fill_missing_names(index.names)
2283+
if name in columns:
2284+
raise ValueError(
2285+
f"Column label '{name}' is duplicate of result column"
2286+
)
2287+
result_series.name = name
2288+
result_series.index = index.set_names(range(len(columns)))
2289+
result_frame = result_series.reset_index()
2290+
result_frame.columns = columns + [name]
2291+
result = result_frame
2292+
return result.__finalize__(self.obj, method="value_counts")
2293+
21632294
@final
21642295
@Substitution(name="groupby")
21652296
@Appender(_common_see_also)
@@ -2944,8 +3075,6 @@ def _nth(
29443075

29453076
# create a grouper with the original parameters, but on dropped
29463077
# object
2947-
from pandas.core.groupby.grouper import get_grouper
2948-
29493078
grouper, _, _ = get_grouper(
29503079
dropped,
29513080
key=self.keys,

pandas/tests/groupby/test_value_counts.py

+20
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,26 @@ def test_series_groupby_value_counts_on_categorical():
199199
tm.assert_series_equal(result, expected)
200200

201201

202+
def test_series_groupby_value_counts_no_sort():
203+
# GH#50482
204+
df = DataFrame(
205+
{
206+
"gender": ["male", "male", "female", "male", "female", "male"],
207+
"education": ["low", "medium", "high", "low", "high", "low"],
208+
"country": ["US", "FR", "US", "FR", "FR", "FR"],
209+
}
210+
)
211+
gb = df.groupby(["country", "gender"], sort=False)["education"]
212+
result = gb.value_counts(sort=False)
213+
index = MultiIndex(
214+
levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
215+
codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
216+
names=["country", "gender", "education"],
217+
)
218+
expected = Series([1, 1, 1, 2, 1], index=index, name="education")
219+
tm.assert_series_equal(result, expected)
220+
221+
202222
@pytest.fixture
203223
def education_df():
204224
return DataFrame(

0 commit comments

Comments
 (0)