
BUG/PERF: SeriesGroupBy.value_counts sorting bug and categorical performance #50548


Merged: 10 commits, Jan 11, 2023
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -844,6 +844,7 @@ Performance improvements
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
- Performance improvement in :func:`read_csv` when passing a :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`)
- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)

.. ---------------------------------------------------------------------------
.. _whatsnew_200.bug_fixes:
@@ -1018,6 +1019,7 @@ Groupby/resample/rolling
- Bug in :meth:`Resampler.size` caused a wide :class:`DataFrame` to be returned instead of a :class:`Series` with :class:`MultiIndex` (:issue:`46826`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`)
- Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
-
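
As a quick illustration of the ``sort=False`` fix above, a minimal sketch with illustrative data::

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "a"], "val": ["x", "y", "y"]})
    gb = df.groupby("key", sort=False)["val"]
    # with the fix, ("a", "x") stays first (order of first appearance)
    # instead of being sorted below ("a", "y"), which has the larger count
    gb.value_counts(sort=False)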

Reshaping
126 changes: 7 additions & 119 deletions pandas/core/groupby/generic.py
@@ -75,7 +75,6 @@
reconstruct_func,
validate_func_kwargs,
)
from pandas.core.arrays.categorical import Categorical
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import base
@@ -86,14 +85,12 @@
_apply_docs,
_transform_template,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.indexes.api import (
Index,
MultiIndex,
all_indexes_same,
default_index,
)
from pandas.core.indexes.category import CategoricalIndex
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba

@@ -647,6 +644,12 @@ def value_counts(
bins=None,
dropna: bool = True,
) -> Series:
if bins is None:
    # GH#50482/GH#46202: without bins, dispatch to the shared
    # _value_counts implementation, which respects sort=False and is
    # much faster for categorical dtypes
    result = self._value_counts(
        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
    )
    assert isinstance(result, Series)
    return result

from pandas.core.reshape.merge import get_join_indexers
from pandas.core.reshape.tile import cut
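
For context, a minimal sketch of a call that now takes the fast path above (data and grouping are illustrative):

    import pandas as pd

    ser = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]))
    # bins is None, so this dispatches to the shared _value_counts rather
    # than the cut/get_join_indexers machinery imported above
    ser.groupby([1, 1, 2]).value_counts()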
@@ -2224,122 +2227,7 @@ def value_counts(
3 male low US 0.25
4 male medium FR 0.25
"""
if self.axis == 1:
raise NotImplementedError(
"DataFrameGroupBy.value_counts only handles axis=0"
)

with self._group_selection_context():
df = self.obj

in_axis_names = {
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
}
if isinstance(self._selected_obj, Series):
name = self._selected_obj.name
keys = [] if name in in_axis_names else [self._selected_obj]
else:
unique_cols = set(self._selected_obj.columns)
if subset is not None:
subsetted = set(subset)
clashing = subsetted & set(in_axis_names)
if clashing:
raise ValueError(
f"Keys {clashing} in subset cannot be in "
"the groupby column keys."
)
doesnt_exist = subsetted - unique_cols
if doesnt_exist:
raise ValueError(
f"Keys {doesnt_exist} in subset do not "
f"exist in the DataFrame."
)
else:
subsetted = unique_cols

keys = [
# Can't use .values because the column label needs to be preserved
self._selected_obj.iloc[:, idx]
for idx, name in enumerate(self._selected_obj.columns)
if name not in in_axis_names and name in subsetted
]

groupings = list(self.grouper.groupings)
for key in keys:
grouper, _, _ = get_grouper(
df,
key=key,
axis=self.axis,
sort=self.sort,
observed=False,
dropna=dropna,
)
groupings += list(grouper.groupings)

# Take the size of the overall columns
gb = df.groupby(
groupings,
sort=self.sort,
observed=self.observed,
dropna=self.dropna,
)
result_series = cast(Series, gb.size())

# GH-46357 Include non-observed categories
# of non-grouping columns regardless of `observed`
if any(
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
and not grouping._observed
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
multi_index, _ = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
).sortlevel()
result_series = result_series.reindex(multi_index, fill_value=0)

if normalize:
# Normalize the results by dividing by the original group sizes.
# We are guaranteed to have the first N levels be the
# user-requested grouping.
levels = list(
range(len(self.grouper.groupings), result_series.index.nlevels)
)
indexed_group_size = result_series.groupby(
result_series.index.droplevel(levels),
sort=self.sort,
dropna=self.dropna,
).transform("sum")
result_series /= indexed_group_size

# Handle groups of non-observed categories
result_series = result_series.fillna(0.0)

if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result_series = result_series.sort_values(
ascending=ascending
).sort_index(level=index_level, sort_remaining=False)

result: Series | DataFrame
if self.as_index:
result = result_series
else:
# Convert to frame
name = "proportion" if normalize else "count"
index = result_series.index
columns = com.fill_missing_names(index.names)
if name in columns:
raise ValueError(
f"Column label '{name}' is duplicate of result column"
)
result_series.name = name
result_series.index = index.set_names(range(len(columns)))
result_frame = result_series.reset_index()
result_frame.columns = columns + [name]
result = result_frame
return result.__finalize__(self.obj, method="value_counts")
return self._value_counts(subset, normalize, sort, ascending, dropna)

def fillna(
self,
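The body removed above is relocated, essentially unchanged, to the new shared GroupBy._value_counts in pandas/core/groupby/groupby.py (next file), so SeriesGroupBy and DataFrameGroupBy now share one code path. A minimal sketch of the as_index=False naming behavior the shared code preserves (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"g": ["x", "x", "y"], "v": ["a", "b", "b"]})
    # the counts column is named "count", or "proportion" when normalize=True
    df.groupby("g", as_index=False).value_counts(normalize=True)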
133 changes: 133 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -113,6 +113,7 @@ class providing the base-class of operations.
numba_,
ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
GroupByIndexingMixin,
GroupByNthSelector,
@@ -2160,6 +2161,138 @@ def var(
ddof=ddof,
)

@final
def _value_counts(
self,
subset: Sequence[Hashable] | None = None,
normalize: bool = False,
sort: bool = True,
ascending: bool = False,
dropna: bool = True,
) -> DataFrame | Series:
"""
Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.

SeriesGroupBy additionally supports a bins argument. See the docstring of
DataFrameGroupBy.value_counts for a description of arguments.
"""
if self.axis == 1:
raise NotImplementedError(
"DataFrameGroupBy.value_counts only handles axis=0"
)

with self._group_selection_context():
df = self.obj

in_axis_names = {
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
}
if isinstance(self._selected_obj, Series):
name = self._selected_obj.name
keys = [] if name in in_axis_names else [self._selected_obj]
else:
unique_cols = set(self._selected_obj.columns)
if subset is not None:
subsetted = set(subset)
clashing = subsetted & set(in_axis_names)
if clashing:
raise ValueError(
f"Keys {clashing} in subset cannot be in "
"the groupby column keys."
)
doesnt_exist = subsetted - unique_cols
if doesnt_exist:
raise ValueError(
f"Keys {doesnt_exist} in subset do not "
f"exist in the DataFrame."
)
else:
subsetted = unique_cols

keys = [
# Can't use .values because the column label needs to be preserved
self._selected_obj.iloc[:, idx]
for idx, name in enumerate(self._selected_obj.columns)
if name not in in_axis_names and name in subsetted
]

groupings = list(self.grouper.groupings)
for key in keys:
grouper, _, _ = get_grouper(
df,
key=key,
axis=self.axis,
sort=self.sort,
observed=False,
dropna=dropna,
)
groupings += list(grouper.groupings)

# Take the size of the overall columns
gb = df.groupby(
groupings,
sort=self.sort,
observed=self.observed,
dropna=self.dropna,
)
result_series = cast(Series, gb.size())

# GH-46357 Include non-observed categories
# of non-grouping columns regardless of `observed`
if any(
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
and not grouping._observed
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
multi_index, _ = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
).sortlevel()
result_series = result_series.reindex(multi_index, fill_value=0)

if normalize:
# Normalize the results by dividing by the original group sizes.
# We are guaranteed to have the first N levels be the
# user-requested grouping.
levels = list(
range(len(self.grouper.groupings), result_series.index.nlevels)
)
indexed_group_size = result_series.groupby(
result_series.index.droplevel(levels),
sort=self.sort,
dropna=self.dropna,
).transform("sum")
result_series /= indexed_group_size

# Handle groups of non-observed categories
result_series = result_series.fillna(0.0)

if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result_series = result_series.sort_values(
ascending=ascending
).sort_index(level=index_level, sort_remaining=False)

result: Series | DataFrame
if self.as_index:
result = result_series
else:
# Convert to frame
name = "proportion" if normalize else "count"
index = result_series.index
columns = com.fill_missing_names(index.names)
if name in columns:
raise ValueError(
f"Column label '{name}' is duplicate of result column"
)
result_series.name = name
result_series.index = index.set_names(range(len(columns)))
result_frame = result_series.reset_index()
result_frame.columns = columns + [name]
result = result_frame
return result.__finalize__(self.obj, method="value_counts")

@final
@Substitution(name="groupby")
@Appender(_common_see_also)
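To make the normalization and sorting steps above concrete, a minimal sketch (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"g": ["x", "x", "x", "y"], "v": ["a", "a", "b", "b"]})
    # proportions are computed within each group: for "x", a -> 2/3 and
    # b -> 1/3; the default sort=True then orders values by count per group
    df.groupby("g")["v"].value_counts(normalize=True)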
20 changes: 20 additions & 0 deletions pandas/tests/groupby/test_value_counts.py
@@ -199,6 +199,26 @@ def test_series_groupby_value_counts_on_categorical():
tm.assert_series_equal(result, expected)


def test_series_groupby_value_counts_no_sort():
    # GH#50482
    df = DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
    gb = df.groupby(["country", "gender"], sort=False)["education"]
    result = gb.value_counts(sort=False)
    # expected rows follow the order of first appearance of each
    # (country, gender, education) combination, not descending count
    index = MultiIndex(
        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
        names=["country", "gender", "education"],
    )
    expected = Series([1, 1, 1, 2, 1], index=index, name="education")
    tm.assert_series_equal(result, expected)


@pytest.fixture
def education_df():
return DataFrame(