Skip to content

WARN introduce FutureWarning for value_counts behaviour change #49640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 20 commits into from
Closed
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Bug fixes

Other
~~~~~
-
- Introduced a ``FutureWarning`` notifying about a behaviour change in :meth:`DataFrame.value_counts`, :meth:`Series.value_counts`, :meth:`DataFrameGroupBy.value_counts`, :meth:`SeriesGroupBy.value_counts` - in pandas 2.0.0 the resulting series will by default be named ``'count'`` (or ``'proportion'`` if ``normalize=True``), and the index (if present) will take its name from the original object's name (:issue:`49497`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This warrants a full-on section, with a before/after example showing how to fix it.

-

.. ---------------------------------------------------------------------------
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,7 @@ def value_counts(
normalize: bool = False,
bins=None,
dropna: bool = True,
name: Hashable | None = None,
) -> Series:
"""
Compute a histogram of the counts of non-null values.
Expand Down Expand Up @@ -838,7 +839,8 @@ def value_counts(
Series,
)

name = getattr(values, "name", None)
if name is None:
name = getattr(values, "name", None)

if bins is not None:
from pandas.core.reshape.tile import cut
Expand All @@ -850,7 +852,7 @@ def value_counts(
raise TypeError("bins argument only works with numeric data.") from err

# count, remove nulls (from the index), and but the bins
result = ii.value_counts(dropna=dropna)
result = ii.value_counts(dropna=dropna, name=name)
result = result[result.index.notna()]
result.index = result.index.astype("interval")
result = result.sort_index()
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
final,
overload,
)
import warnings

import numpy as np

Expand All @@ -37,6 +38,7 @@
cache_readonly,
doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_categorical_dtype,
Expand Down Expand Up @@ -912,6 +914,8 @@ def value_counts(
ascending: bool = False,
bins=None,
dropna: bool = True,
*,
name: Hashable | None = None,
) -> Series:
"""
Return a Series containing counts of unique values.
Expand Down Expand Up @@ -991,13 +995,24 @@ def value_counts(
NaN 1
dtype: int64
"""
if name is None:
result_name = "proportion" if normalize else "count"
warnings.warn(
"In pandas 2.0.0, the name of the resulting Series will be "
"'count' (or 'proportion' if `normalize=True`), and the index "
"will inherit the original object's name. Specify "
f"`name='{result_name}'` to silence this warning.",
FutureWarning,
stacklevel=find_stack_level(),
)
return value_counts(
self,
sort=sort,
ascending=ascending,
normalize=normalize,
bins=bins,
dropna=dropna,
name=name,
)

def unique(self):
Expand Down
13 changes: 12 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6936,6 +6936,8 @@ def value_counts(
sort: bool = True,
ascending: bool = False,
dropna: bool = True,
*,
name: Hashable | None = None,
) -> Series:
"""
Return a Series containing counts of unique rows in the DataFrame.
Expand Down Expand Up @@ -7037,10 +7039,19 @@ def value_counts(
NaN 1
dtype: int64
"""
if name is None:
result_name = "proportion" if normalize else "count"
warnings.warn(
"In pandas 2.0.0, the name of the resulting Series will be "
"'count' (or 'proportion' if `normalize=True`). Specify "
f"`name='{result_name}'` to silence this warning.",
FutureWarning,
stacklevel=find_stack_level(),
)
if subset is None:
subset = self.columns.tolist()

counts = self.groupby(subset, dropna=dropna).grouper.size()
counts = self.groupby(subset, dropna=dropna).grouper.size().rename(name)

if sort:
counts = counts.sort_values(ascending=ascending)
Expand Down
43 changes: 36 additions & 7 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,19 @@ def value_counts(
ascending: bool = False,
bins=None,
dropna: bool = True,
*,
name: Hashable | None = None,
Copy link
Member

@rhshadrach rhshadrach Nov 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is adding name to the API, which would presumably be deprecated in 2.0. So users would be adding name to silence the warning, then removing it to silence the new deprecation warning.

My understanding of the proposal was to just add a warning - no arguments - that the user will see on every use. While that is very noisy (maybe we could do a DeprecationWarning instead?), I think it's better than having users change code only to change it back.

cc @jreback

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @rhshadrach for taking a look - yeah if it's OK to emit a warning each time then that'd be better, I'm also not keen on adding something to the API and then immediately deprecating it. I'll do that

) -> Series:
if name is None:
result_name = "proportion" if normalize else "count"
warnings.warn(
"In pandas 2.0.0, the name of the resulting Series will be "
"'count' (or 'proportion' if `normalize=True`). Specify "
f"`name='{result_name}'` to silence this warning.",
FutureWarning,
stacklevel=find_stack_level(),
)
name = self.obj.name

from pandas.core.reshape.merge import get_join_indexers
from pandas.core.reshape.tile import cut
Expand All @@ -626,6 +638,7 @@ def value_counts(
sort=sort,
ascending=ascending,
bins=bins,
name=name,
)
ser.index.names = names
return ser
Expand Down Expand Up @@ -741,7 +754,7 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:

if is_integer_dtype(out.dtype):
out = ensure_int64(out)
return self.obj._constructor(out, index=mi, name=self.obj.name)
return self.obj._constructor(out, index=mi, name=name)

def fillna(
self,
Expand Down Expand Up @@ -1875,6 +1888,8 @@ def value_counts(
sort: bool = True,
ascending: bool = False,
dropna: bool = True,
*,
name: Hashable | None = None,
) -> DataFrame | Series:
"""
Return a Series or DataFrame containing counts of unique rows.
Expand Down Expand Up @@ -1979,6 +1994,16 @@ def value_counts(
3 male low US 0.25
4 male medium FR 0.25
"""
if name is None and self.as_index:
result_name = "proportion" if normalize else "count"
warnings.warn(
"In pandas 2.0.0, the name of the resulting Series will be "
"'count' (or 'proportion' if `normalize=True`). Specify "
f"`name='{result_name}'` to silence this warning.",
FutureWarning,
stacklevel=find_stack_level(),
)

if self.axis == 1:
raise NotImplementedError(
"DataFrameGroupBy.value_counts only handles axis=0"
Expand All @@ -1991,8 +2016,11 @@ def value_counts(
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
}
if isinstance(self._selected_obj, Series):
name = self._selected_obj.name
keys = [] if name in in_axis_names else [self._selected_obj]
keys = (
[]
if self._selected_obj.name in in_axis_names
else [self._selected_obj]
)
else:
unique_cols = set(self._selected_obj.columns)
if subset is not None:
Expand All @@ -2015,8 +2043,8 @@ def value_counts(
keys = [
# Can't use .values because the column label needs to be preserved
self._selected_obj.iloc[:, idx]
for idx, name in enumerate(self._selected_obj.columns)
if name not in in_axis_names and name in subsetted
for idx, _name in enumerate(self._selected_obj.columns)
if _name not in in_axis_names and _name in subsetted
]

groupings = list(self.grouper.groupings)
Expand All @@ -2038,7 +2066,7 @@ def value_counts(
observed=self.observed,
dropna=self.dropna,
)
result_series = cast(Series, gb.size())
result_series = cast(Series, gb.size()).rename(name)

# GH-46357 Include non-observed categories
# of non-grouping columns regardless of `observed`
Expand Down Expand Up @@ -2082,7 +2110,8 @@ def value_counts(
result = result_series
else:
# Convert to frame
name = "proportion" if normalize else "count"
if name is None:
name = "proportion" if normalize else "count"
index = result_series.index
columns = com.fill_missing_names(index.names)
if name in columns:
Expand Down
Loading