Skip to content

Commit bec92a4

Browse files
authored
API: various .value_counts() result in different names / indices (#49912)
* sort out series / index case * getting there... * only just fixed up another file! * more fixups * getting there * wip, do this apply thing separately * fixup * more fixup * tmp fixup per gh49909 * 🎨 * fixup test * fixup tests * 🏷️ typing * 📝 whatsnew * rewrite whatsnew * add back missing line * shorten * pin name, simplify whatsnew example, reference issue, retitle * avoid rename * fixup new test * adjust new path * fixup * remove outdated comment --------- Co-authored-by: MarcoGorelli <>
1 parent dbe5927 commit bec92a4

27 files changed

+334
-167
lines changed

doc/source/whatsnew/v2.0.0.rst

+30
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,36 @@ a supported dtype:
333333
334334
pd.Series(["2016-01-01"], dtype="datetime64[D]")
335335
336+
.. _whatsnew_200.api_breaking.value_counts:
337+
338+
Value counts sets the resulting name to ``count``
339+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
340+
In past versions, when running :meth:`Series.value_counts`, the result would inherit
341+
the original object's name, and the result index would be nameless. This would cause
342+
confusion when resetting the index, and the column names would not correspond with the
343+
column values.
344+
Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed),
345+
and the index will be named after the original object (:issue:`49497`).
346+
347+
*Previous behavior*:
348+
349+
.. code-block:: ipython
350+
351+
In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()
352+
353+
Out[8]:
354+
quetzal 2
355+
elk 1
356+
Name: animal, dtype: int64
357+
358+
*New behavior*:
359+
360+
.. ipython:: python
361+
362+
pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()
363+
364+
Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`).
365+
336366
.. _whatsnew_200.api_breaking.astype_to_unsupported_datetimelike:
337367

338368
Disallow astype conversion to non-supported datetime64/timedelta64 dtypes

pandas/core/algorithms.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,8 @@ def value_counts(
847847
Series,
848848
)
849849

850-
name = getattr(values, "name", None)
850+
index_name = getattr(values, "name", None)
851+
name = "proportion" if normalize else "count"
851852

852853
if bins is not None:
853854
from pandas.core.reshape.tile import cut
@@ -860,6 +861,7 @@ def value_counts(
860861

861862
# count, remove nulls (from the index), and sort by the bins
862863
result = ii.value_counts(dropna=dropna)
864+
result.name = name
863865
result = result[result.index.notna()]
864866
result.index = result.index.astype("interval")
865867
result = result.sort_index()
@@ -878,14 +880,18 @@ def value_counts(
878880
# handle Categorical and sparse,
879881
result = Series(values)._values.value_counts(dropna=dropna)
880882
result.name = name
883+
result.index.name = index_name
881884
counts = result._values
882885

883886
elif isinstance(values, ABCMultiIndex):
884887
# GH49558
885888
levels = list(range(values.nlevels))
886-
result = Series(index=values).groupby(level=levels, dropna=dropna).size()
887-
# TODO: allow index names to remain (see discussion in GH49497)
888-
result.index.names = [None] * values.nlevels
889+
result = (
890+
Series(index=values, name=name)
891+
.groupby(level=levels, dropna=dropna)
892+
.size()
893+
)
894+
result.index.names = values.names
889895
counts = result._values
890896

891897
else:
@@ -899,6 +905,7 @@ def value_counts(
899905
idx = Index(keys)
900906
if idx.dtype == bool and keys.dtype == object:
901907
idx = idx.astype(object)
908+
idx.name = index_name
902909

903910
result = Series(counts, index=idx, name=name)
904911

pandas/core/arrays/arrow/array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ def value_counts(self, dropna: bool = True) -> Series:
934934

935935
index = Index(type(self)(values))
936936

937-
return Series(counts, index=index).astype("Int64")
937+
return Series(counts, index=index, name="count").astype("Int64")
938938

939939
@classmethod
940940
def _concat_same_type(

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1499,7 +1499,7 @@ def value_counts(self, dropna: bool = True) -> Series:
14991499
ix = coerce_indexer_dtype(ix, self.dtype.categories)
15001500
ix = self._from_backing_data(ix)
15011501

1502-
return Series(count, index=CategoricalIndex(ix), dtype="int64")
1502+
return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count")
15031503

15041504
# error: Argument 2 of "_empty" is incompatible with supertype
15051505
# "NDArrayBackedExtensionArray"; supertype defines the argument type as

pandas/core/arrays/masked.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -996,7 +996,7 @@ def value_counts(self, dropna: bool = True) -> Series:
996996
)
997997

998998
if dropna:
999-
res = Series(value_counts, index=keys)
999+
res = Series(value_counts, index=keys, name="count")
10001000
res.index = res.index.astype(self.dtype)
10011001
res = res.astype("Int64")
10021002
return res
@@ -1012,7 +1012,7 @@ def value_counts(self, dropna: bool = True) -> Series:
10121012
mask = np.zeros(len(counts), dtype="bool")
10131013
counts_array = IntegerArray(counts, mask)
10141014

1015-
return Series(counts_array, index=index)
1015+
return Series(counts_array, index=index, name="count")
10161016

10171017
@doc(ExtensionArray.equals)
10181018
def equals(self, other) -> bool:

pandas/core/base.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ def value_counts(
966966
1.0 1
967967
2.0 1
968968
4.0 1
969-
dtype: int64
969+
Name: count, dtype: int64
970970
971971
With `normalize` set to `True`, returns the relative frequency by
972972
dividing all values by the sum of values.
@@ -977,7 +977,7 @@ def value_counts(
977977
1.0 0.2
978978
2.0 0.2
979979
4.0 0.2
980-
dtype: float64
980+
Name: proportion, dtype: float64
981981
982982
**bins**
983983
@@ -990,7 +990,7 @@ def value_counts(
990990
(0.996, 2.0] 2
991991
(2.0, 3.0] 2
992992
(3.0, 4.0] 1
993-
dtype: int64
993+
Name: count, dtype: int64
994994
995995
**dropna**
996996
@@ -1002,7 +1002,7 @@ def value_counts(
10021002
2.0 1
10031003
4.0 1
10041004
NaN 1
1005-
dtype: int64
1005+
Name: count, dtype: int64
10061006
"""
10071007
return algorithms.value_counts(
10081008
self,

pandas/core/frame.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -7003,28 +7003,28 @@ def value_counts(
70037003
4 0 2
70047004
2 2 1
70057005
6 0 1
7006-
dtype: int64
7006+
Name: count, dtype: int64
70077007
70087008
>>> df.value_counts(sort=False)
70097009
num_legs num_wings
70107010
2 2 1
70117011
4 0 2
70127012
6 0 1
7013-
dtype: int64
7013+
Name: count, dtype: int64
70147014
70157015
>>> df.value_counts(ascending=True)
70167016
num_legs num_wings
70177017
2 2 1
70187018
6 0 1
70197019
4 0 2
7020-
dtype: int64
7020+
Name: count, dtype: int64
70217021
70227022
>>> df.value_counts(normalize=True)
70237023
num_legs num_wings
70247024
4 0 0.50
70257025
2 2 0.25
70267026
6 0 0.25
7027-
dtype: float64
7027+
Name: proportion, dtype: float64
70287028
70297029
With `dropna` set to `False` we can also count rows with NA values.
70307030
@@ -7041,20 +7041,22 @@ def value_counts(
70417041
first_name middle_name
70427042
Beth Louise 1
70437043
John Smith 1
7044-
dtype: int64
7044+
Name: count, dtype: int64
70457045
70467046
>>> df.value_counts(dropna=False)
70477047
first_name middle_name
70487048
Anne NaN 1
70497049
Beth Louise 1
70507050
John Smith 1
70517051
NaN 1
7052-
dtype: int64
7052+
Name: count, dtype: int64
70537053
"""
70547054
if subset is None:
70557055
subset = self.columns.tolist()
70567056

7057+
name = "proportion" if normalize else "count"
70577058
counts = self.groupby(subset, dropna=dropna).grouper.size()
7059+
counts.name = name
70587060

70597061
if sort:
70607062
counts = counts.sort_values(ascending=ascending)

pandas/core/groupby/generic.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -666,10 +666,13 @@ def value_counts(
666666
bins=None,
667667
dropna: bool = True,
668668
) -> Series | DataFrame:
669+
name = "proportion" if normalize else "count"
670+
669671
if bins is None:
670672
result = self._value_counts(
671673
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
672674
)
675+
result.name = name
673676
return result
674677

675678
from pandas.core.reshape.merge import get_join_indexers
@@ -678,7 +681,7 @@ def value_counts(
678681
ids, _, _ = self.grouper.group_info
679682
val = self.obj._values
680683

681-
names = self.grouper.names + [self.obj.name]
684+
index_names = self.grouper.names + [self.obj.name]
682685

683686
if is_categorical_dtype(val.dtype) or (
684687
bins is not None and not np.iterable(bins)
@@ -693,7 +696,8 @@ def value_counts(
693696
ascending=ascending,
694697
bins=bins,
695698
)
696-
ser.index.names = names
699+
ser.name = name
700+
ser.index.names = index_names
697701
return ser
698702

699703
# groupby removes null keys from groupings
@@ -803,13 +807,14 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:
803807
codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
804808
codes.append(left[-1])
805809

806-
mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
810+
mi = MultiIndex(
811+
levels=levels, codes=codes, names=index_names, verify_integrity=False
812+
)
807813

808814
if is_integer_dtype(out.dtype):
809815
out = ensure_int64(out)
810-
result = self.obj._constructor(out, index=mi, name=self.obj.name)
816+
result = self.obj._constructor(out, index=mi, name=name)
811817
if not self.as_index:
812-
result.name = "proportion" if normalize else "count"
813818
result = result.reset_index()
814819
return result
815820

@@ -2204,7 +2209,7 @@ def value_counts(
22042209
male low FR 2
22052210
US 1
22062211
medium FR 1
2207-
dtype: int64
2212+
Name: count, dtype: int64
22082213
22092214
>>> df.groupby('gender').value_counts(ascending=True)
22102215
gender education country
@@ -2213,7 +2218,7 @@ def value_counts(
22132218
male low US 1
22142219
medium FR 1
22152220
low FR 2
2216-
dtype: int64
2221+
Name: count, dtype: int64
22172222
22182223
>>> df.groupby('gender').value_counts(normalize=True)
22192224
gender education country
@@ -2222,7 +2227,7 @@ def value_counts(
22222227
male low FR 0.50
22232228
US 0.25
22242229
medium FR 0.25
2225-
dtype: float64
2230+
Name: proportion, dtype: float64
22262231
22272232
>>> df.groupby('gender', as_index=False).value_counts()
22282233
gender education country count

pandas/core/groupby/groupby.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -2205,6 +2205,7 @@ def _value_counts(
22052205
raise NotImplementedError(
22062206
"DataFrameGroupBy.value_counts only handles axis=0"
22072207
)
2208+
name = "proportion" if normalize else "count"
22082209

22092210
with self._group_selection_context():
22102211
df = self.obj
@@ -2213,8 +2214,8 @@ def _value_counts(
22132214
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
22142215
}
22152216
if isinstance(self._selected_obj, Series):
2216-
name = self._selected_obj.name
2217-
keys = [] if name in in_axis_names else [self._selected_obj]
2217+
_name = self._selected_obj.name
2218+
keys = [] if _name in in_axis_names else [self._selected_obj]
22182219
else:
22192220
unique_cols = set(self._selected_obj.columns)
22202221
if subset is not None:
@@ -2237,8 +2238,8 @@ def _value_counts(
22372238
keys = [
22382239
# Can't use .values because the column label needs to be preserved
22392240
self._selected_obj.iloc[:, idx]
2240-
for idx, name in enumerate(self._selected_obj.columns)
2241-
if name not in in_axis_names and name in subsetted
2241+
for idx, _name in enumerate(self._selected_obj.columns)
2242+
if _name not in in_axis_names and _name in subsetted
22422243
]
22432244

22442245
groupings = list(self.grouper.groupings)
@@ -2261,6 +2262,7 @@ def _value_counts(
22612262
dropna=self.dropna,
22622263
)
22632264
result_series = cast(Series, gb.size())
2265+
result_series.name = name
22642266

22652267
# GH-46357 Include non-observed categories
22662268
# of non-grouping columns regardless of `observed`
@@ -2304,7 +2306,6 @@ def _value_counts(
23042306
result = result_series
23052307
else:
23062308
# Convert to frame
2307-
name = "proportion" if normalize else "count"
23082309
index = result_series.index
23092310
columns = com.fill_missing_names(index.names)
23102311
if name in columns:

pandas/tests/arrays/boolean/test_function.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -92,20 +92,20 @@ def test_ufunc_reduce_raises(values):
9292
def test_value_counts_na():
9393
arr = pd.array([True, False, pd.NA], dtype="boolean")
9494
result = arr.value_counts(dropna=False)
95-
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
95+
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count")
9696
assert expected.index.dtype == arr.dtype
9797
tm.assert_series_equal(result, expected)
9898

9999
result = arr.value_counts(dropna=True)
100-
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")
100+
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count")
101101
assert expected.index.dtype == arr.dtype
102102
tm.assert_series_equal(result, expected)
103103

104104

105105
def test_value_counts_with_normalize():
106106
ser = pd.Series([True, False, pd.NA], dtype="boolean")
107107
result = ser.value_counts(normalize=True)
108-
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
108+
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2
109109
assert expected.index.dtype == "boolean"
110110
tm.assert_series_equal(result, expected)
111111

pandas/tests/arrays/floating/test_function.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ def test_value_counts_na():
102102
result = arr.value_counts(dropna=False)
103103
idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
104104
assert idx.dtype == arr.dtype
105-
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64")
105+
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count")
106106
tm.assert_series_equal(result, expected)
107107

108108
result = arr.value_counts(dropna=True)
109-
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64")
109+
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count")
110110
tm.assert_series_equal(result, expected)
111111

112112

@@ -115,14 +115,14 @@ def test_value_counts_empty():
115115
result = ser.value_counts()
116116
idx = pd.Index([], dtype="Float64")
117117
assert idx.dtype == "Float64"
118-
expected = pd.Series([], index=idx, dtype="Int64")
118+
expected = pd.Series([], index=idx, dtype="Int64", name="count")
119119
tm.assert_series_equal(result, expected)
120120

121121

122122
def test_value_counts_with_normalize():
123123
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
124124
result = ser.value_counts(normalize=True)
125-
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
125+
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
126126
assert expected.index.dtype == ser.dtype
127127
tm.assert_series_equal(result, expected)
128128

0 commit comments

Comments
 (0)