diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4039276caa0af..7d9b648887f1a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -330,6 +330,36 @@ a supported dtype: pd.Series(["2016-01-01"], dtype="datetime64[D]") +.. _whatsnew_200.api_breaking.value_counts: + +Value counts sets the resulting name to ``count`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In past versions, when running :meth:`Series.value_counts`, the result would inherit +the original object's name, and the result index would be nameless. This would cause +confusion when resetting the index, and the column names would not correspond with the +column values. +Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed), +and the index will be named after the original object (:issue:`49497`). + +*Previous behavior*: + +.. code-block:: ipython + + In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + + Out[8]: + quetzal 2 + elk 1 + Name: animal, dtype: int64 + +*New behavior*: + +.. ipython:: python + + pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + +Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`). + .. 
_whatsnew_200.api_breaking.astype_to_unsupported_datetimelike: Disallow astype conversion to non-supported datetime64/timedelta64 dtypes diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 17abed45f65cc..1a22c3fe327e9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -847,7 +847,8 @@ def value_counts( Series, ) - name = getattr(values, "name", None) + index_name = getattr(values, "name", None) + name = "proportion" if normalize else "count" if bins is not None: from pandas.core.reshape.tile import cut @@ -860,6 +861,7 @@ def value_counts( # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) + result.name = name result = result[result.index.notna()] result.index = result.index.astype("interval") result = result.sort_index() @@ -878,14 +880,18 @@ def value_counts( # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) result.name = name + result.index.name = index_name counts = result._values elif isinstance(values, ABCMultiIndex): # GH49558 levels = list(range(values.nlevels)) - result = Series(index=values).groupby(level=levels, dropna=dropna).size() - # TODO: allow index names to remain (see discussion in GH49497) - result.index.names = [None] * values.nlevels + result = ( + Series(index=values, name=name) + .groupby(level=levels, dropna=dropna) + .size() + ) + result.index.names = values.names counts = result._values else: @@ -899,6 +905,7 @@ def value_counts( idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) + idx.name = index_name result = Series(counts, index=idx, name=name) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0f81ab5f7b424..5cf043242fb31 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -934,7 +934,7 @@ def value_counts(self, dropna: bool = True) -> Series: index = Index(type(self)(values)) - return 
Series(counts, index=index).astype("Int64") + return Series(counts, index=index, name="count").astype("Int64") @classmethod def _concat_same_type( diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5b61695410474..eacd9137db224 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1499,7 +1499,7 @@ def value_counts(self, dropna: bool = True) -> Series: ix = coerce_indexer_dtype(ix, self.dtype.categories) ix = self._from_backing_data(ix) - return Series(count, index=CategoricalIndex(ix), dtype="int64") + return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count") # error: Argument 2 of "_empty" is incompatible with supertype # "NDArrayBackedExtensionArray"; supertype defines the argument type as diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d45fe05d52937..8324d4b2618f1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -996,7 +996,7 @@ def value_counts(self, dropna: bool = True) -> Series: ) if dropna: - res = Series(value_counts, index=keys) + res = Series(value_counts, index=keys, name="count") res.index = res.index.astype(self.dtype) res = res.astype("Int64") return res @@ -1012,7 +1012,7 @@ def value_counts(self, dropna: bool = True) -> Series: mask = np.zeros(len(counts), dtype="bool") counts_array = IntegerArray(counts, mask) - return Series(counts_array, index=index) + return Series(counts_array, index=index, name="count") @doc(ExtensionArray.equals) def equals(self, other) -> bool: diff --git a/pandas/core/base.py b/pandas/core/base.py index 9cf93ea8eee2f..275fd989841a8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -966,7 +966,7 @@ def value_counts( 1.0 1 2.0 1 4.0 1 - dtype: int64 + Name: count, dtype: int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. 
@@ -977,7 +977,7 @@ def value_counts( 1.0 0.2 2.0 0.2 4.0 0.2 - dtype: float64 + Name: proportion, dtype: float64 **bins** @@ -990,7 +990,7 @@ def value_counts( (0.996, 2.0] 2 (2.0, 3.0] 2 (3.0, 4.0] 1 - dtype: int64 + Name: count, dtype: int64 **dropna** @@ -1002,7 +1002,7 @@ def value_counts( 2.0 1 4.0 1 NaN 1 - dtype: int64 + Name: count, dtype: int64 """ return algorithms.value_counts( self, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69c2476681021..9426f3fa495cc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7003,28 +7003,28 @@ def value_counts( 4 0 2 2 2 1 6 0 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(sort=False) num_legs num_wings 2 2 1 4 0 2 6 0 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(ascending=True) num_legs num_wings 2 2 1 6 0 1 4 0 2 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 2 2 0.25 6 0 0.25 - dtype: float64 + Name: proportion, dtype: float64 With `dropna` set to `False` we can also count rows with NA values. 
@@ -7041,7 +7041,7 @@ def value_counts( first_name middle_name Beth Louise 1 John Smith 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(dropna=False) first_name middle_name @@ -7049,12 +7049,14 @@ def value_counts( Beth Louise 1 John Smith 1 NaN 1 - dtype: int64 + Name: count, dtype: int64 """ if subset is None: subset = self.columns.tolist() + name = "proportion" if normalize else "count" counts = self.groupby(subset, dropna=dropna).grouper.size() + counts.name = name if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2340c36d14301..e28a52e66689e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -666,10 +666,13 @@ def value_counts( bins=None, dropna: bool = True, ) -> Series | DataFrame: + name = "proportion" if normalize else "count" + if bins is None: result = self._value_counts( normalize=normalize, sort=sort, ascending=ascending, dropna=dropna ) + result.name = name return result from pandas.core.reshape.merge import get_join_indexers @@ -678,7 +681,7 @@ def value_counts( ids, _, _ = self.grouper.group_info val = self.obj._values - names = self.grouper.names + [self.obj.name] + index_names = self.grouper.names + [self.obj.name] if is_categorical_dtype(val.dtype) or ( bins is not None and not np.iterable(bins) @@ -693,7 +696,8 @@ def value_counts( ascending=ascending, bins=bins, ) - ser.index.names = names + ser.name = name + ser.index.names = index_names return ser # groupby removes null keys from groupings @@ -803,13 +807,14 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] codes.append(left[-1]) - mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) + mi = MultiIndex( + levels=levels, codes=codes, names=index_names, verify_integrity=False + ) if is_integer_dtype(out.dtype): out = ensure_int64(out) - result = 
self.obj._constructor(out, index=mi, name=self.obj.name) + result = self.obj._constructor(out, index=mi, name=name) if not self.as_index: - result.name = "proportion" if normalize else "count" result = result.reset_index() return result @@ -2205,7 +2210,7 @@ def value_counts( male low FR 2 US 1 medium FR 1 - dtype: int64 + Name: count, dtype: int64 >>> df.groupby('gender').value_counts(ascending=True) gender education country @@ -2214,7 +2219,7 @@ def value_counts( male low US 1 medium FR 1 low FR 2 - dtype: int64 + Name: count, dtype: int64 >>> df.groupby('gender').value_counts(normalize=True) gender education country @@ -2223,7 +2228,7 @@ def value_counts( male low FR 0.50 US 0.25 medium FR 0.25 - dtype: float64 + Name: proportion, dtype: float64 >>> df.groupby('gender', as_index=False).value_counts() gender education country count diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2bf0a8f0b4293..d597b85a8b690 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2205,6 +2205,7 @@ def _value_counts( raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" ) + name = "proportion" if normalize else "count" with self._group_selection_context(): df = self.obj @@ -2213,8 +2214,8 @@ def _value_counts( grouping.name for grouping in self.grouper.groupings if grouping.in_axis } if isinstance(self._selected_obj, Series): - name = self._selected_obj.name - keys = [] if name in in_axis_names else [self._selected_obj] + _name = self._selected_obj.name + keys = [] if _name in in_axis_names else [self._selected_obj] else: unique_cols = set(self._selected_obj.columns) if subset is not None: @@ -2237,8 +2238,8 @@ def _value_counts( keys = [ # Can't use .values because the column label needs to be preserved self._selected_obj.iloc[:, idx] - for idx, name in enumerate(self._selected_obj.columns) - if name not in in_axis_names and name in subsetted + for idx, _name in 
enumerate(self._selected_obj.columns) + if _name not in in_axis_names and _name in subsetted ] groupings = list(self.grouper.groupings) @@ -2261,6 +2262,7 @@ def _value_counts( dropna=self.dropna, ) result_series = cast(Series, gb.size()) + result_series.name = name # GH-46357 Include non-observed categories # of non-grouping columns regardless of `observed` @@ -2304,7 +2306,6 @@ def _value_counts( result = result_series else: # Convert to frame - name = "proportion" if normalize else "count" index = result_series.index columns = com.fill_missing_names(index.names) if name in columns: diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index b484dc39cf23b..2b3f3d3d16ac6 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -92,12 +92,12 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=arr, dtype="Int64") + expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64") + expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) @@ -105,7 +105,7 @@ def test_value_counts_na(): def test_value_counts_with_normalize(): ser = pd.Series([True, False, pd.NA], dtype="boolean") result = ser.value_counts(normalize=True) - expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2 + expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2 assert expected.index.dtype == "boolean" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_function.py 
b/pandas/tests/arrays/floating/test_function.py index f2af3118c6cbe..40fd66fd049a6 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -102,11 +102,11 @@ def test_value_counts_na(): result = arr.value_counts(dropna=False) idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype) assert idx.dtype == arr.dtype - expected = pd.Series([2, 1, 1], index=idx, dtype="Int64") + expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64") + expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count") tm.assert_series_equal(result, expected) @@ -115,14 +115,14 @@ def test_value_counts_empty(): result = ser.value_counts() idx = pd.Index([], dtype="Float64") assert idx.dtype == "Float64" - expected = pd.Series([], index=idx, dtype="Int64") + expected = pd.Series([], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index e5177e9e50d71..d48b636a98feb 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -113,11 +113,11 @@ def test_value_counts_na(): result = arr.value_counts(dropna=False) ex_index = pd.Index([1, 2, pd.NA], dtype="Int64") assert ex_index.dtype == "Int64" - expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64") + expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64", 
name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) @@ -128,7 +128,7 @@ def test_value_counts_empty(): result = ser.value_counts() idx = pd.Index([], dtype=ser.dtype) assert idx.dtype == ser.dtype - expected = pd.Series([], index=idx, dtype="Int64") + expected = pd.Series([], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) @@ -136,7 +136,7 @@ def test_value_counts_with_normalize(): # GH 33172 ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e1ea001819b1c..044ad089ae958 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -471,18 +471,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64") + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(dtype): ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) result = 
ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index fb5627b68abff..53c9b3d174967 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -479,7 +479,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts(dropna=False) - expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT]) + expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT], name="count") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 3aa0827b22a78..4f5e8adbcdf93 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -13,6 +13,7 @@ Index, Interval, IntervalIndex, + MultiIndex, Series, Timedelta, TimedeltaIndex, @@ -27,7 +28,7 @@ def test_value_counts(index_or_series_obj): result = obj.value_counts() counter = collections.Counter(obj) - expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected = Series(dict(counter.most_common()), dtype=np.int64, name="count") if obj.dtype != np.float16: expected.index = expected.index.astype(obj.dtype) @@ -35,6 +36,10 @@ def test_value_counts(index_or_series_obj): with pytest.raises(NotImplementedError, match="float16 indexes are not "): expected.index.astype(obj.dtype) return + if isinstance(expected.index, MultiIndex): + expected.index.names = obj.names + else: + expected.index.name = obj.name if not isinstance(result.dtype, np.dtype): # i.e IntegerDtype @@ -65,7 +70,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): pytest.skip("type doesn't allow for NA operations") elif len(obj) 
< 1: pytest.skip("Test doesn't make sense on empty data") - elif isinstance(orig, pd.MultiIndex): + elif isinstance(orig, MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") values = obj._values @@ -78,7 +83,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, whereas None wouldn't counter = collections.Counter(obj.dropna()) - expected = Series(dict(counter.most_common()), dtype=np.int64) + expected = Series(dict(counter.most_common()), dtype=np.int64, name="count") if obj.dtype != np.float16: expected.index = expected.index.astype(obj.dtype) @@ -86,6 +91,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): with pytest.raises(NotImplementedError, match="float16 indexes are not "): expected.index.astype(obj.dtype) return + expected.index.name = obj.name result = obj.value_counts() if obj.duplicated().any(): @@ -130,7 +136,7 @@ def test_value_counts_inferred(index_or_series): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -144,17 +150,19 @@ def test_value_counts_inferred(index_or_series): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab")) + expected = Series([1, 2, 3, 4], index=list("cdab"), name="count") tm.assert_series_equal(hist, expected) # relative histogram. 
hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + expected = Series( + [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion" + ) tm.assert_series_equal(hist, expected) @@ -170,10 +178,10 @@ def test_value_counts_bins(index_or_series): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}, name="count") tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion") tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -187,22 +195,24 @@ def test_value_counts_bins(index_or_series): # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count") tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count") tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2])) + exp4n = Series( + [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion" + ) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"]) + expected = Series([4, 3, 2], index=["b", "a", "d"], name="count") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -214,7 +224,7 @@ def 
test_value_counts_bins(index_or_series): assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype=np.int64) + expected = Series([], dtype=np.int64, name="count") tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): @@ -252,7 +262,7 @@ def test_value_counts_datetime64(index_or_series): idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] ) - expected_s = Series([3, 2, 1], index=idx) + expected_s = Series([3, 2, 1], index=idx, name="count") tm.assert_series_equal(s.value_counts(), expected_s) expected = pd.array( @@ -277,7 +287,9 @@ def test_value_counts_datetime64(index_or_series): tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) - expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) + expected_s = pd.concat( + [Series([4], index=DatetimeIndex([pd.NaT]), name="count"), expected_s] + ) tm.assert_series_equal(result, expected_s) assert s.dtype == "datetime64[ns]" @@ -300,7 +312,7 @@ def test_value_counts_datetime64(index_or_series): td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt") + expected_s = Series([6], index=Index([Timedelta("1day")], name="dt"), name="count") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"], name="dt") @@ -323,7 +335,7 @@ def test_value_counts_with_nan(dropna, index_or_series): obj = klass(values) res = obj.value_counts(dropna=dropna) if dropna is True: - expected = Series([1], index=Index([True], dtype=obj.dtype)) + expected = Series([1], index=Index([True], dtype=obj.dtype), name="count") else: - expected = Series([1, 1, 1], index=[True, pd.NA, np.nan]) + expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count") tm.assert_series_equal(res, expected) diff --git 
a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 54d41fa9d972a..b74372017f303 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -48,9 +48,11 @@ def test_value_counts_with_normalize(self, data): result = ser.value_counts(normalize=True).sort_index() if not isinstance(data, pd.Categorical): - expected = pd.Series([1 / len(values)] * len(values), index=result.index) + expected = pd.Series( + [1 / len(values)] * len(values), index=result.index, name="proportion" + ) else: - expected = pd.Series(0.0, index=result.index) + expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) if na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 9fa9f27e7e312..2cff2c4b2bc57 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -53,7 +53,9 @@ def test_asfreq2(self, frame_or_series): if frame_or_series is Series: daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + expected = Series( + [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count" + ).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty(self, frame_or_series): diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 9859ffb83da66..e8c129fd12bfd 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -16,6 +16,7 @@ def test_data_frame_value_counts_unsorted(): index=pd.MultiIndex.from_arrays( [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -33,6 +34,7 @@ def 
test_data_frame_value_counts_ascending(): index=pd.MultiIndex.from_arrays( [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -50,6 +52,7 @@ def test_data_frame_value_counts_default(): index=pd.MultiIndex.from_arrays( [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -67,6 +70,7 @@ def test_data_frame_value_counts_normalize(): index=pd.MultiIndex.from_arrays( [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), + name="proportion", ) tm.assert_series_equal(result, expected) @@ -79,6 +83,7 @@ def test_data_frame_value_counts_single_col_default(): expected = pd.Series( data=[2, 1, 1], index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]), + name="count", ) tm.assert_series_equal(result, expected) @@ -88,7 +93,9 @@ def test_data_frame_value_counts_empty(): df_no_cols = pd.DataFrame() result = df_no_cols.value_counts() - expected = pd.Series([], dtype=np.int64, index=np.array([], dtype=np.intp)) + expected = pd.Series( + [], dtype=np.int64, name="count", index=np.array([], dtype=np.intp) + ) tm.assert_series_equal(result, expected) @@ -97,7 +104,9 @@ def test_data_frame_value_counts_empty_normalize(): df_no_cols = pd.DataFrame() result = df_no_cols.value_counts(normalize=True) - expected = pd.Series([], dtype=np.float64, index=np.array([], dtype=np.intp)) + expected = pd.Series( + [], dtype=np.float64, name="proportion", index=np.array([], dtype=np.intp) + ) tm.assert_series_equal(result, expected) @@ -116,6 +125,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): index=pd.MultiIndex.from_arrays( [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -141,6 +151,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], ), + 
name="count", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 1ab20c282b23a..78cadae3f206d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -219,7 +219,7 @@ def test_with_datetimelikes(self): t = df.T result = t.dtypes.value_counts() - expected = Series({np.dtype("object"): 10}) + expected = Series({np.dtype("object"): 10}, name="count") tm.assert_series_equal(result, expected) def test_deepcopy(self, float_frame): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index ab3494f0823ad..0453d7881a811 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -446,7 +446,9 @@ def test_df_flex_cmp_constant_return_types(self, opname): const = 2 result = getattr(df, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal( + result, Series([2], index=[np.dtype(bool)], name="count") + ) @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): @@ -456,7 +458,9 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): empty = df.iloc[:0] result = getattr(empty, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal( + result, Series([2], index=[np.dtype(bool)], name="count") + ) def test_df_flex_cmp_ea_dtype_with_ndarray_series(self): ii = pd.IntervalIndex.from_breaks([1, 2, 3]) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 30bf5eb39cf51..986ee48ca9876 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -271,7 +271,7 @@ def test_sorting_with_different_categoricals(): index = Categorical(index, categories=["low", "med", "high"], ordered=True) 
index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)] index = MultiIndex.from_arrays(index, names=["group", "dose"]) - expected = Series([2] * 6, index=index, name="dose") + expected = Series([2] * 6, index=index, name="count") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index ae4b74fc814da..ce5ce3e56ab98 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -38,7 +38,7 @@ def tests_value_counts_index_names_category_column(): df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") mi_expected = MultiIndex.from_frame(df_mi_expected) - expected = Series([1], index=mi_expected, name="gender") + expected = Series([1], index=mi_expected, name="count") tm.assert_series_equal(result, expected) @@ -85,12 +85,12 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) @pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_value_counts( - df, keys, bins, n, m, isort, normalize, sort, ascending, dropna + df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna ): def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) @@ -111,6 +111,8 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) right.index.names = right.index.names[:-1] + ["3rd"] + # https://github.com/pandas-dev/pandas/issues/49909 + right = right.rename(name) # have to sort on index 
because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 @@ -142,6 +144,8 @@ def test_series_groupby_value_counts_with_grouper(utc): result = dfg["Food"].value_counts().sort_index() expected = dfg["Food"].apply(Series.value_counts).sort_index() expected.index.names = result.index.names + # https://github.com/pandas-dev/pandas/issues/49909 + expected = expected.rename("count") tm.assert_series_equal(result, expected) @@ -153,7 +157,7 @@ def test_series_groupby_value_counts_empty(columns): dfg = df.groupby(columns[:-1]) result = dfg[columns[-1]].value_counts() - expected = Series([], name=columns[-1], dtype=result.dtype) + expected = Series([], dtype=result.dtype, name="count") expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns) tm.assert_series_equal(result, expected) @@ -166,7 +170,7 @@ def test_series_groupby_value_counts_one_row(columns): dfg = df.groupby(columns[:-1]) result = dfg[columns[-1]].value_counts() - expected = df.value_counts().rename(columns[-1]) + expected = df.value_counts() tm.assert_series_equal(result, expected) @@ -187,6 +191,7 @@ def test_series_groupby_value_counts_on_categorical(): ), ] ), + name="count", ) # Expected: @@ -213,7 +218,7 @@ def test_series_groupby_value_counts_no_sort(): codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], names=["country", "gender", "education"], ) - expected = Series([1, 1, 1, 2, 1], index=index, name="education") + expected = Series([1, 1, 1, 2, 1], index=index, name="count") tm.assert_series_equal(result, expected) @@ -257,6 +262,7 @@ def test_basic(education_df): ], names=["country", "gender", "education"], ), + name="proportion", ) tm.assert_series_equal(result, expected) @@ -266,7 +272,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("groupby", ["column", "array", "function"]) -@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("normalize, name", [(True, 
"proportion"), (False, "count")]) @pytest.mark.parametrize( "sort, ascending", [ @@ -278,7 +284,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, sort, ascending, as_index, frame + education_df, groupby, normalize, name, sort, ascending, as_index, frame ): # test all parameters: # - Use column, array or function as by= parameter @@ -323,7 +329,7 @@ def test_against_frame_and_seriesgroupby( expected = gp["both"].value_counts( normalize=normalize, sort=sort, ascending=ascending ) - expected.name = None + expected.name = name if as_index: index_frame = expected.index.to_frame(index=False) index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) @@ -382,16 +388,23 @@ def animals_df(): @pytest.mark.parametrize( - "sort, ascending, normalize, expected_data, expected_index", + "sort, ascending, normalize, name, expected_data, expected_index", [ - (False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), - (True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), - (True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), - (True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + ( + True, + False, + True, + "proportion", + [0.5, 0.25, 0.25], + [(1, 1, 1), (4, 2, 6), (0, 2, 0)], + ), ], ) def test_data_frame_value_counts( - animals_df, sort, ascending, normalize, expected_data, expected_index + animals_df, sort, ascending, normalize, name, expected_data, expected_index ): # 3-way compare with :meth:`~DataFrame.value_counts` # Tests from frame/methods/test_value_counts.py @@ -403,6 +416,7 
@@ def test_data_frame_value_counts( index=MultiIndex.from_arrays( expected_index, names=["key", "num_legs", "num_wings"] ), + name=name, ) tm.assert_series_equal(result_frame, expected) @@ -449,7 +463,7 @@ def test_dropna_combinations( for column in nulls_df.columns: columns[column] = [nulls_df[column][row] for row in expected_rows] index = MultiIndex.from_frame(columns) - expected = Series(data=expected_values, index=index) + expected = Series(data=expected_values, index=index, name="proportion") tm.assert_series_equal(result, expected) @@ -490,9 +504,9 @@ def names_with_nulls_df(nulls_fixture): ), ], ) -@pytest.mark.parametrize("normalize", [False, True]) +@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) def test_data_frame_value_counts_dropna( - names_with_nulls_df, dropna, normalize, expected_data, expected_index + names_with_nulls_df, dropna, normalize, name, expected_data, expected_index ): # GH 41334 # 3-way compare with :meth:`~DataFrame.value_counts` @@ -501,6 +515,7 @@ def test_data_frame_value_counts_dropna( expected = Series( data=expected_data, index=expected_index, + name=name, ) if normalize: expected /= float(len(expected_data)) @@ -517,17 +532,22 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_single_grouper_with_only_observed_categories( - education_df, as_index, observed, normalize, expected_data + education_df, as_index, observed, normalize, name, expected_data ): # Test single categorical grouper with only observed grouping 
categories @@ -559,6 +579,7 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_series = Series( data=expected_data, index=expected_index, + name=name, ) for i in range(3): expected_series.index = expected_series.index.set_levels( @@ -575,7 +596,7 @@ def test_categorical_single_grouper_with_only_observed_categories( def assert_categorical_single_grouper( - education_df, as_index, observed, expected_index, normalize, expected_data + education_df, as_index, observed, expected_index, normalize, name, expected_data ): # Test single categorical grouper when non-groupers are also categorical education_df = education_df.copy().astype("category") @@ -592,6 +613,7 @@ def assert_categorical_single_grouper( expected_index, names=["country", "gender", "education"], ), + name=name, ) for i in range(3): index_level = CategoricalIndex(expected_series.index.levels[i]) @@ -604,25 +626,28 @@ def assert_categorical_single_grouper( if as_index: tm.assert_series_equal(result, expected_series) else: - expected = expected_series.reset_index( - name="proportion" if normalize else "count" - ) + expected = expected_series.reset_index(name=name) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_single_grouper_observed_true( - education_df, as_index, normalize, expected_data + education_df, as_index, normalize, name, expected_data ): # GH#46357 @@ -647,22 +672,25 @@ def test_categorical_single_grouper_observed_true( observed=True, expected_index=expected_index, normalize=normalize, + name=name, expected_data=expected_data, ) 
@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ ( False, + "count", np.array( [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 ), ), ( True, + "proportion", np.array( [ 0.5, @@ -689,7 +717,7 @@ def test_categorical_single_grouper_observed_true( ], ) def test_categorical_single_grouper_observed_false( - education_df, as_index, normalize, expected_data + education_df, as_index, normalize, name, expected_data ): # GH#46357 @@ -720,6 +748,7 @@ def test_categorical_single_grouper_observed_false( observed=False, expected_index=expected_index, normalize=normalize, + name=name, expected_data=expected_data, ) @@ -758,18 +787,23 @@ def test_categorical_single_grouper_observed_false( ], ) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", # NaN values corresponds to non-observed groups np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_multiple_groupers( - education_df, as_index, observed, expected_index, normalize, expected_data + education_df, as_index, observed, expected_index, normalize, name, expected_data ): # GH#46357 @@ -789,6 +823,7 @@ def test_categorical_multiple_groupers( expected_index, names=["country", "education", "gender"], ), + name=name, ) for i in range(2): expected_series.index = expected_series.index.set_levels( @@ -807,18 +842,23 @@ def test_categorical_multiple_groupers( @pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + 
"count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", # NaN values corresponds to non-observed groups np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_non_groupers( - education_df, as_index, observed, normalize, expected_data + education_df, as_index, observed, normalize, name, expected_data ): # GH#46357 Test non-observed categories are included in the result, # regardless of `observed` @@ -849,6 +889,7 @@ def test_categorical_non_groupers( expected_index, names=["country", "gender", "education"], ), + name=name, ) for i in range(1, 3): expected_series.index = expected_series.index.set_levels( @@ -910,6 +951,7 @@ def test_column_label_duplicates(test, columns, expected_names, as_index): expected_data, names=expected_names, ), + name="count", ) tm.assert_series_equal(result, expected) else: @@ -943,7 +985,9 @@ def test_ambiguous_grouping(): df = DataFrame({"a": [1, 1]}) gb = df.groupby(np.array([1, 1], dtype=np.int64)) result = gb.value_counts() - expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"])) + expected = Series( + [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count" + ) tm.assert_series_equal(result, expected) @@ -968,7 +1012,9 @@ def test_subset(): df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) result = df.groupby(level=0).value_counts(subset=["c2"]) expected = Series( - [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]) + [1, 2], + index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]), + name="count", ) tm.assert_series_equal(result, expected) @@ -986,6 +1032,7 @@ def test_subset_duplicate_columns(): index=MultiIndex.from_arrays( [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -1020,5 +1067,5 @@ def test_value_counts_time_grouper(utc): codes=[[0, 1, 
1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]], names=["Datetime", "Timestamp", "Food"], ) - expected = Series(1, index=index) + expected = Series(1, index=index, name="count") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimelike_/test_value_counts.py b/pandas/tests/indexes/datetimelike_/test_value_counts.py index f0df6dd678ef5..a0f05a1a35d79 100644 --- a/pandas/tests/indexes/datetimelike_/test_value_counts.py +++ b/pandas/tests/indexes/datetimelike_/test_value_counts.py @@ -38,7 +38,7 @@ def _check_value_counts_with_repeats(self, orig): exp_idx = orig[::-1] if not isinstance(exp_idx, PeriodIndex): exp_idx = exp_idx._with_freq(None) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64", name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -89,13 +89,13 @@ def test_value_counts_unique_periodindex2(self): def _check_value_counts_dropna(self, idx): exp_idx = idx[[2, 3]] - expected = Series([3, 2], index=exp_idx) + expected = Series([3, 2], index=exp_idx, name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = idx[[2, 3, -1]] - expected = Series([3, 2, 1], index=exp_idx) + expected = Series([3, 2, 1], index=exp_idx, name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 5c7c4f9ce0b75..ab84e0781a8f5 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -252,7 +252,8 @@ def test_table_values_dtypes_roundtrip(setup_path): "int64": 1, "object": 1, "datetime64[ns]": 2, - } + }, + name="count", ) result = result.sort_index() expected = expected.sort_index() diff --git a/pandas/tests/reshape/test_get_dummies.py 
b/pandas/tests/reshape/test_get_dummies.py index daac5a0c9dac2..fab9b0a5d1846 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -102,7 +102,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): else: dtype_name = self.effective_dtype(dtype).name - expected = Series({dtype_name: 8}) + expected = Series({dtype_name: 8}, name="count") result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) @@ -112,7 +112,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - expected = Series(expected_counts).sort_index() + expected = Series(expected_counts, name="count").sort_index() result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] result = result.sort_index() diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index a7d57eee7e5a1..f54489ac8a8b4 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -5,6 +5,7 @@ from pandas import ( Categorical, CategoricalIndex, + Index, Series, ) import pandas._testing as tm @@ -23,9 +24,10 @@ def test_value_counts_datetime(self): ] exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + name="xxx", ) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -34,7 +36,7 @@ def test_value_counts_datetime(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, 
name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -51,15 +53,16 @@ def test_value_counts_datetime_tz(self): exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", + name="xxx", ) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -73,8 +76,10 @@ def test_value_counts_period(self): pd.Period("2011-03", freq="M"), ] - exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp_idx = pd.PeriodIndex( + ["2011-01", "2011-03", "2011-02"], freq="M", name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -83,7 +88,7 @@ def test_value_counts_period(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -91,8 +96,10 @@ def test_value_counts_categorical_ordered(self): # most dtypes are tested in tests/base values = Categorical([1, 2, 3, 1, 1, 3], ordered=True) - exp_idx = CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = Series([3, 2, 1], index=exp_idx, 
name="xxx") + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=True, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -101,15 +108,17 @@ def test_value_counts_categorical_ordered(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_categorical_not_ordered(self): values = Categorical([1, 2, 3, 1, 1, 3], ordered=False) - exp_idx = CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=False, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -118,7 +127,7 @@ def test_value_counts_categorical_not_ordered(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -128,21 +137,25 @@ def test_value_counts_categorical(self): ser = Series(cats, name="xxx") res = ser.value_counts(sort=False) - exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + exp_index = CategoricalIndex( + list("cabd"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 1, 2, 0], name="count", index=exp_index) tm.assert_series_equal(res, exp) res = 
ser.value_counts(sort=True) - exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + exp_index = CategoricalIndex( + list("cbad"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 2, 1, 0], name="count", index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in tests/base) ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = ser.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + exp = Series([3, 2, 1], name="count", index=Index(["c", "b", "a"], name="xxx")) tm.assert_series_equal(res, exp) def test_value_counts_categorical_with_nan(self): @@ -150,7 +163,7 @@ def test_value_counts_categorical_with_nan(self): # sanity check ser = Series(["a", "b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) @@ -168,18 +181,22 @@ def test_value_counts_categorical_with_nan(self): for ser in series: # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) # we don't exclude the count of None and sort by counts - exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + exp = Series( + [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), name="count" + ) res = ser.value_counts(dropna=False) tm.assert_series_equal(res, exp) # When we aren't sorting by counts, and np.nan isn't a # category, it should be last. 
- exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + exp = Series( + [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), name="count" + ) res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) @@ -189,17 +206,17 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), False, - Series([2, 1, 1], index=[True, False, pd.NA]), + Series([2, 1, 1], index=[True, False, pd.NA], name="count"), ), ( Series([False, True, True, pd.NA]), True, - Series([2, 1], index=pd.Index([True, False], dtype=object)), + Series([2, 1], index=Index([True, False], dtype=object), name="count"), ), ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[True, False, np.nan]), + Series([1, 1, 1], index=[True, False, np.nan], name="count"), ), ], ) @@ -213,11 +230,19 @@ def test_value_counts_bool_with_nan(self, ser, dropna, exp): [ ( [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], - Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex128)), + Series( + [3, 2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex128), + name="count", + ), ), ( np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64), - Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex64)), + Series( + [3, 2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex64), + name="count", + ), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ce3388973458f..d96109ecd960c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1182,18 +1182,22 @@ def test_value_counts(self): result = algos.value_counts(factor) breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) - expected = Series([1, 1, 1, 1], index=index) + expected = Series([1, 1, 1, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] result = 
algos.value_counts(s, bins=1) - expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series( + [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count" + ) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) expected = Series( - [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + [2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]), + name="count", ) tm.assert_series_equal(result, expected) @@ -1221,7 +1225,7 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) == 2 - exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count") tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) @@ -1243,7 +1247,7 @@ def test_value_counts_datetime_outofbounds(self): [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], dtype=object, ) - exp = Series([3, 2, 1], index=exp_index) + exp = Series([3, 2, 1], index=exp_index, name="count") tm.assert_series_equal(res, exp) # GH 12424 @@ -1255,7 +1259,9 @@ def test_value_counts_datetime_outofbounds(self): def test_categorical(self): s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) + expected = Series( + [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count" + ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1272,10 +1278,13 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) + expected = Series( + [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count" + ) 
tm.assert_series_equal(result, expected, check_index_type=True) # out of order @@ -1287,8 +1296,11 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex( - ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ["a", "b", "c"], + categories=["b", "a", "c"], + ordered=True, ), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1298,6 +1310,7 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True ), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1310,6 +1323,7 @@ def test_categorical_zeroes(self): index=Categorical( ["b", "a", "c", "d"], categories=list("abcd"), ordered=True ), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1318,37 +1332,37 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], name="count"), ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], name="count"), ) tm.assert_series_equal( Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True), - Series([3, 2], index=Index([True, False], dtype=object)), + Series([3, 2], index=Index([True, False], dtype=object), name="count"), ) tm.assert_series_equal( Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False), - Series([5, 3, 2], index=[True, False, np.nan]), + Series([5, 3, 2], index=[True, False, np.nan], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=False), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 
10.3], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], name="count"), ) result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan]) + expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan], name="count") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]")) @@ -1358,23 +1372,27 @@ def test_value_counts_normalized(self, dtype): s_typed = s.astype(dtype) result = s_typed.value_counts(normalize=True, dropna=False) expected = Series( - [0.5, 0.3, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=dtype) + [0.5, 0.3, 0.2], + index=Series([np.nan, 2.0, 1.0], dtype=dtype), + name="proportion", ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype)) + expected = Series( + [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion" + ) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): arr = np.array([2**63], dtype=np.uint64) - expected = Series([1], index=[2**63]) + expected = Series([1], index=[2**63], name="count") result = algos.value_counts(arr) tm.assert_series_equal(result, expected) arr = np.array([-1, 2**63], dtype=object) - expected = Series([1, 1], index=[-1, 2**63]) + expected = Series([1, 1], index=[-1, 2**63], name="count") result = algos.value_counts(arr) tm.assert_series_equal(result, expected)