diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 39e8e9008a844..cfbbb9e94628f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -708,8 +708,9 @@ def value_counts( if is_extension_array_dtype(values): # handle Categorical and sparse, - result = Series(values)._values.value_counts(dropna=dropna) - result.name = name + arr = extract_array(values) + index, counts = arr._value_counts(dropna=dropna) + result = Series(counts, index=index, name=name) counts = result.values else: diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 409be244c4327..9daa88c6cabc7 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -528,11 +528,9 @@ def astype(self, dtype, copy=True): data = self._coerce_to_ndarray(na_value=na_value) return astype_nansafe(data, dtype, copy=False) - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. + Return a tuple describing the counts for each value. Parameters ---------- @@ -541,15 +539,14 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : BooleanArray + values : ndarray[int64] See Also -------- Series.value_counts - """ - - from pandas import Index, Series + from pandas import Index # compute counts on the data with no nans data = self._data[~self._mask] @@ -571,8 +568,7 @@ def value_counts(self, dropna=True): index = Index( np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object ) - - return Series(array, index=index) + return index, array def _values_for_argsort(self) -> np.ndarray: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7386c9d0ef1de..04879f79b91fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1423,9 +1423,9 @@ def dropna(self): return result - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Return a Series containing counts of each category. + Return a tuple describing the counts of each category. Every category will have an entry, even those with a count of 0. @@ -1436,17 +1436,21 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : Categorical + values : ndarray[int64] See Also -------- Series.value_counts """ - from pandas import Series, CategoricalIndex - code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code - ix, clean = np.arange(ncat), mask.all() + code = self._codes + mask = 0 <= code + clean = mask.all() + + cat = self.categories + ncat = len(cat) + ix = np.arange(ncat) if dropna or clean: obs = code if clean else code[mask] @@ -1455,9 +1459,8 @@ def value_counts(self, dropna=True): count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, fastpath=True) - - return Series(count, index=CategoricalIndex(ix), dtype="int64") + index = self._constructor(ix, dtype=self.dtype, fastpath=True) + return index, count.astype(np.int64) def _internal_get_values(self): """ @@ -2323,7 +2326,11 @@ def describe(self): description: `DataFrame` A dataframe with frequency and counts by category. """ - counts = self.value_counts(dropna=False) + from pandas import Series + + index, values = self._value_counts(dropna=False) + counts = Series(values, index=index) + freqs = counts / float(counts.sum()) from pandas.core.reshape.concat import concat diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2bdd9acaeb70f..3ab6bcb7759a9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -679,33 +679,34 @@ def repeat(self, repeats, *args, **kwargs): values = self._data.repeat(repeats) return type(self)(values.view("i8"), dtype=self.dtype) - def value_counts(self, dropna=False): + def _value_counts(self, dropna: bool = False): """ - Return a Series containing counts of unique values. + Return an array of unique values and an array of their counts. Parameters ---------- - dropna : bool, default True - Don't include counts of NaT values. + dropna : bool, default False Returns ------- - Series + ExtensionArray + ndarray[int64] """ - from pandas import Series, Index - if dropna: - values = self[~self.isna()]._data + values = self[~self.isna()] else: - values = self._data + values = self - cls = type(self) + arg = values._values_for_factorize()[0] - result = value_counts(values, sort=False, dropna=dropna) - index = Index( - cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name - ) - return Series(result.values, index=index, name=result.name) + result = value_counts(arg, sort=False, dropna=False) + + freq = self.freq if is_period_dtype(self) else None + idx = result.index + new_index = type(self)(idx, dtype=self.dtype, freq=freq) # type: ignore + counts = result.values + + return new_index, counts def map(self, mapper): # TODO(GH-23179): Add ExtensionArray.map diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0922f4ac6f71d..71bf72fe76f53 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -578,11 +578,9 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. + Return a tuple describing the counts for each value. Parameters ---------- @@ -591,15 +589,15 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : IntegerArray + values : ndarray[int64] See Also -------- Series.value_counts - """ - from pandas import Index, Series + from pandas import Index # compute counts on the data with no nans data = self._data[~self._mask] @@ -624,8 +622,7 @@ def value_counts(self, dropna=True): ), dtype=object, ) - - return Series(array, index=index) + return index, array def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 75dd00104db1b..e400c5ec2614d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -855,25 +855,12 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): return self._shallow_copy(left_take, right_take) - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each interval. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ + def _value_counts(self, dropna=True): # TODO: implement this is a non-naive way! - return value_counts(np.asarray(self), dropna=dropna) + + arg = self._values_for_factorize()[0] + result = value_counts(arg, dropna=dropna) + return result.index, result.values # Formatting diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 9838cdfabbb95..8a6509624cac8 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -34,7 +34,7 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray +from pandas.core.dtypes.generic import ABCSeries, ABCSparseArray from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos @@ -696,20 +696,19 @@ def factorize(self, na_sentinel=-1): uniques = SparseArray(uniques, dtype=self.dtype) return codes, uniques - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of unique values. + Return an array of unique values and an array of their counts. Parameters ---------- - dropna : boolean, default True - Don't include counts of NaN, even if NaN is in sp_values. + dropna : bool, default True Returns ------- - counts : Series + ndarray + ndarray[int64] """ - from pandas import Index, Series keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps @@ -728,10 +727,7 @@ def value_counts(self, dropna=True): keys = np.insert(keys, 0, self.fill_value) counts = np.insert(counts, 0, fcounts) - if not isinstance(keys, ABCIndexClass): - keys = Index(keys) - result = Series(counts, index=keys) - return result + return keys, counts # -------- # Indexing diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0da877fb1ad45..610a5c3db563b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -250,10 +250,11 @@ def astype(self, dtype, copy=True): def _reduce(self, name, skipna=True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def value_counts(self, dropna=False): + def _value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + result = value_counts(self._ndarray, dropna=dropna) + return result.index, result.values # Overrride parent because we have different return types. @classmethod diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index bca629ae32270..4428d8027bcfe 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -214,13 +214,16 @@ def test_value_counts_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti).repeat([4, 3]) - result = arr.value_counts() + index, values = arr._value_counts() + result = pd.Series(values, index=index) # Note: not tm.assert_index_equal, since `freq`s do not match assert result.index.equals(dti) arr[-2] = pd.NaT - result = arr.value_counts() + index, values = arr._value_counts() + result = pd.Series(values, index=index) + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected)