From 6e15159c30a92c725ac276155c9eb63e29db2f0e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Jan 2020 19:05:53 -0800 Subject: [PATCH 1/5] REF: EA value_counts -> _value_counts --- pandas/core/algorithms.py | 5 +++-- pandas/core/arrays/boolean.py | 16 ++++++-------- pandas/core/arrays/categorical.py | 29 ++++++++++++++++---------- pandas/core/arrays/datetimelike.py | 30 +++++++++++++-------------- pandas/core/arrays/integer.py | 15 ++++++-------- pandas/core/arrays/interval.py | 9 ++++++++ pandas/core/arrays/sparse/array.py | 18 +++++++--------- pandas/core/arrays/string_.py | 5 +++-- pandas/tests/arrays/test_datetimes.py | 7 +++++-- 9 files changed, 72 insertions(+), 62 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42cfd9d54ac19..b0ff79b047006 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -705,8 +705,9 @@ def value_counts( if is_extension_array_dtype(values): # handle Categorical and sparse, - result = Series(values)._values.value_counts(dropna=dropna) - result.name = name + arr = extract_array(values) + index, counts = arr._value_counts(dropna=dropna) + result = Series(counts, index=index, name=name) counts = result.values else: diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 102150b1cbce1..0e12e631c46f2 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -539,11 +539,9 @@ def astype(self, dtype, copy=True): data = self._coerce_to_ndarray(na_value=na_value) return astype_nansafe(data, dtype, copy=False) - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. + Return a tuple describing the counts for each value. 
Parameters ---------- @@ -552,15 +550,14 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : BooleanArray + values : ndarray[int64] See Also -------- Series.value_counts - """ - - from pandas import Index, Series + from pandas import Index # compute counts on the data with no nans data = self._data[~self._mask] @@ -582,8 +579,7 @@ def value_counts(self, dropna=True): index = Index( np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object ) - - return Series(array, index=index) + return index, array def _values_for_argsort(self) -> np.ndarray: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f4e75364ae932..93a354a47ab0f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1423,9 +1423,9 @@ def dropna(self): return result - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Return a Series containing counts of each category. + Return a tuple describing the counts of each category. Every category will have an entry, even those with a count of 0. 
@@ -1436,17 +1436,21 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : Categorical + values : ndarray[int64] See Also -------- Series.value_counts """ - from pandas import Series, CategoricalIndex - code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code - ix, clean = np.arange(ncat), mask.all() + code = self._values_for_factorize()[0] + mask = 0 <= code + clean = mask.all() + + cat = self.categories + ncat = len(cat) + ix = np.arange(ncat) if dropna or clean: obs = code if clean else code[mask] @@ -1455,9 +1459,8 @@ def value_counts(self, dropna=True): count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, fastpath=True) - - return Series(count, index=CategoricalIndex(ix), dtype="int64") + index = self._constructor(ix, dtype=self.dtype, fastpath=True) + return index, count def _internal_get_values(self): """ @@ -2323,7 +2326,11 @@ def describe(self): description: `DataFrame` A dataframe with frequency and counts by category. """ - counts = self.value_counts(dropna=False) + from pandas import Series + + index, values = self._value_counts(dropna=False) + counts = Series(values, index=index) + freqs = counts / float(counts.sum()) from pandas.core.reshape.concat import concat diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2bdd9acaeb70f..814e5dddd756c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -679,33 +679,33 @@ def repeat(self, repeats, *args, **kwargs): values = self._data.repeat(repeats) return type(self)(values.view("i8"), dtype=self.dtype) - def value_counts(self, dropna=False): + def _value_counts(self, dropna: bool = False): """ - Return a Series containing counts of unique values. + Return an array of unique values and an array of their counts. Parameters ---------- - dropna : bool, default True - Don't include counts of NaT values. 
+ dropna : bool, default False Returns ------- - Series + ExtensionArray + ndarray[int64] """ - from pandas import Series, Index - if dropna: - values = self[~self.isna()]._data + values = self[~self.isna()] else: - values = self._data + values = self - cls = type(self) + arg = values._values_for_factorize()[0] - result = value_counts(values, sort=False, dropna=dropna) - index = Index( - cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name - ) - return Series(result.values, index=index, name=result.name) + result = value_counts(arg, sort=False, dropna=False) + + freq = self.freq if is_period_dtype(self) else None + new_index = type(self)(result.index, dtype=self.dtype, freq=freq) + counts = result.values + + return new_index, counts def map(self, mapper): # TODO(GH-23179): Add ExtensionArray.map diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0922f4ac6f71d..71bf72fe76f53 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -578,11 +578,9 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. + Return a tuple describing the counts for each value. 
Parameters ---------- @@ -591,15 +589,15 @@ def value_counts(self, dropna=True): Returns ------- - counts : Series + index : IntegerArray + values : ndarray[int64] See Also -------- Series.value_counts - """ - from pandas import Index, Series + from pandas import Index # compute counts on the data with no nans data = self._data[~self._mask] @@ -624,8 +622,7 @@ def value_counts(self, dropna=True): ), dtype=object, ) - - return Series(array, index=index) + return index, array def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cea059fb22be1..27bd2ba75d3fb 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -800,6 +800,14 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): return self._shallow_copy(left_take, right_take) + def _value_counts(self, dropna=True): + # TODO: implement this is a non-naive way! + + arg = self._values_for_factorize()[0] + result = value_counts(arg, dropna=dropna) + return result.index, result.values + + ''' def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. @@ -819,6 +827,7 @@ def value_counts(self, dropna=True): """ # TODO: implement this is a non-naive way! 
return value_counts(np.asarray(self), dropna=dropna) + ''' # Formatting diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index adf10642f337a..504f0f635ea3f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -34,7 +34,7 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray +from pandas.core.dtypes.generic import ABCSeries, ABCSparseArray from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos @@ -696,20 +696,19 @@ def factorize(self, na_sentinel=-1): uniques = SparseArray(uniques, dtype=self.dtype) return codes, uniques - def value_counts(self, dropna=True): + def _value_counts(self, dropna=True): """ - Returns a Series containing counts of unique values. + Return an array of unique values and an array of their counts. Parameters ---------- - dropna : boolean, default True - Don't include counts of NaN, even if NaN is in sp_values. 
+ dropna : bool, default True Returns ------- - counts : Series + ndarray + ndarray[int64] """ - from pandas import Index, Series keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps @@ -728,10 +727,7 @@ def value_counts(self, dropna=True): keys = np.insert(keys, 0, self.fill_value) counts = np.insert(counts, 0, fcounts) - if not isinstance(keys, ABCIndexClass): - keys = Index(keys) - result = Series(counts, index=keys) - return result + return keys, counts # -------- # Indexing diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index de254f662bb32..49d0ad057cbf9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -261,10 +261,11 @@ def astype(self, dtype, copy=True): def _reduce(self, name, skipna=True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def value_counts(self, dropna=False): + def _value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + result = value_counts(self._ndarray, dropna=dropna) + return result.index, result.values # Overrride parent because we have different return types. 
@classmethod diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index bca629ae32270..4428d8027bcfe 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -214,13 +214,16 @@ def test_value_counts_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti).repeat([4, 3]) - result = arr.value_counts() + index, values = arr._value_counts() + result = pd.Series(values, index=index) # Note: not tm.assert_index_equal, since `freq`s do not match assert result.index.equals(dti) arr[-2] = pd.NaT - result = arr.value_counts() + index, values = arr._value_counts() + result = pd.Series(values, index=index) + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected) From 2a469d79cabb87f9d70cc01bccd8e0c4a1b92673 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Jan 2020 19:14:58 -0800 Subject: [PATCH 2/5] remove docstring'd out code --- pandas/core/arrays/interval.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 27bd2ba75d3fb..ea4cd4ba630c5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -807,28 +807,6 @@ def _value_counts(self, dropna=True): result = value_counts(arg, dropna=dropna) return result.index, result.values - ''' - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each interval. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - # TODO: implement this is a non-naive way! 
- return value_counts(np.asarray(self), dropna=dropna) - ''' - # Formatting def _format_data(self): From e785b7eaa7ce5e6f340f39e688041f12c99b4ec6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 3 Jan 2020 20:27:04 -0800 Subject: [PATCH 3/5] troubleshoot 32 bit build --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a38acdc460868..d351a26c6b256 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1444,7 +1444,7 @@ def _value_counts(self, dropna=True): Series.value_counts """ - code = self._values_for_factorize()[0] + code = self._codes mask = 0 <= code clean = mask.all() From 614069019b064cce6eb7da26d8a3d1a84215216f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Jan 2020 08:08:52 -0800 Subject: [PATCH 4/5] restore cast --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d351a26c6b256..d0cdcdd48763d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1460,7 +1460,7 @@ def _value_counts(self, dropna=True): ix = np.append(ix, -1) index = self._constructor(ix, dtype=self.dtype, fastpath=True) - return index, count + return index, count.astype(np.int64) def _internal_get_values(self): """ From 18571563779f6ea9bbbaa4b34bde6103d0919a31 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jan 2020 08:38:56 -0800 Subject: [PATCH 5/5] mypy fixup --- pandas/core/arrays/datetimelike.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 814e5dddd756c..3ab6bcb7759a9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -702,7 +702,8 @@ def _value_counts(self, dropna: bool = False): result = 
value_counts(arg, sort=False, dropna=False) freq = self.freq if is_period_dtype(self) else None - new_index = type(self)(result.index, dtype=self.dtype, freq=freq) + idx = result.index + new_index = type(self)(idx, dtype=self.dtype, freq=freq) # type: ignore counts = result.values return new_index, counts