Skip to content

Commit 8bdd7b1

Browse files
BUG: BooleanArray.value_counts dropna (#30824)
* BUG: BooleanArray.value_counts dropna Closes #30685
1 parent feaa503 commit 8bdd7b1

File tree

11 files changed

+107
-98
lines changed

11 files changed

+107
-98
lines changed

doc/source/whatsnew/v1.0.0.rst

+18
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,24 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
411411
412412
a.to_numpy(dtype="float", na_value=np.nan)
413413
414+
**value_counts returns a nullable integer dtype**
415+
416+
:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable
417+
integer dtype for the values.
418+
419+
*pandas 0.25.x*
420+
421+
.. code-block:: python
422+
423+
>>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
424+
dtype('int64')
425+
426+
*pandas 1.0.0*
427+
428+
.. ipython:: python
429+
430+
pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
431+
414432
See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA`
415433
and :attr:`numpy.nan`.
416434

pandas/core/arrays/boolean.py

-46
Original file line numberDiff line numberDiff line change
@@ -410,52 +410,6 @@ def astype(self, dtype, copy=True):
410410
data = self.to_numpy(na_value=na_value)
411411
return astype_nansafe(data, dtype, copy=False)
412412

413-
def value_counts(self, dropna=True):
414-
"""
415-
Returns a Series containing counts of each category.
416-
417-
Every category will have an entry, even those with a count of 0.
418-
419-
Parameters
420-
----------
421-
dropna : bool, default True
422-
Don't include counts of NaN.
423-
424-
Returns
425-
-------
426-
counts : Series
427-
428-
See Also
429-
--------
430-
Series.value_counts
431-
432-
"""
433-
434-
from pandas import Index, Series
435-
436-
# compute counts on the data with no nans
437-
data = self._data[~self._mask]
438-
value_counts = Index(data).value_counts()
439-
array = value_counts.values
440-
441-
# TODO(extension)
442-
# if we have allow Index to hold an ExtensionArray
443-
# this is easier
444-
index = value_counts.index.values.astype(bool).astype(object)
445-
446-
# if we want nans, count the mask
447-
if not dropna:
448-
449-
# TODO(extension)
450-
# appending to an Index *always* infers
451-
# w/o passing the dtype
452-
array = np.append(array, [self._mask.sum()])
453-
index = Index(
454-
np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object
455-
)
456-
457-
return Series(array, index=index)
458-
459413
def _values_for_argsort(self) -> np.ndarray:
460414
"""
461415
Return values for sorting.

pandas/core/arrays/integer.py

-49
Original file line numberDiff line numberDiff line change
@@ -467,55 +467,6 @@ def _ndarray_values(self) -> np.ndarray:
467467
"""
468468
return self._data
469469

470-
def value_counts(self, dropna=True):
471-
"""
472-
Returns a Series containing counts of each category.
473-
474-
Every category will have an entry, even those with a count of 0.
475-
476-
Parameters
477-
----------
478-
dropna : bool, default True
479-
Don't include counts of NaN.
480-
481-
Returns
482-
-------
483-
counts : Series
484-
485-
See Also
486-
--------
487-
Series.value_counts
488-
489-
"""
490-
491-
from pandas import Index, Series
492-
493-
# compute counts on the data with no nans
494-
data = self._data[~self._mask]
495-
value_counts = Index(data).value_counts()
496-
array = value_counts.values
497-
498-
# TODO(extension)
499-
# if we have allow Index to hold an ExtensionArray
500-
# this is easier
501-
index = value_counts.index.astype(object)
502-
503-
# if we want nans, count the mask
504-
if not dropna:
505-
506-
# TODO(extension)
507-
# appending to an Index *always* infers
508-
# w/o passing the dtype
509-
array = np.append(array, [self._mask.sum()])
510-
index = Index(
511-
np.concatenate(
512-
[index.values, np.array([self.dtype.na_value], dtype=object)]
513-
),
514-
dtype=object,
515-
)
516-
517-
return Series(array, index=index)
518-
519470
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
520471
# TODO: https://github.com/pandas-dev/pandas/issues/30037
521472
# use masked algorithms, rather than object-dtype / np.nan.

pandas/core/arrays/masked.py

+47
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,50 @@ def copy(self):
201201
data = data.copy()
202202
mask = mask.copy()
203203
return type(self)(data, mask, copy=False)
204+
205+
def value_counts(self, dropna=True):
206+
"""
207+
Returns a Series containing counts of each unique value.
208+
209+
Parameters
210+
----------
211+
dropna : bool, default True
212+
Don't include counts of missing values.
213+
214+
Returns
215+
-------
216+
counts : Series
217+
218+
See Also
219+
--------
220+
Series.value_counts
221+
"""
222+
from pandas import Index, Series
223+
from pandas.arrays import IntegerArray
224+
225+
# compute counts on the data with no nans
226+
data = self._data[~self._mask]
227+
value_counts = Index(data).value_counts()
228+
229+
# TODO(extension)
230+
# if we allow Index to hold an ExtensionArray
231+
# this is easier
232+
index = value_counts.index.values.astype(object)
233+
234+
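# when missing values are dropped the counts are used as-is; otherwise
# append the number of masked (missing) entries as a final count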
235+
if dropna:
236+
counts = value_counts.values
237+
else:
238+
counts = np.empty(len(value_counts) + 1, dtype="int64")
239+
counts[:-1] = value_counts
240+
counts[-1] = self._mask.sum()
241+
242+
index = Index(
243+
np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
244+
dtype=object,
245+
)
246+
247+
mask = np.zeros(len(counts), dtype="bool")
248+
counts = IntegerArray(counts, mask)
249+
250+
return Series(counts, index=index)

pandas/core/arrays/string_.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def _reduce(self, name, skipna=True, **kwargs):
253253
def value_counts(self, dropna=False):
254254
from pandas import value_counts
255255

256-
return value_counts(self._ndarray, dropna=dropna)
256+
return value_counts(self._ndarray, dropna=dropna).astype("Int64")
257257

258258
# Override parent because we have different return types.
259259
@classmethod

pandas/tests/arrays/string_/test_string.py

+11
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,14 @@ def test_arrow_roundtrip():
239239
tm.assert_frame_equal(result, df)
240240
# ensure the missing value is represented by NA and not np.nan or None
241241
assert result.loc[2, "a"] is pd.NA
242+
243+
244+
def test_value_counts_na():
245+
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
246+
result = arr.value_counts(dropna=False)
247+
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
248+
tm.assert_series_equal(result, expected)
249+
250+
result = arr.value_counts(dropna=True)
251+
expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64")
252+
tm.assert_series_equal(result, expected)

pandas/tests/arrays/test_boolean.py

+11
Original file line numberDiff line numberDiff line change
@@ -868,3 +868,14 @@ def test_arrow_roundtrip():
868868
result = table.to_pandas()
869869
assert isinstance(result["a"].dtype, pd.BooleanDtype)
870870
tm.assert_frame_equal(result, df)
871+
872+
873+
def test_value_counts_na():
874+
arr = pd.array([True, False, pd.NA], dtype="boolean")
875+
result = arr.value_counts(dropna=False)
876+
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
877+
tm.assert_series_equal(result, expected)
878+
879+
result = arr.value_counts(dropna=True)
880+
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
881+
tm.assert_series_equal(result, expected)

pandas/tests/arrays/test_integer.py

+11
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,17 @@ def test_stat_method(pandasmethname, kwargs):
10391039
assert expected == result
10401040

10411041

1042+
def test_value_counts_na():
1043+
arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
1044+
result = arr.value_counts(dropna=False)
1045+
expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
1046+
tm.assert_series_equal(result, expected)
1047+
1048+
result = arr.value_counts(dropna=True)
1049+
expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
1050+
tm.assert_series_equal(result, expected)
1051+
1052+
10421053
# TODO(jreback) - these need testing / are broken
10431054

10441055
# shift

pandas/tests/extension/test_boolean.py

+4
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ def test_searchsorted(self, data_for_sorting, as_series):
226226
sorter = np.array([1, 0])
227227
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
228228

229+
@pytest.mark.skip(reason="uses nullable integer")
230+
def test_value_counts(self, all_data, dropna):
231+
return super().test_value_counts(all_data, dropna)
232+
229233

230234
class TestCasting(base.BaseCastingTests):
231235
pass

pandas/tests/extension/test_integer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class TestMissing(base.BaseMissingTests):
209209

210210

211211
class TestMethods(base.BaseMethodsTests):
212-
@pytest.mark.parametrize("dropna", [True, False])
212+
@pytest.mark.skip(reason="uses nullable integer")
213213
def test_value_counts(self, all_data, dropna):
214214
all_data = all_data[:10]
215215
if dropna:

pandas/tests/extension/test_string.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests):
8181

8282

8383
class TestMethods(base.BaseMethodsTests):
84-
pass
84+
@pytest.mark.skip(reason="returns nullable")
85+
def test_value_counts(self, all_data, dropna):
86+
return super().test_value_counts(all_data, dropna)
8587

8688

8789
class TestCasting(base.BaseCastingTests):

0 commit comments

Comments
 (0)