Skip to content

API: value_counts consistently return int64 dtype #44692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits
4 changes: 0 additions & 4 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,6 @@ def value_counts(self, dropna: bool = True) -> Series:
Index,
Series,
)
from pandas.arrays import IntegerArray

# compute counts on the data with no nans
data = self._data[~self._mask]
Expand All @@ -680,9 +679,6 @@ def value_counts(self, dropna: bool = True) -> Series:
dtype=object,
)

mask = np.zeros(len(counts), dtype="bool")
counts = IntegerArray(counts, mask)

return Series(counts, index=index)

@doc(ExtensionArray.equals)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
def value_counts(self, dropna: bool = True):
from pandas import value_counts

return value_counts(self._ndarray, dropna=dropna).astype("Int64")
return value_counts(self._ndarray, dropna=dropna)

def memory_usage(self, deep: bool = False) -> int:
result = self._ndarray.nbytes
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ def value_counts(self, dropna: bool = True) -> Series:
# Index cannot hold ExtensionArrays yet
index = Index(type(self)(values)).astype(object)

return Series(counts, index=index).astype("Int64")
return Series(counts, index=index)

def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA])
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
expected = pd.Series([1, 1], index=[True, False])
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([True, False, pd.NA], dtype="boolean")
result = ser.value_counts(normalize=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
expected = pd.Series([1, 1], index=[True, False]) / 2
tm.assert_series_equal(result, expected)


Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/arrays/floating/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,26 @@ def test_stat_method(pandasmethname, kwargs):
def test_value_counts_na():
arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA])
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64")
expected = pd.Series([2, 1], index=[0.1, 0.2])
tm.assert_series_equal(result, expected)


def test_value_counts_empty():
ser = pd.Series([], dtype="Float64")
result = ser.value_counts()
idx = pd.Index([], dtype="object")
expected = pd.Series([], index=idx, dtype="Int64")
expected = pd.Series([], index=idx, dtype="int64")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=[0.1, 0.2]) / 3
tm.assert_series_equal(result, expected)


Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,11 @@ def test_stat_method(pandasmethname, kwargs):
def test_value_counts_na():
arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA])
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
expected = pd.Series([2, 1], index=[1, 2])
tm.assert_series_equal(result, expected)


Expand All @@ -122,15 +122,15 @@ def test_value_counts_empty():
result = ser.value_counts()
# TODO: The dtype of the index seems wrong (it's int64 for non-empty)
idx = pd.Index([], dtype="object")
expected = pd.Series([], index=idx, dtype="Int64")
expected = pd.Series([], index=idx, dtype="int64")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
# GH 33172
ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=[1, 2]) / 3
tm.assert_series_equal(result, expected)


Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,18 +466,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
def test_value_counts_na(dtype):
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA])
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64")
expected = pd.Series([2, 1], index=["a", "b"])
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype):
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
expected = pd.Series([2, 1], index=["a", "b"]) / 3
tm.assert_series_equal(result, expected)


Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,6 @@ def test_value_counts(self, all_data, dropna, request):

tm.assert_series_equal(result, expected)

def test_value_counts_with_normalize(self, data):
return super().test_value_counts_with_normalize(data)


class TestCasting(base.BaseCastingTests):
pass
Expand Down
8 changes: 0 additions & 8 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,14 +216,6 @@ def test_searchsorted(self, data_for_sorting, as_series):
sorter = np.array([1, 0])
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
pass

def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
# override because there are only 2 unique values

Expand Down
18 changes: 1 addition & 17 deletions pandas/tests/extension/test_floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,23 +172,7 @@ class TestMissing(base.BaseMissingTests):


class TestMethods(base.BaseMethodsTests):
@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts(self, all_data, dropna):
all_data = all_data[:10]
if dropna:
other = np.array(all_data[~all_data.isna()])
else:
other = all_data

result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
expected.index = expected.index.astype(all_data.dtype)

self.assert_series_equal(result, expected)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
pass
pass


class TestCasting(base.BaseCastingTests):
Expand Down
18 changes: 1 addition & 17 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,23 +195,7 @@ class TestMissing(base.BaseMissingTests):


class TestMethods(base.BaseMethodsTests):
@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts(self, all_data, dropna):
all_data = all_data[:10]
if dropna:
other = np.array(all_data[~all_data.isna()])
else:
other = all_data

result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
expected.index = expected.index.astype(all_data.dtype)

self.assert_series_equal(result, expected)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
pass
pass


class TestCasting(base.BaseCastingTests):
Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
@pytest.mark.skip(reason="returns nullable")
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)

@pytest.mark.skip(reason="returns nullable")
def test_value_counts_with_normalize(self, data):
pass
pass


class TestCasting(base.BaseCastingTests):
Expand Down