Skip to content

API: value_counts consistently return int64 dtype #44692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Pull request proposing to merge 9 commits (author and branch names are missing from this capture)
4 changes: 0 additions & 4 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,6 @@ def value_counts(self, dropna: bool = True) -> Series:
Index,
Series,
)
from pandas.arrays import IntegerArray

# compute counts on the data with no nans
data = self._data[~self._mask]
Expand All @@ -925,9 +924,6 @@ def value_counts(self, dropna: bool = True) -> Series:

index = index.astype(self.dtype)

mask = np.zeros(len(counts), dtype="bool")
counts = IntegerArray(counts, mask)

return Series(counts, index=index)

@doc(ExtensionArray.equals)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
def value_counts(self, dropna: bool = True):
from pandas import value_counts

result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
result = value_counts(self._ndarray, dropna=dropna)
result.index = result.index.astype(self.dtype)
return result

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def value_counts(self, dropna: bool = True) -> Series:

index = Index(type(self)(values))

return Series(counts, index=index).astype("Int64")
return Series(counts, index=index)

def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,20 +92,20 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
expected = pd.Series([1, 1, 1], index=arr)
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")
expected = pd.Series([1, 1], index=arr[:-1])
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([True, False, pd.NA], dtype="boolean")
result = ser.value_counts(normalize=True)
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
expected = pd.Series([1, 1], index=ser[:-1]) / 2
assert expected.index.dtype == "boolean"
tm.assert_series_equal(result, expected)

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/arrays/floating/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ def test_value_counts_na():
result = arr.value_counts(dropna=False)
idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
assert idx.dtype == arr.dtype
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64")
expected = pd.Series([2, 1, 1], index=idx)
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64")
expected = pd.Series([2, 1], index=idx[:-1])
tm.assert_series_equal(result, expected)


Expand All @@ -113,14 +113,14 @@ def test_value_counts_empty():
result = ser.value_counts()
idx = pd.Index([], dtype="Float64")
assert idx.dtype == "Float64"
expected = pd.Series([], index=idx, dtype="Int64")
expected = pd.Series([], index=idx, dtype="int64")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=ser[:2]) / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,11 @@ def test_value_counts_na():
result = arr.value_counts(dropna=False)
ex_index = pd.Index([1, 2, pd.NA], dtype="Int64")
assert ex_index.dtype == "Int64"
expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64")
expected = pd.Series([2, 1, 1], index=ex_index)
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
expected = pd.Series([2, 1], index=arr[:2])
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)

Expand All @@ -125,16 +125,16 @@ def test_value_counts_empty():
ser = pd.Series([], dtype="Int64")
result = ser.value_counts()
idx = pd.Index([], dtype=ser.dtype)
expected = pd.Series([], index=idx, dtype="int64")
assert idx.dtype == ser.dtype
expected = pd.Series([], index=idx, dtype="Int64")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
# GH 33172
ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=ser[:2]) / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,18 +481,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
def test_value_counts_na(dtype):
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64")
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]])
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
expected = pd.Series([2, 1], index=arr[:2])
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype):
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=ser[:2]) / 3
tm.assert_series_equal(result, expected)


Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,6 @@ def test_value_counts_with_normalize(self, data):
expected = pd.Series(0.0, index=result.index)
expected[result > 0] = 1 / len(values)

if isinstance(data.dtype, pd.core.dtypes.dtypes.BaseMaskedDtype):
# TODO(GH#44692): avoid special-casing
expected = expected.astype("Float64")

self.assert_series_equal(result, expected)

def test_count(self, data_missing):
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,10 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
@pytest.mark.skip(reason="returns nullable")
@pytest.mark.xfail(reason="returns nullable")
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)

@pytest.mark.skip(reason="returns nullable")
def test_value_counts_with_normalize(self, data):
pass


class TestCasting(base.BaseCastingTests):
pass
Expand Down