From 4405a53863b5c8a7daaef997831a5638e1602d0d Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 30 Nov 2021 11:37:12 -0800
Subject: [PATCH 1/2] API: value_counts consistently return int64 dtype

---
 pandas/core/arrays/masked.py                   |  4 ----
 pandas/core/arrays/string_.py                  |  2 +-
 pandas/core/arrays/string_arrow.py             |  2 +-
 pandas/tests/arrays/boolean/test_function.py   |  6 +++---
 pandas/tests/arrays/floating/test_function.py  |  8 ++++----
 pandas/tests/arrays/integer/test_function.py   |  8 ++++----
 pandas/tests/arrays/string_/test_string.py     |  6 +++---
 pandas/tests/extension/decimal/test_decimal.py |  3 ---
 pandas/tests/extension/test_boolean.py         |  8 --------
 pandas/tests/extension/test_floating.py        | 18 +-----------------
 pandas/tests/extension/test_integer.py         | 18 +-----------------
 pandas/tests/extension/test_string.py          |  8 +-------
 12 files changed, 19 insertions(+), 72 deletions(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index cc61fb4d93ffc..32844ae929c46 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -656,7 +656,6 @@ def value_counts(self, dropna: bool = True) -> Series:
             Index,
             Series,
         )
-        from pandas.arrays import IntegerArray
 
         # compute counts on the data with no nans
         data = self._data[~self._mask]
@@ -680,9 +679,6 @@ def value_counts(self, dropna: bool = True) -> Series:
                 dtype=object,
             )
 
-        mask = np.zeros(len(counts), dtype="bool")
-        counts = IntegerArray(counts, mask)
-
         return Series(counts, index=index)
 
     @doc(ExtensionArray.equals)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index df71501d55b20..3dcd9738ee532 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -470,7 +470,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
     def value_counts(self, dropna: bool = True):
         from pandas import value_counts
 
-        return value_counts(self._ndarray, dropna=dropna).astype("Int64")
+        return value_counts(self._ndarray, dropna=dropna)
 
     def memory_usage(self, deep: bool = False) -> int:
         result = self._ndarray.nbytes
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index b1daf0e393ef0..eb64712a45422 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -622,7 +622,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         # Index cannot hold ExtensionArrays yet
         index = Index(type(self)(values)).astype(object)
 
-        return Series(counts, index=index).astype("Int64")
+        return Series(counts, index=index)
 
     def astype(self, dtype, copy=True):
         dtype = pandas_dtype(dtype)
diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
index 2f1a3121cdf5b..6ede1de399911 100644
--- a/pandas/tests/arrays/boolean/test_function.py
+++ b/pandas/tests/arrays/boolean/test_function.py
@@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values):
 def test_value_counts_na():
     arr = pd.array([True, False, pd.NA], dtype="boolean")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
+    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA])
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
+    expected = pd.Series([1, 1], index=[True, False])
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
     ser = pd.Series([True, False, pd.NA], dtype="boolean")
     result = ser.value_counts(normalize=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
+    expected = pd.Series([1, 1], index=[True, False]) / 2
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index ff84116fa1b18..882f0a210424c 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -97,11 +97,11 @@ def test_stat_method(pandasmethname, kwargs):
 def test_value_counts_na():
     arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA])
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64")
+    expected = pd.Series([2, 1], index=[0.1, 0.2])
     tm.assert_series_equal(result, expected)
 
 
@@ -109,14 +109,14 @@ def test_value_counts_empty():
     ser = pd.Series([], dtype="Float64")
     result = ser.value_counts()
     idx = pd.Index([], dtype="object")
-    expected = pd.Series([], index=idx, dtype="Int64")
+    expected = pd.Series([], index=idx, dtype="int64")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
     ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
     result = ser.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
+    expected = pd.Series([2, 1], index=[0.1, 0.2]) / 3
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 3d8c93fbd507f..f4f4d5a2fac02 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -108,11 +108,11 @@ def test_stat_method(pandasmethname, kwargs):
 def test_value_counts_na():
     arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA])
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
+    expected = pd.Series([2, 1], index=[1, 2])
     tm.assert_series_equal(result, expected)
 
 
@@ -122,7 +122,7 @@ def test_value_counts_empty():
     result = ser.value_counts()
     # TODO: The dtype of the index seems wrong (it's int64 for non-empty)
     idx = pd.Index([], dtype="object")
-    expected = pd.Series([], index=idx, dtype="Int64")
+    expected = pd.Series([], index=idx, dtype="int64")
     tm.assert_series_equal(result, expected)
 
 
@@ -130,7 +130,7 @@ def test_value_counts_with_normalize():
     # GH 33172
     ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
     result = ser.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
+    expected = pd.Series([2, 1], index=[1, 2]) / 3
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index c330e959ad5bf..997ac267831a6 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -466,18 +466,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
 def test_value_counts_na(dtype):
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA])
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64")
+    expected = pd.Series([2, 1], index=["a", "b"])
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize(dtype):
     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
     result = ser.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
+    expected = pd.Series([2, 1], index=["a", "b"]) / 3
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 53416b6a3e9db..f969e60e5244f 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -148,9 +148,6 @@ def test_value_counts(self, all_data, dropna, request):
 
         tm.assert_series_equal(result, expected)
 
-    def test_value_counts_with_normalize(self, data):
-        return super().test_value_counts_with_normalize(data)
-
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
index 05455905860d2..2f68477411ec8 100644
--- a/pandas/tests/extension/test_boolean.py
+++ b/pandas/tests/extension/test_boolean.py
@@ -216,14 +216,6 @@ def test_searchsorted(self, data_for_sorting, as_series):
         sorter = np.array([1, 0])
         assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
 
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts(self, all_data, dropna):
-        return super().test_value_counts(all_data, dropna)
-
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts_with_normalize(self, data):
-        pass
-
     def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
         # override because there are only 2 unique values
 
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
index 2b08c5b7be450..da4258c31b393 100644
--- a/pandas/tests/extension/test_floating.py
+++ b/pandas/tests/extension/test_floating.py
@@ -172,23 +172,7 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts(self, all_data, dropna):
-        all_data = all_data[:10]
-        if dropna:
-            other = np.array(all_data[~all_data.isna()])
-        else:
-            other = all_data
-
-        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
-        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
-        expected.index = expected.index.astype(all_data.dtype)
-
-        self.assert_series_equal(result, expected)
-
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts_with_normalize(self, data):
-        pass
+    pass
 
 
 class TestCasting(base.BaseCastingTests):
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
index 7d343aab3c7a0..0eaebc23b9834 100644
--- a/pandas/tests/extension/test_integer.py
+++ b/pandas/tests/extension/test_integer.py
@@ -195,23 +195,7 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts(self, all_data, dropna):
-        all_data = all_data[:10]
-        if dropna:
-            other = np.array(all_data[~all_data.isna()])
-        else:
-            other = all_data
-
-        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
-        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
-        expected.index = expected.index.astype(all_data.dtype)
-
-        self.assert_series_equal(result, expected)
-
-    @pytest.mark.skip(reason="uses nullable integer")
-    def test_value_counts_with_normalize(self, data):
-        pass
+    pass
 
 
 class TestCasting(base.BaseCastingTests):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 5049116a9320e..64793b098d57f 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -152,13 +152,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="returns nullable")
-    def test_value_counts(self, all_data, dropna):
-        return super().test_value_counts(all_data, dropna)
-
-    @pytest.mark.skip(reason="returns nullable")
-    def test_value_counts_with_normalize(self, data):
-        pass
+    pass
 
 
 class TestCasting(base.BaseCastingTests):

From cdae1c522b9ed825b488f2cedba3f0cd35a86e21 Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 14 Feb 2022 09:59:40 -0800
Subject: [PATCH 2/2] update tests

---
 pandas/tests/extension/test_floating.py | 1 -
 pandas/tests/extension/test_integer.py  | 1 -
 pandas/tests/extension/test_string.py   | 6 +-----
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
index ce49567c50594..f6ffe51fd93d4 100644
--- a/pandas/tests/extension/test_floating.py
+++ b/pandas/tests/extension/test_floating.py
@@ -183,7 +183,6 @@ def test_value_counts(self, all_data, dropna):
 
         result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
         expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
-        expected = expected.astype("Int64")
         expected.index = expected.index.astype(all_data.dtype)
 
         self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
index 83745cc3f66d8..c4b95d8da01e8 100644
--- a/pandas/tests/extension/test_integer.py
+++ b/pandas/tests/extension/test_integer.py
@@ -206,7 +206,6 @@ def test_value_counts(self, all_data, dropna):
 
         result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
         expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
-        expected = expected.astype("Int64")
         expected.index = expected.index.astype(all_data.dtype)
 
         self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 4256142556894..2484433449feb 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -158,14 +158,10 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="returns nullable")
+    @pytest.mark.xfail(reason="returns nullable")
     def test_value_counts(self, all_data, dropna):
         return super().test_value_counts(all_data, dropna)
 
-    @pytest.mark.skip(reason="returns nullable")
-    def test_value_counts_with_normalize(self, data):
-        pass
-
 
 class TestCasting(base.BaseCastingTests):
     pass