diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 587010453db5a..95f84d6c85157 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -818,6 +818,7 @@ Other API changes - :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`) - Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`) - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`) +- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`) .. note:: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42c513aaf5aa6..c82b47867fbb3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -873,6 +873,9 @@ def value_counts( result.name = name result.index.name = index_name counts = result._values + if not isinstance(counts, np.ndarray): + # e.g. ArrowExtensionArray + counts = np.asarray(counts) elif isinstance(values, ABCMultiIndex): # GH49558 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4facc194978d5..76723a8973e27 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -983,12 +983,11 @@ def value_counts(self, dropna: bool = True) -> Series: if pa.types.is_duration(pa_type): values = values.cast(pa_type) - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) + counts = ArrowExtensionArray(counts) index = Index(type(self)(values)) - return Series(counts, index=index, name="count").astype("Int64") + return Series(counts, index=index, name="count") @classmethod def _concat_same_type( diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index adb86b568e891..0b41abc3b3a73 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): + if getattr(dtype, "storage", "") == "pyarrow": + exp_dtype = "int64[pyarrow]" + else: + exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count") + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") + expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(dtype): + if getattr(dtype, "storage", "") == "pyarrow": + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 97217430007eb..89f3c005c52f0 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -39,8 +39,11 @@ def test_value_counts(index_or_series_obj): expected.index.name = obj.name if not isinstance(result.dtype, np.dtype): - # i.e IntegerDtype - expected = expected.astype("Int64") + if getattr(obj.dtype, "storage", "") == "pyarrow": + expected = expected.astype("int64[pyarrow]") + else: + # i.e IntegerDtype + expected = expected.astype("Int64") # TODO(GH#32514): Order of entries with the same count is inconsistent # on CI (gh-32449) @@ -90,8 +93,11 @@ def test_value_counts_null(null_obj, index_or_series_obj): result = result.sort_index() if not isinstance(result.dtype, np.dtype): - # i.e IntegerDtype - expected = expected.astype("Int64") + if getattr(obj.dtype, "storage", "") == "pyarrow": + expected = expected.astype("int64[pyarrow]") + else: + # i.e IntegerDtype + expected = expected.astype("Int64") tm.assert_series_equal(result, expected) expected[null_obj] = 3 diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 59766c7b2c647..a838dc6550f78 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -785,9 +785,25 @@ def test_diff(self, data, periods, request): ) super().test_diff(data, periods) - @pytest.mark.parametrize("dropna", [True, False]) - def test_value_counts(self, all_data, dropna, request): - super().test_value_counts(all_data, dropna) + def test_value_counts_returns_pyarrow_int64(self, data): + # GH 51462 + data = data[:10] + result = data.value_counts() + assert result.dtype == ArrowDtype(pa.int64()) + + def test_value_counts_with_normalize(self, data, request): + data = data[:10].unique() + values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) + + result = ser.value_counts(normalize=True).sort_index() + + expected = pd.Series( + [1 / len(values)] * len(values), index=result.index, name="proportion" + ) + expected = expected.astype("double[pyarrow]") + + self.assert_series_equal(result, expected) def test_argmin_argmax( self, data_for_sorting, data_missing_for_sorting, na_value, request diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4fb98d67414e7..a2e438b858e59 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,7 +18,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 from pandas.errors import PerformanceWarning import pandas as pd @@ -196,70 +195,20 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): class TestMethods(base.BaseMethodsTests): - def test_argsort(self, data_for_sorting): - with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 - and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow", - check_stacklevel=False, - ): - super().test_argsort(data_for_sorting) + def test_value_counts_with_normalize(self, data): + data = data[:10].unique() + values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) - def test_argsort_missing(self, data_missing_for_sorting): - with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 - and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow", - check_stacklevel=False, - ): - super().test_argsort_missing(data_missing_for_sorting) - - def test_argmin_argmax( - self, data_for_sorting, data_missing_for_sorting, na_value, request - ): - super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) - - @pytest.mark.parametrize( - "op_name, skipna, expected", - [ - ("idxmax", True, 0), - ("idxmin", True, 2), - ("argmax", True, 0), - ("argmin", True, 2), - ("idxmax", False, np.nan), - ("idxmin", False, np.nan), - ("argmax", False, -1), - ("argmin", False, -1), - ], - ) - def test_argreduce_series( - self, data_missing_for_sorting, op_name, skipna, expected, request - ): - super().test_argreduce_series( - data_missing_for_sorting, op_name, skipna, expected - ) + result = ser.value_counts(normalize=True).sort_index() - @pytest.mark.parametrize("dropna", [True, False]) - def test_value_counts(self, all_data, dropna, request): - all_data = all_data[:10] - if dropna: - other = all_data[~all_data.isna()] + expected = pd.Series( + [1 / len(values)] * len(values), index=result.index, name="proportion" + ) + if getattr(data.dtype, "storage", "") == "pyarrow": + expected = expected.astype("double[pyarrow]") else: - other = all_data - with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 - and getattr(all_data.dtype, "storage", "") == "pyarrow" - and not (dropna and "data_missing" in request.node.nodeid), - ): - result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 - and getattr(other.dtype, "storage", "") == "pyarrow" - and not (dropna and "data_missing" in request.node.nodeid), - ): - expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + expected = expected.astype("Float64") self.assert_series_equal(result, expected)