From 03ca01d30515c0942ea1e2550aa91db370e56633 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 Feb 2023 12:23:49 -0800
Subject: [PATCH 1/4] API: ArrowExtensionArray.value_counts returns pyarrow.int64 type

---
 doc/source/whatsnew/v2.0.0.rst       |  1 +
 pandas/core/algorithms.py            |  3 +++
 pandas/core/arrays/arrow/array.py    |  5 ++---
 pandas/tests/extension/test_arrow.py | 32 ++++++++++++++++++----------
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index c0082b451c95d..33fb6034bf63f 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -815,6 +815,7 @@ Other API changes
 - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`)
 - :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
 - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
+- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)
 
 .. note::
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 42c513aaf5aa6..c82b47867fbb3 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -873,6 +873,9 @@ def value_counts(
             result.name = name
             result.index.name = index_name
             counts = result._values
+            if not isinstance(counts, np.ndarray):
+                # e.g. ArrowExtensionArray
+                counts = np.asarray(counts)
 
     elif isinstance(values, ABCMultiIndex):
         # GH49558
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index eeb252b10b1ea..a3302e5e207d2 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -990,12 +990,11 @@ def value_counts(self, dropna: bool = True) -> Series:
         if pa.types.is_duration(pa_type):
             values = values.cast(pa_type)
 
-        # No missing values so we can adhere to the interface and return a numpy array.
-        counts = np.array(counts)
+        counts = type(self)(counts)
 
         index = Index(type(self)(values))
 
-        return Series(counts, index=index, name="count").astype("Int64")
+        return Series(counts, index=index, name="count")
 
     @classmethod
     def _concat_same_type(
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 705e9d55c06e7..d938b039dd60e 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -758,19 +758,29 @@ def test_diff(self, data, periods, request):
             )
         super().test_diff(data, periods)
 
-    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
-    @pytest.mark.parametrize("dropna", [True, False])
-    def test_value_counts(self, all_data, dropna, request):
-        super().test_value_counts(all_data, dropna)
+    def test_value_counts_returns_pyarrow_int64(self, data):
+        # GH 51462
+        data = data[:10]
+        result = data.value_counts()
+        assert result.dtype == ArrowDtype(pa.int64())
 
     def test_value_counts_with_normalize(self, data, request):
-        pa_dtype = data.dtype.pyarrow_dtype
-        with tm.maybe_produces_warning(
-            PerformanceWarning,
-            pa_version_under7p0 and not pa.types.is_duration(pa_dtype),
-            check_stacklevel=False,
-        ):
-            super().test_value_counts_with_normalize(data)
+        data = data[:10].unique()
+        values = np.array(data[~data.isna()])
+        ser = pd.Series(data, dtype=data.dtype)
+
+        result = ser.value_counts(normalize=True).sort_index()
+
+        if not isinstance(data, pd.Categorical):
+            expected = pd.Series(
+                [1 / len(values)] * len(values), index=result.index, name="proportion"
+            )
+        else:
+            expected = pd.Series(0.0, index=result.index, name="proportion")
+            expected[result > 0] = 1 / len(values)
+        expected = expected.astype("double[pyarrow]")
+
+        self.assert_series_equal(result, expected)
 
     def test_argmin_argmax(
         self, data_for_sorting, data_missing_for_sorting, na_value, request

From 608907c5de4a25fe30c0b236baafd138d8276d29 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 Feb 2023 12:28:15 -0800
Subject: [PATCH 2/4] Address arrowstringarray

---
 pandas/core/arrays/arrow/array.py     |  2 +-
 pandas/tests/extension/test_string.py | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index a3302e5e207d2..3c6f6e0c95abc 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -990,7 +990,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         if pa.types.is_duration(pa_type):
             values = values.cast(pa_type)
 
-        counts = type(self)(counts)
+        counts = ArrowExtensionArray(counts)
 
         index = Index(type(self)(values))
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index ee855bb1cde8c..87f28fd684372 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -256,9 +256,26 @@ def test_value_counts(self, all_data, dropna, request):
 
         self.assert_series_equal(result, expected)
 
-    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
     def test_value_counts_with_normalize(self, data):
-        super().test_value_counts_with_normalize(data)
+        data = data[:10].unique()
+        values = np.array(data[~data.isna()])
+        ser = pd.Series(data, dtype=data.dtype)
+
+        result = ser.value_counts(normalize=True).sort_index()
+
+        if not isinstance(data, pd.Categorical):
+            expected = pd.Series(
+                [1 / len(values)] * len(values), index=result.index, name="proportion"
+            )
+        else:
+            expected = pd.Series(0.0, index=result.index, name="proportion")
+            expected[result > 0] = 1 / len(values)
+        if getattr(data.dtype, "storage", "") == "pyarrow":
+            expected = expected.astype("double[pyarrow]")
+        else:
+            expected = expected.astype("Float64")
+
+        self.assert_series_equal(result, expected)
 
     def test_argsort_missing_array(self, data_missing_for_sorting):
         with tm.maybe_produces_warning(

From b940622110c500b65345a166f77692495ef0d8f0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 Feb 2023 13:23:50 -0800
Subject: [PATCH 3/4] Address other tests

---
 pandas/tests/arrays/string_/test_string.py | 14 +++++++++++---
 pandas/tests/base/test_value_counts.py     | 14 ++++++++++----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index adb86b568e891..0b41abc3b3a73 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
 
 
 def test_value_counts_na(dtype):
+    if getattr(dtype, "storage", "") == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
+    else:
+        exp_dtype = "Int64"
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count")
+    expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count")
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
+    expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize(dtype):
+    if getattr(dtype, "storage", "") == "pyarrow":
+        exp_dtype = "double[pyarrow]"
+    else:
+        exp_dtype = "Float64"
     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
     result = ser.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
+    expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 4f5e8adbcdf93..1ce27dcb88f2a 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -42,8 +42,11 @@ def test_value_counts(index_or_series_obj):
     expected.index.name = obj.name
 
     if not isinstance(result.dtype, np.dtype):
-        # i.e IntegerDtype
-        expected = expected.astype("Int64")
+        if getattr(obj.dtype, "storage", "") == "pyarrow":
+            expected = expected.astype("int64[pyarrow]")
+        else:
+            # i.e IntegerDtype
+            expected = expected.astype("Int64")
 
     # TODO(GH#32514): Order of entries with the same count is inconsistent
     # on CI (gh-32449)
@@ -109,8 +112,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
         result = result.sort_index()
 
     if not isinstance(result.dtype, np.dtype):
-        # i.e IntegerDtype
-        expected = expected.astype("Int64")
+        if getattr(obj.dtype, "storage", "") == "pyarrow":
+            expected = expected.astype("int64[pyarrow]")
+        else:
+            # i.e IntegerDtype
+            expected = expected.astype("Int64")
 
     tm.assert_series_equal(result, expected)
     expected[null_obj] = 3

From 5ded922d8c94ab9739966ee80a1da8be0d584f38 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 24 Feb 2023 10:32:40 -0800
Subject: [PATCH 4/4] Further cleanup

---
 pandas/tests/extension/test_string.py | 68 ---------------------------
 1 file changed, 68 deletions(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 93380df1a689f..a2e438b858e59 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -18,7 +18,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under7p0
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -196,73 +195,6 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 
 class TestMethods(base.BaseMethodsTests):
-    def test_argsort(self, data_for_sorting):
-        with tm.maybe_produces_warning(
-            PerformanceWarning,
-            pa_version_under7p0
-            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
-            check_stacklevel=False,
-        ):
-            super().test_argsort(data_for_sorting)
-
-    def test_argsort_missing(self, data_missing_for_sorting):
-        with tm.maybe_produces_warning(
-            PerformanceWarning,
-            pa_version_under7p0
-            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
-            check_stacklevel=False,
-        ):
-            super().test_argsort_missing(data_missing_for_sorting)
-
-    def test_argmin_argmax(
-        self, data_for_sorting, data_missing_for_sorting, na_value, request
-    ):
-        super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)
-
-    @pytest.mark.parametrize(
-        "op_name, skipna, expected",
-        [
-            ("idxmax", True, 0),
-            ("idxmin", True, 2),
-            ("argmax", True, 0),
-            ("argmin", True, 2),
-            ("idxmax", False, np.nan),
-            ("idxmin", False, np.nan),
-            ("argmax", False, -1),
-            ("argmin", False, -1),
-        ],
-    )
-    def test_argreduce_series(
-        self, data_missing_for_sorting, op_name, skipna, expected, request
-    ):
-        super().test_argreduce_series(
-            data_missing_for_sorting, op_name, skipna, expected
-        )
-
-    @pytest.mark.parametrize("dropna", [True, False])
-    def test_value_counts(self, all_data, dropna, request):
-        all_data = all_data[:10]
-        if dropna:
-            other = all_data[~all_data.isna()]
-        else:
-            other = all_data
-        with tm.maybe_produces_warning(
-            PerformanceWarning,
-            pa_version_under7p0
-            and getattr(all_data.dtype, "storage", "") == "pyarrow"
-            and not (dropna and "data_missing" in request.node.nodeid),
-        ):
-            result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
-        with tm.maybe_produces_warning(
-            PerformanceWarning,
-            pa_version_under7p0
-            and getattr(other.dtype, "storage", "") == "pyarrow"
-            and not (dropna and "data_missing" in request.node.nodeid),
-        ):
-            expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
-
-        self.assert_series_equal(result, expected)
-
     def test_value_counts_with_normalize(self, data):
         data = data[:10].unique()
         values = np.array(data[~data.isna()])