From d0214108d0e34e512787b096e2a73c2659cbc860 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 30 Nov 2022 07:40:31 -0500
Subject: [PATCH 1/7] PERF: ArrowExtensionArray.to_numpy

---
 asv_bench/benchmarks/array.py      | 38 ++++++++++++++++++++++++++++++
 pandas/core/arrays/arrow/array.py  | 32 +++++++++++++++++++++++++
 pandas/core/arrays/string_arrow.py | 26 --------------------
 3 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index cb949637ea745..924040ff0648b 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -92,3 +92,41 @@ def time_setitem_slice(self, multiple_chunks):
 
     def time_tolist(self, multiple_chunks):
         self.array.tolist()
+
+
+class ArrowExtensionArray:
+
+    params = [
+        [
+            "boolean[pyarrow]",
+            "float64[pyarrow]",
+            "int64[pyarrow]",
+            "string[pyarrow]",
+            "timestamp[ns][pyarrow]",
+        ],
+        [False, True],
+    ]
+    param_names = ["dtype", "hasna"]
+
+    def setup(self, dtype, hasna):
+        N = 100_000
+        if dtype == "boolean[pyarrow]":
+            data = np.random.choice([True, False], N, replace=True)
+        elif dtype == "float64[pyarrow]":
+            data = np.random.randn(N)
+        elif dtype == "int64[pyarrow]":
+            data = np.arange(N)
+        elif dtype == "string[pyarrow]":
+            data = tm.rands_array(10, N)
+        elif dtype == "timestamp[ns][pyarrow]":
+            data = pd.date_range("2000-01-01", freq="s", periods=N)
+        else:
+            raise NotImplementedError
+
+        arr = pd.array(data, dtype=dtype)
+        if hasna:
+            arr[::2] = pd.NA
+        self.arr = arr
+
+    def time_to_numpy(self, dtype, hasna):
+        self.arr.to_numpy()
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 254ff8894b36c..7e030b00875a8 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -9,11 +9,13 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
     Dtype,
     FillnaOptions,
     Iterator,
+    NpDtype,
     PositionalIndexer,
     SortKind,
     TakeIndexer,
@@ -351,6 +353,10 @@ def __arrow_array__(self, type=None):
         """Convert myself to a pyarrow ChunkedArray."""
         return self._data
 
+    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.to_numpy(dtype=dtype)
+
     def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         return type(self)(pc.invert(self._data))
 
@@ -749,6 +755,32 @@ def take(
                 indices_array[indices_array < 0] += len(self._data)
             return type(self)(self._data.take(indices_array))
 
+    @doc(ExtensionArray.to_numpy)
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+    ) -> np.ndarray:
+        # TODO: copy argument is ignored
+
+        if na_value is lib.no_default:
+            na_value = self.dtype.na_value
+
+        pa_type = self._data.type
+        if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
+            result = np.array(self.tolist(), dtype=np.object_)
+        elif not self._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan):
+            return np.array(self._data, dtype=dtype)
+        else:
+            result = np.array(self._data, dtype=np.object_)
+
+        if self._hasna:
+            result[self.isna()] = na_value
+        if dtype is not None:
+            return result.astype(dtype, copy=False)
+        return result
+
     def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         """
         Compute the ArrowExtensionArray of unique values.
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index b8b1d64d7a093..dc23b16c3a452 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -12,7 +12,6 @@
 )
 from pandas._typing import (
     Dtype,
-    NpDtype,
     Scalar,
     npt,
 )
@@ -151,31 +150,6 @@ def dtype(self) -> StringDtype:  # type: ignore[override]
         """
         return self._dtype
 
-    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-        """Correctly construct numpy arrays when passed to `np.asarray()`."""
-        return self.to_numpy(dtype=dtype)
-
-    def to_numpy(
-        self,
-        dtype: npt.DTypeLike | None = None,
-        copy: bool = False,
-        na_value=lib.no_default,
-    ) -> np.ndarray:
-        """
-        Convert to a NumPy ndarray.
-        """
-        # TODO: copy argument is ignored
-
-        result = np.array(self._data, dtype=dtype)
-        if self._data.null_count > 0:
-            if na_value is lib.no_default:
-                if dtype and np.issubdtype(dtype, np.floating):
-                    return result
-                na_value = self._dtype.na_value
-            mask = self.isna()
-            result[mask] = na_value
-        return result
-
     def insert(self, loc: int, item) -> ArrowStringArray:
         if not isinstance(item, str) and item is not libmissing.NA:
             raise TypeError("Scalar must be NA or str")

From df3c9ced79d0a26ec8ea1778c6df1074ed7ef2fb Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 30 Nov 2022 07:53:24 -0500
Subject: [PATCH 2/7] DOC: add whatsnew entry for ArrowExtensionArray.to_numpy

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 1fb9a81e85a83..56fde8daeedf7 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -603,6 +603,7 @@ Performance improvements
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

From 49d0d20f3f6c71bc2892fafc61c47be4cf463e74 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 30 Nov 2022 19:46:51 -0500
Subject: [PATCH 3/7] fix failing test (float dtype handling in to_numpy/astype)

---
 pandas/core/arrays/arrow/array.py  | 4 +++-
 pandas/core/arrays/string_arrow.py | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 7e030b00875a8..d43692742efd4 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -770,7 +770,9 @@ def to_numpy(
         pa_type = self._data.type
         if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
             result = np.array(self.tolist(), dtype=np.object_)
-        elif not self._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan):
+        elif not self._hasna or (
+            np.issubdtype(float, np.floating) and na_value is np.nan
+        ):
             return np.array(self._data, dtype=dtype)
         else:
             result = np.array(self._data, dtype=np.object_)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index dc23b16c3a452..c79e2f752c5a8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -193,10 +193,11 @@ def astype(self, dtype, copy: bool = True):
             if copy:
                 return self.copy()
             return self
-
         elif isinstance(dtype, NumericDtype):
             data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
             return dtype.__from_arrow__(data)
+        elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
+            return self.to_numpy(dtype=dtype, na_value=np.nan)
 
         return super().astype(dtype, copy=copy)
 

From f425fc035f79d69e0f627e7ad07c0c2d5986d03c Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 7 Dec 2022 07:40:33 -0500
Subject: [PATCH 4/7] default to pyarrow behavior in to_numpy

---
 doc/source/whatsnew/v2.0.0.rst       |  1 +
 pandas/core/arrays/arrow/array.py    | 25 ++++++++++---------------
 pandas/core/arrays/string_arrow.py   | 12 ++++++++++++
 pandas/tests/extension/test_arrow.py | 20 ++++++++++++++++++++
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index e10775b0ae161..efc2bd32ce7e3 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -370,6 +370,7 @@ Other API changes
 - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
 - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
 - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
+- Changed the default behavior of :meth:`.arrays.ArrowExtensionArray.to_numpy` to pyarrow's behavior in terms of numpy dtype and missing value representation for all types except ``duration`` and ``timestamp`` (:issue:`49973`)
 - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
 - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
 - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d43692742efd4..599b2cead2b9c 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -33,6 +33,7 @@
     is_bool_dtype,
     is_integer,
     is_integer_dtype,
+    is_object_dtype,
     is_scalar,
 )
 from pandas.core.dtypes.missing import isna
@@ -762,25 +763,19 @@ def to_numpy(
         copy: bool = False,
         na_value: object = lib.no_default,
     ) -> np.ndarray:
-        # TODO: copy argument is ignored
-
-        if na_value is lib.no_default:
-            na_value = self.dtype.na_value
-
         pa_type = self._data.type
-        if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
-            result = np.array(self.tolist(), dtype=np.object_)
-        elif not self._hasna or (
-            np.issubdtype(float, np.floating) and na_value is np.nan
+        if (
+            is_object_dtype(dtype)
+            or pa.types.is_timestamp(pa_type)
+            or pa.types.is_duration(pa_type)
         ):
-            return np.array(self._data, dtype=dtype)
+            result = np.array(list(self), dtype=dtype)
         else:
-            result = np.array(self._data, dtype=np.object_)
-
-        if self._hasna:
+            result = np.asarray(self._data, dtype=dtype)
+            if copy or na_value is not lib.no_default:
+                result = result.copy()
+        if self._hasna and na_value is not lib.no_default:
             result[self.isna()] = na_value
-        if dtype is not None:
-            return result.astype(dtype, copy=False)
         return result
 
     def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index c79e2f752c5a8..52397122da5a5 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -150,6 +150,18 @@ def dtype(self) -> StringDtype:  # type: ignore[override]
         """
         return self._dtype
 
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value=lib.no_default,
+    ) -> np.ndarray:
+        # TODO: should na_value default to pyarrow's behavior of None (vs. pd.NA)?
+        if na_value is lib.no_default:
+            if not (dtype and np.issubdtype(dtype, np.floating)):
+                na_value = self._dtype.na_value
+        return super().to_numpy(dtype=dtype, copy=copy, na_value=na_value)
+
     def insert(self, loc: int, item) -> ArrowStringArray:
         if not isinstance(item, str) and item is not libmissing.NA:
             raise TypeError("Scalar must be NA or str")
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index e6f1675bb8bc8..e9a3e5e096ee4 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1406,3 +1406,23 @@ def test_astype_from_non_pyarrow(data):
     assert not isinstance(pd_array.dtype, ArrowDtype)
     assert isinstance(result.dtype, ArrowDtype)
     tm.assert_extension_array_equal(result, data)
+
+
+def test_to_numpy_with_defaults_matches_pyarrow(data, request):
+    # GH49973
+    pa_type = data._data.type
+    if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
+        request.node.add_marker(
+            pytest.mark.xfail(
+                raises=AssertionError,
+                reason="numpy array are different",
+            )
+        )
+    result = data.to_numpy()
+
+    # these should be equivalent
+    expected1 = np.array(data._data)
+    expected2 = data._data.to_numpy()
+
+    tm.assert_numpy_array_equal(result, expected1)
+    tm.assert_numpy_array_equal(result, expected2)

From 685d66ac2546521b68a2e23d4130b1b11b5e816a Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 7 Dec 2022 19:00:09 -0500
Subject: [PATCH 5/7] use pd.NA for missing

---
 doc/source/whatsnew/v2.0.0.rst       |  1 -
 pandas/core/arrays/arrow/array.py    |  9 +++++++--
 pandas/core/arrays/string_arrow.py   | 12 ------------
 pandas/tests/extension/test_arrow.py | 23 ++++++++++-------------
 4 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index efc2bd32ce7e3..e10775b0ae161 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -370,7 +370,6 @@ Other API changes
 - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
 - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
 - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
-- Changed the default behavior of :meth:`.arrays.ArrowExtensionArray.to_numpy` to pyarrow's behavior in terms of numpy dtype and missing value representation for all types except ``duration`` and ``timestamp`` (:issue:`49973`)
 - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
 - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
 - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 599b2cead2b9c..d698c5eb11751 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -763,6 +763,11 @@ def to_numpy(
         copy: bool = False,
         na_value: object = lib.no_default,
     ) -> np.ndarray:
+        if dtype is None and self._hasna:
+            dtype = object
+        if na_value is lib.no_default:
+            na_value = self.dtype.na_value
+
         pa_type = self._data.type
         if (
             is_object_dtype(dtype)
@@ -772,9 +777,9 @@ def to_numpy(
             result = np.array(list(self), dtype=dtype)
         else:
             result = np.asarray(self._data, dtype=dtype)
-            if copy or na_value is not lib.no_default:
+            if copy or self._hasna:
                 result = result.copy()
-        if self._hasna and na_value is not lib.no_default:
+        if self._hasna:
             result[self.isna()] = na_value
         return result
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 52397122da5a5..c79e2f752c5a8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -150,18 +150,6 @@ def dtype(self) -> StringDtype:  # type: ignore[override]
         """
         return self._dtype
 
-    def to_numpy(
-        self,
-        dtype: npt.DTypeLike | None = None,
-        copy: bool = False,
-        na_value=lib.no_default,
-    ) -> np.ndarray:
-        # TODO: should na_value default to pyarrow's behavior of None (vs. pd.NA)?
-        if na_value is lib.no_default:
-            if not (dtype and np.issubdtype(dtype, np.floating)):
-                na_value = self._dtype.na_value
-        return super().to_numpy(dtype=dtype, copy=copy, na_value=na_value)
-
     def insert(self, loc: int, item) -> ArrowStringArray:
         if not isinstance(item, str) and item is not libmissing.NA:
             raise TypeError("Scalar must be NA or str")
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index e9a3e5e096ee4..82192b5f0a603 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1408,21 +1408,18 @@ def test_astype_from_non_pyarrow(data):
     tm.assert_extension_array_equal(result, data)
 
 
-def test_to_numpy_with_defaults_matches_pyarrow(data, request):
+def test_to_numpy_with_defaults(data, request):
     # GH49973
+    result = data.to_numpy()
+
     pa_type = data._data.type
     if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
-        request.node.add_marker(
-            pytest.mark.xfail(
-                raises=AssertionError,
-                reason="numpy array are different",
-            )
-        )
-    result = data.to_numpy()
+        expected = np.array(list(data))
+    else:
 
-    # these should be equivalent
-    expected1 = np.array(data._data)
-    expected2 = data._data.to_numpy()
+        expected = np.array(data._data)
+    if data._hasna:
+        expected = expected.astype(object)
+        expected[pd.isna(data)] = pd.NA
 
-    tm.assert_numpy_array_equal(result, expected1)
-    tm.assert_numpy_array_equal(result, expected2)
+    tm.assert_numpy_array_equal(result, expected)

From 2f8e15645b9155292d5affdc25b04a7e1406f27d Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 7 Dec 2022 19:07:46 -0500
Subject: [PATCH 6/7] fix blank-line placement in test_to_numpy_with_defaults

---
 pandas/tests/extension/test_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 82192b5f0a603..dfccd50cd136e 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1416,8 +1416,8 @@ def test_to_numpy_with_defaults(data, request):
     if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
         expected = np.array(list(data))
     else:
-
         expected = np.array(data._data)
+
     if data._hasna:
         expected = expected.astype(object)
         expected[pd.isna(data)] = pd.NA

From d6c3d57f3986b18d9aca54b1bde9ee3d2878e554 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Thu, 15 Dec 2022 19:30:39 -0500
Subject: [PATCH 7/7] remove unused parameter in test

---
 pandas/tests/extension/test_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index dfccd50cd136e..a9577fc9ad8e6 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1408,7 +1408,7 @@ def test_astype_from_non_pyarrow(data):
     tm.assert_extension_array_equal(result, data)
 
 
-def test_to_numpy_with_defaults(data, request):
+def test_to_numpy_with_defaults(data):
     # GH49973
     result = data.to_numpy()