From 0f99ac3dac8b6ae83c2c43175c274af81670d4ea Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 7 Feb 2023 21:00:55 -0500 Subject: [PATCH 1/4] PERF: ArrowExtensionArray.to_numpy(dtype=object) --- pandas/core/arrays/arrow/array.py | 10 +++++----- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a4cde823c6713..82df2f5eac2fa 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -838,12 +838,12 @@ def to_numpy( na_value = self.dtype.na_value pa_type = self._data.type - if ( - is_object_dtype(dtype) - or pa.types.is_timestamp(pa_type) - or pa.types.is_duration(pa_type) - ): + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = np.array(list(self), dtype=dtype) + elif is_object_dtype(dtype) and self._hasna: + result = np.empty(len(self), dtype=object) + mask = ~self.isna() + result[mask] = np.asarray(self[mask]._data) else: result = np.asarray(self._data, dtype=dtype) if copy or self._hasna: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 522a0d59e4161..6b7fd578c6f6d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1525,6 +1525,16 @@ def test_to_numpy_with_defaults(data): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_int_with_na(): + # ensure to_numpy does not convert int to float + data = [1, None] + arr = pd.array(data, dtype="int64[pyarrow]") + result = arr.to_numpy() + expected = np.array([1, pd.NA], dtype=object) + assert isinstance(result[0], int) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() From f098c589d11a34d34b7663d28380b107f13ef2cf Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 7 Feb 2023 21:13:06 -0500 Subject: [PATCH 2/4] gh refs --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/extension/test_arrow.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7d028935ad175..d7787e50d8179 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1061,7 +1061,7 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`) - Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`, :issue:`51227`) - Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6b7fd578c6f6d..4a6705e13032b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1526,7 +1526,7 @@ def test_to_numpy_with_defaults(data): def test_to_numpy_int_with_na(): - # ensure to_numpy does not convert int to float + # GH51227: ensure to_numpy does not convert int to float data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() From f25ccba759e2f852ed7e21c03fa67fa76bdaf6d2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 11 Feb 2023 21:58:26 -0500 Subject: [PATCH 3/4] fix --- pandas/core/arrays/arrow/array.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 93aaf3e4d0bc3..028b49a20c1dd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -865,7 +865,11 @@ def to_numpy( na_value = self.dtype.na_value pa_type = self._data.type - if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): + if ( + pa.types.is_time(pa_type) + or pa.types.is_timestamp(pa_type) + or pa.types.is_duration(pa_type) + ): result = np.array(list(self), dtype=dtype) elif is_object_dtype(dtype) and self._hasna: result = np.empty(len(self), dtype=object) From 3748f5a36d70dcdce5d5b84b2fd27f91feb0fbf8 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 14 Feb 2023 19:29:17 -0500 Subject: [PATCH 4/4] cleanup / add comment --- pandas/core/arrays/arrow/array.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 028b49a20c1dd..540ce016a7fde 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -865,11 +865,10 @@ def to_numpy( na_value = self.dtype.na_value pa_type = self._data.type - if ( - pa.types.is_time(pa_type) - or pa.types.is_timestamp(pa_type) - or pa.types.is_duration(pa_type) - ): + if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type): + # temporal types with units and/or timezones currently + # require pandas/python scalars to pass all tests + # TODO: improve performance (this is slow) result = np.array(list(self), dtype=dtype) elif is_object_dtype(dtype) and self._hasna: result = np.empty(len(self), dtype=object)