Skip to content

Commit 026a83e

Browse files
authored
PERF: ArrowExtensionArray.to_numpy (#49973)
1 parent 1488157 commit 026a83e

File tree

5 files changed

+92
-27
lines changed

5 files changed

+92
-27
lines changed

asv_bench/benchmarks/array.py

+38
Original file line number | Diff line number | Diff line change
@@ -92,3 +92,41 @@ def time_setitem_slice(self, multiple_chunks):
9292

9393
def time_tolist(self, multiple_chunks):
9494
self.array.tolist()
95+
96+
97+
class ArrowExtensionArray:
    """Benchmark ``to_numpy`` on pyarrow-backed arrays.

    Parametrized over the dtype string and over whether every other
    element is replaced with ``pd.NA`` before timing.
    """

    params = [
        [
            "boolean[pyarrow]",
            "float64[pyarrow]",
            "int64[pyarrow]",
            "string[pyarrow]",
            "timestamp[ns][pyarrow]",
        ],
        [False, True],
    ]
    param_names = ["dtype", "hasna"]

    def setup(self, dtype, hasna):
        """Build the array under test for one (dtype, hasna) combination.

        Raises NotImplementedError for a dtype string with no data factory.
        """
        N = 100_000

        # Ordered (dtype string, lazy factory) pairs; only the matching
        # factory is ever called, so unused data is never generated.
        factories = (
            (
                "boolean[pyarrow]",
                lambda: np.random.choice([True, False], N, replace=True),
            ),
            ("float64[pyarrow]", lambda: np.random.randn(N)),
            ("int64[pyarrow]", lambda: np.arange(N)),
            ("string[pyarrow]", lambda: tm.rands_array(10, N)),
            (
                "timestamp[ns][pyarrow]",
                lambda: pd.date_range("2000-01-01", freq="s", periods=N),
            ),
        )
        for candidate, make_data in factories:
            if dtype == candidate:
                data = make_data()
                break
        else:
            raise NotImplementedError

        arr = pd.array(data, dtype=dtype)
        if hasna:
            # Null out every other element.
            arr[::2] = pd.NA
        self.arr = arr

    def time_to_numpy(self, dtype, hasna):
        """Time ``to_numpy`` with default arguments on the prepared array."""
        self.arr.to_numpy()

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -748,6 +748,7 @@ Performance improvements
748748
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
749749
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
750750
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
751+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
751752
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
752753
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
753754
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

pandas/core/arrays/arrow/array.py

+34
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,13 @@
99

1010
import numpy as np
1111

12+
from pandas._libs import lib
1213
from pandas._typing import (
1314
ArrayLike,
1415
Dtype,
1516
FillnaOptions,
1617
Iterator,
18+
NpDtype,
1719
PositionalIndexer,
1820
SortKind,
1921
TakeIndexer,
@@ -31,6 +33,7 @@
3133
is_bool_dtype,
3234
is_integer,
3335
is_integer_dtype,
36+
is_object_dtype,
3437
is_scalar,
3538
)
3639
from pandas.core.dtypes.missing import isna
@@ -351,6 +354,10 @@ def __arrow_array__(self, type=None):
351354
"""Convert myself to a pyarrow ChunkedArray."""
352355
return self._data
353356

357+
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """Return the ndarray form of this array for ``np.asarray()``.

    Delegates to ``to_numpy``, forwarding ``dtype`` unchanged.
    """
    as_ndarray = self.to_numpy(dtype=dtype)
    return as_ndarray
360+
354361
def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
355362
return type(self)(pc.invert(self._data))
356363

@@ -749,6 +756,33 @@ def take(
749756
indices_array[indices_array < 0] += len(self._data)
750757
return type(self)(self._data.take(indices_array))
751758

759+
@doc(ExtensionArray.to_numpy)
def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = lib.no_default,
) -> np.ndarray:
    # Read the null flag once; nothing below modifies the backing data.
    has_nulls = self._hasna

    # No dtype requested but nulls present: use object dtype
    # (presumably so that na_value can be stored in the result).
    if has_nulls and dtype is None:
        dtype = object
    if na_value is lib.no_default:
        na_value = self.dtype.na_value

    arrow_type = self._data.type
    build_from_scalars = (
        is_object_dtype(dtype)
        or pa.types.is_timestamp(arrow_type)
        or pa.types.is_duration(arrow_type)
    )
    if build_from_scalars:
        # Object output and timestamp/duration data are built from the
        # scalars produced by iterating this array; np.array on a fresh
        # list already yields a new array, so `copy` needs no handling.
        result = np.array(list(self), dtype=dtype)
    else:
        result = np.asarray(self._data, dtype=dtype)
        # The result is written to below when nulls are present, so copy
        # first (presumably np.asarray may hand back a shared buffer).
        if copy or has_nulls:
            result = result.copy()

    if has_nulls:
        # Overwrite the null positions with the requested missing marker.
        result[self.isna()] = na_value
    return result
785+
752786
def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
753787
"""
754788
Compute the ArrowExtensionArray of unique values.

pandas/core/arrays/string_arrow.py

+2 -27
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
)
1313
from pandas._typing import (
1414
Dtype,
15-
NpDtype,
1615
Scalar,
1716
npt,
1817
)
@@ -151,31 +150,6 @@ def dtype(self) -> StringDtype: # type: ignore[override]
151150
"""
152151
return self._dtype
153152

154-
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
155-
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
156-
return self.to_numpy(dtype=dtype)
157-
158-
def to_numpy(
159-
self,
160-
dtype: npt.DTypeLike | None = None,
161-
copy: bool = False,
162-
na_value=lib.no_default,
163-
) -> np.ndarray:
164-
"""
165-
Convert to a NumPy ndarray.
166-
"""
167-
# TODO: copy argument is ignored
168-
169-
result = np.array(self._data, dtype=dtype)
170-
if self._data.null_count > 0:
171-
if na_value is lib.no_default:
172-
if dtype and np.issubdtype(dtype, np.floating):
173-
return result
174-
na_value = self._dtype.na_value
175-
mask = self.isna()
176-
result[mask] = na_value
177-
return result
178-
179153
def insert(self, loc: int, item) -> ArrowStringArray:
180154
if not isinstance(item, str) and item is not libmissing.NA:
181155
raise TypeError("Scalar must be NA or str")
@@ -219,10 +193,11 @@ def astype(self, dtype, copy: bool = True):
219193
if copy:
220194
return self.copy()
221195
return self
222-
223196
elif isinstance(dtype, NumericDtype):
224197
data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
225198
return dtype.__from_arrow__(data)
199+
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
200+
return self.to_numpy(dtype=dtype, na_value=np.nan)
226201

227202
return super().astype(dtype, copy=copy)
228203

pandas/tests/extension/test_arrow.py

+17
Original file line number | Diff line number | Diff line change
@@ -1421,3 +1421,20 @@ def test_astype_from_non_pyarrow(data):
14211421
assert not isinstance(pd_array.dtype, ArrowDtype)
14221422
assert isinstance(result.dtype, ArrowDtype)
14231423
tm.assert_extension_array_equal(result, data)
1424+
1425+
1426+
def test_to_numpy_with_defaults(data):
    """Check ``to_numpy()`` with default arguments against a hand-built array."""
    # GH49973
    result = data.to_numpy()

    arrow_type = data._data.type
    is_temporal = pa.types.is_duration(arrow_type) or pa.types.is_timestamp(
        arrow_type
    )
    # Duration/timestamp data is compared via the array's own scalars;
    # everything else via the underlying pyarrow data.
    expected = np.array(list(data) if is_temporal else data._data)

    if data._hasna:
        # With missing values the expected result is object dtype with
        # pd.NA at the null positions.
        expected = expected.astype(object)
        expected[pd.isna(data)] = pd.NA

    tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)