Skip to content

PERF: ArrowExtensionArray.to_numpy #49973

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 16, 2022
38 changes: 38 additions & 0 deletions asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,41 @@ def time_setitem_slice(self, multiple_chunks):

def time_tolist(self, multiple_chunks):
    # Timed body: convert the array held on ``self.array`` to a Python list.
    # ``multiple_chunks`` is not used here; presumably it is the benchmark
    # parameter consumed when the array was built -- TODO confirm in setup.
    self.array.tolist()


class ArrowExtensionArray:
    """
    Benchmark ``to_numpy`` on pyarrow-backed arrays.

    Every combination of the dtype strings below and the ``hasna`` flag is
    benchmarked; ``hasna=True`` blanks out every other element before timing.
    """

    params = [
        [
            "boolean[pyarrow]",
            "float64[pyarrow]",
            "int64[pyarrow]",
            "string[pyarrow]",
            "timestamp[ns][pyarrow]",
        ],
        [False, True],
    ]
    param_names = ["dtype", "hasna"]

    @staticmethod
    def _make_data(dtype, size):
        # Build ``size`` raw values suitable for the requested dtype string.
        if dtype == "boolean[pyarrow]":
            return np.random.choice([True, False], size, replace=True)
        if dtype == "float64[pyarrow]":
            return np.random.randn(size)
        if dtype == "int64[pyarrow]":
            return np.arange(size)
        if dtype == "string[pyarrow]":
            return tm.rands_array(10, size)
        if dtype == "timestamp[ns][pyarrow]":
            return pd.date_range("2000-01-01", freq="s", periods=size)
        # Unknown dtype string: there is no data recipe for it.
        raise NotImplementedError

    def setup(self, dtype, hasna):
        values = self._make_data(dtype, 100_000)
        array = pd.array(values, dtype=dtype)
        if hasna:
            # Mark every second element as missing.
            array[::2] = pd.NA
        self.arr = array

    def time_to_numpy(self, dtype, hasna):
        # Timed body: default conversion to a NumPy array.
        self.arr.to_numpy()
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,7 @@ Performance improvements
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
Expand Down
34 changes: 34 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import (
ArrayLike,
Dtype,
FillnaOptions,
Iterator,
NpDtype,
PositionalIndexer,
SortKind,
TakeIndexer,
Expand All @@ -31,6 +33,7 @@
is_bool_dtype,
is_integer,
is_integer_dtype,
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -351,6 +354,10 @@ def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow ChunkedArray."""
return self._data

def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    Correctly construct numpy arrays when passed to `np.asarray()`.

    Delegates to ``self.to_numpy``, forwarding ``dtype`` unchanged; all
    other ``to_numpy`` arguments keep their defaults.
    """
    return self.to_numpy(dtype=dtype)

def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
    """
    Implement ``~self``.

    Applies ``pc.invert`` to the underlying ``self._data`` and wraps the
    result in a new instance of the same class.
    """
    return type(self)(pc.invert(self._data))

Expand Down Expand Up @@ -749,6 +756,33 @@ def take(
indices_array[indices_array < 0] += len(self._data)
return type(self)(self._data.take(indices_array))

@doc(ExtensionArray.to_numpy)
def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = lib.no_default,
) -> np.ndarray:
    # NOTE: deliberately no docstring here; presumably ``@doc`` supplies the
    # one from ``ExtensionArray.to_numpy`` -- TODO confirm.
    has_na = self._hasna
    if has_na and dtype is None:
        # With missing values and no requested dtype, fall back to object.
        dtype = object
    # Only look up the dtype's NA sentinel when the caller gave no na_value.
    fill = self.dtype.na_value if na_value is lib.no_default else na_value

    arrow_type = self._data.type
    via_python_objects = (
        is_object_dtype(dtype)
        or pa.types.is_timestamp(arrow_type)
        or pa.types.is_duration(arrow_type)
    )
    if via_python_objects:
        # Object output and timestamp/duration data are built element by
        # element from the scalars yielded when iterating over ``self``.
        out = np.array(list(self), dtype=dtype)
    else:
        out = np.asarray(self._data, dtype=dtype)
        if copy or has_na:
            # Copy on request, and always before the masked write below
            # (presumably because asarray may hand back a non-owned or
            # read-only buffer -- TODO confirm).
            out = out.copy()
    if has_na:
        out[self.isna()] = fill
    return out

def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
"""
Compute the ArrowExtensionArray of unique values.
Expand Down
29 changes: 2 additions & 27 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
)
from pandas._typing import (
Dtype,
NpDtype,
Scalar,
npt,
)
Expand Down Expand Up @@ -151,31 +150,6 @@ def dtype(self) -> StringDtype: # type: ignore[override]
"""
return self._dtype

def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    Correctly construct numpy arrays when passed to `np.asarray()`.

    Delegates to ``self.to_numpy``, forwarding ``dtype`` unchanged; all
    other ``to_numpy`` arguments keep their defaults.
    """
    return self.to_numpy(dtype=dtype)

def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value=lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy ndarray.

    Parameters
    ----------
    dtype : dtype-like, optional
        Passed to ``np.array`` when converting the underlying data.
    copy : bool, default False
        Accepted for interface compatibility. The conversion goes through
        ``np.array``, which builds a new array by default, so the flag does
        not change the result.
    na_value : object, optional
        Value written at missing positions. When omitted, a floating
        ``dtype`` leaves the converted values untouched; any other dtype
        uses ``self._dtype.na_value``.

    Returns
    -------
    numpy.ndarray
    """
    result = np.array(self._data, dtype=dtype)
    if self._data.null_count == 0:
        # Nothing is missing, so there is nothing to fill.
        return result

    if na_value is lib.no_default:
        # Compare against None instead of relying on truthiness: np.dtype
        # instances define __len__ (their field count), so a plain float
        # dtype object can be falsy and would skip this shortcut.
        if dtype is not None and np.issubdtype(dtype, np.floating):
            return result
        na_value = self._dtype.na_value

    result[self.isna()] = na_value
    return result

def insert(self, loc: int, item) -> ArrowStringArray:
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError("Scalar must be NA or str")
Expand Down Expand Up @@ -219,10 +193,11 @@ def astype(self, dtype, copy: bool = True):
if copy:
return self.copy()
return self

elif isinstance(dtype, NumericDtype):
data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
return dtype.__from_arrow__(data)
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
return self.to_numpy(dtype=dtype, na_value=np.nan)

return super().astype(dtype, copy=copy)

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1421,3 +1421,20 @@ def test_astype_from_non_pyarrow(data):
assert not isinstance(pd_array.dtype, ArrowDtype)
assert isinstance(result.dtype, ArrowDtype)
tm.assert_extension_array_equal(result, data)


def test_to_numpy_with_defaults(data):
    # GH49973
    # Build the expected array independently of ``to_numpy``: timestamp and
    # duration data go through the array's own scalars, everything else
    # through a direct conversion of the underlying pyarrow data.
    arrow_type = data._data.type
    is_temporal = pa.types.is_duration(arrow_type) or pa.types.is_timestamp(
        arrow_type
    )
    expected = np.array(list(data)) if is_temporal else np.array(data._data)

    if data._hasna:
        # With missing values the default result is object dtype holding pd.NA.
        expected = expected.astype(object)
        expected[pd.isna(data)] = pd.NA

    result = data.to_numpy()
    tm.assert_numpy_array_equal(result, expected)