From d0214108d0e34e512787b096e2a73c2659cbc860 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 30 Nov 2022 07:40:31 -0500 Subject: [PATCH 1/7] PERF: ArrowExtensionArray.to_numpy --- asv_bench/benchmarks/array.py | 38 ++++++++++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 32 +++++++++++++++++++++++++ pandas/core/arrays/string_arrow.py | 26 -------------------- 3 files changed, 70 insertions(+), 26 deletions(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index cb949637ea745..924040ff0648b 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -92,3 +92,41 @@ def time_setitem_slice(self, multiple_chunks): def time_tolist(self, multiple_chunks): self.array.tolist() + + +class ArrowExtensionArray: + + params = [ + [ + "boolean[pyarrow]", + "float64[pyarrow]", + "int64[pyarrow]", + "string[pyarrow]", + "timestamp[ns][pyarrow]", + ], + [False, True], + ] + param_names = ["dtype", "hasna"] + + def setup(self, dtype, hasna): + N = 100_000 + if dtype == "boolean[pyarrow]": + data = np.random.choice([True, False], N, replace=True) + elif dtype == "float64[pyarrow]": + data = np.random.randn(N) + elif dtype == "int64[pyarrow]": + data = np.arange(N) + elif dtype == "string[pyarrow]": + data = tm.rands_array(10, N) + elif dtype == "timestamp[ns][pyarrow]": + data = pd.date_range("2000-01-01", freq="s", periods=N) + else: + raise NotImplementedError + + arr = pd.array(data, dtype=dtype) + if hasna: + arr[::2] = pd.NA + self.arr = arr + + def time_to_numpy(self, dtype, hasna): + self.arr.to_numpy() diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 254ff8894b36c..7e030b00875a8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -9,11 +9,13 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ( ArrayLike, Dtype, FillnaOptions, Iterator, + NpDtype, PositionalIndexer, SortKind, TakeIndexer, @@ -351,6 +353,10 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._data + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.to_numpy(dtype=dtype) + def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(pc.invert(self._data)) @@ -749,6 +755,32 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) + @doc(ExtensionArray.to_numpy) + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + # TODO: copy argument is ignored + + if na_value is lib.no_default: + na_value = self.dtype.na_value + + pa_type = self._data.type + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): + result = np.array(self.tolist(), dtype=np.object_) + elif not self._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + return np.array(self._data, dtype=dtype) + else: + result = np.array(self._data, dtype=np.object_) + + if self._hasna: + result[self.isna()] = na_value + if dtype is not None: + return result.astype(dtype, copy=False) + return result + def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ Compute the ArrowExtensionArray of unique values. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b8b1d64d7a093..dc23b16c3a452 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -12,7 +12,6 @@ ) from pandas._typing import ( Dtype, - NpDtype, Scalar, npt, ) @@ -151,31 +150,6 @@ def dtype(self) -> StringDtype: # type: ignore[override] """ return self._dtype - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: - """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) - - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert to a NumPy ndarray. - """ - # TODO: copy argument is ignored - - result = np.array(self._data, dtype=dtype) - if self._data.null_count > 0: - if na_value is lib.no_default: - if dtype and np.issubdtype(dtype, np.floating): - return result - na_value = self._dtype.na_value - mask = self.isna() - result[mask] = na_value - return result - def insert(self, loc: int, item) -> ArrowStringArray: if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") From df3c9ced79d0a26ec8ea1778c6df1074ed7ef2fb Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 30 Nov 2022 07:53:24 -0500 Subject: [PATCH 2/7] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1fb9a81e85a83..56fde8daeedf7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -603,6 +603,7 @@ Performance improvements - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). From 49d0d20f3f6c71bc2892fafc61c47be4cf463e74 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 30 Nov 2022 19:46:51 -0500 Subject: [PATCH 3/7] fix failing test --- pandas/core/arrays/arrow/array.py | 4 +++- pandas/core/arrays/string_arrow.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e030b00875a8..d43692742efd4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -770,7 +770,9 @@ def to_numpy( pa_type = self._data.type if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = np.array(self.tolist(), dtype=np.object_) - elif not self._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + elif not self._hasna or ( + np.issubdtype(float, np.floating) and na_value is np.nan + ): return np.array(self._data, dtype=dtype) else: result = np.array(self._data, dtype=np.object_) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index dc23b16c3a452..c79e2f752c5a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -193,10 +193,11 @@ def astype(self, dtype, copy: bool = True): if copy: return self.copy() return self - elif isinstance(dtype, NumericDtype): data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) return dtype.__from_arrow__(data) + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): + return self.to_numpy(dtype=dtype, na_value=np.nan) return super().astype(dtype, copy=copy) From f425fc035f79d69e0f627e7ad07c0c2d5986d03c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 7 Dec 2022 07:40:33 -0500 Subject: [PATCH 4/7] default to pyarrow behavior in to_numpy --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 25 ++++++++++--------------- pandas/core/arrays/string_arrow.py | 12 ++++++++++++ pandas/tests/extension/test_arrow.py | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e10775b0ae161..efc2bd32ce7e3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -370,6 +370,7 @@ Other API changes - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Changed the default behavior of :meth:`.arrays.ArrowExtensionArray.to_numpy` to pyarrow's behavior in terms of numpy dtype and missing value representation for all types except ``duration`` and ``timestamp`` (:issue:`49973`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`) - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d43692742efd4..599b2cead2b9c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -33,6 +33,7 @@ is_bool_dtype, is_integer, is_integer_dtype, + is_object_dtype, is_scalar, ) from pandas.core.dtypes.missing import isna @@ -762,25 +763,19 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - # TODO: copy argument is ignored - - if na_value is lib.no_default: - na_value = self.dtype.na_value - pa_type = self._data.type - if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = np.array(self.tolist(), dtype=np.object_) - elif not self._hasna or ( - np.issubdtype(float, np.floating) and na_value is np.nan + if ( + is_object_dtype(dtype) + or pa.types.is_timestamp(pa_type) + or pa.types.is_duration(pa_type) ): - return np.array(self._data, dtype=dtype) + result = np.array(list(self), dtype=dtype) else: - result = np.array(self._data, dtype=np.object_) - - if self._hasna: + result = np.asarray(self._data, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if self._hasna and na_value is not lib.no_default: result[self.isna()] = na_value - if dtype is not None: - return result.astype(dtype, copy=False) return result def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c79e2f752c5a8..52397122da5a5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -150,6 +150,18 @@ def dtype(self) -> StringDtype: # type: ignore[override] """ return self._dtype + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + # TODO: should na_value default to pyarrow's behavior of None (vs. pd.NA)? + if na_value is lib.no_default: + if not (dtype and np.issubdtype(dtype, np.floating)): + na_value = self._dtype.na_value + return super().to_numpy(dtype=dtype, copy=copy, na_value=na_value) + def insert(self, loc: int, item) -> ArrowStringArray: if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e6f1675bb8bc8..e9a3e5e096ee4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1406,3 +1406,23 @@ def test_astype_from_non_pyarrow(data): assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) tm.assert_extension_array_equal(result, data) + + +def test_to_numpy_with_defaults_matches_pyarrow(data, request): + # GH49973 + pa_type = data._data.type + if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): + request.node.add_marker( + pytest.mark.xfail( + raises=AssertionError, + reason="numpy array are different", + ) + ) + result = data.to_numpy() + + # these should be equivalent + expected1 = np.array(data._data) + expected2 = data._data.to_numpy() + + tm.assert_numpy_array_equal(result, expected1) + tm.assert_numpy_array_equal(result, expected2) From 685d66ac2546521b68a2e23d4130b1b11b5e816a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 7 Dec 2022 19:00:09 -0500 Subject: [PATCH 5/7] use pd.NA for missing --- doc/source/whatsnew/v2.0.0.rst | 1 - pandas/core/arrays/arrow/array.py | 9 +++++++-- pandas/core/arrays/string_arrow.py | 12 ------------ pandas/tests/extension/test_arrow.py | 23 ++++++++++------------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index efc2bd32ce7e3..e10775b0ae161 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -370,7 +370,6 @@ Other API changes - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) -- Changed the default behavior of :meth:`.arrays.ArrowExtensionArray.to_numpy` to pyarrow's behavior in terms of numpy dtype and missing value representation for all types except ``duration`` and ``timestamp`` (:issue:`49973`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`) - :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 599b2cead2b9c..d698c5eb11751 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -763,6 +763,11 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + if dtype is None and self._hasna: + dtype = object + if na_value is lib.no_default: + na_value = self.dtype.na_value + pa_type = self._data.type if ( is_object_dtype(dtype) @@ -772,9 +777,9 @@ def to_numpy( result = np.array(list(self), dtype=dtype) else: result = np.asarray(self._data, dtype=dtype) - if copy or na_value is not lib.no_default: + if copy or self._hasna: result = result.copy() - if self._hasna and na_value is not lib.no_default: + if self._hasna: result[self.isna()] = na_value return result diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 52397122da5a5..c79e2f752c5a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -150,18 +150,6 @@ def dtype(self) -> StringDtype: # type: ignore[override] """ return self._dtype - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - # TODO: should na_value default to pyarrow's behavior of None (vs. pd.NA)? - if na_value is lib.no_default: - if not (dtype and np.issubdtype(dtype, np.floating)): - na_value = self._dtype.na_value - return super().to_numpy(dtype=dtype, copy=copy, na_value=na_value) - def insert(self, loc: int, item) -> ArrowStringArray: if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e9a3e5e096ee4..82192b5f0a603 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1408,21 +1408,18 @@ def test_astype_from_non_pyarrow(data): tm.assert_extension_array_equal(result, data) -def test_to_numpy_with_defaults_matches_pyarrow(data, request): +def test_to_numpy_with_defaults(data, request): # GH49973 + result = data.to_numpy() + pa_type = data._data.type if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): - request.node.add_marker( - pytest.mark.xfail( - raises=AssertionError, - reason="numpy array are different", - ) - ) - result = data.to_numpy() + expected = np.array(list(data)) + else: - # these should be equivalent - expected1 = np.array(data._data) - expected2 = data._data.to_numpy() + expected = np.array(data._data) + if data._hasna: + expected = expected.astype(object) + expected[pd.isna(data)] = pd.NA - tm.assert_numpy_array_equal(result, expected1) - tm.assert_numpy_array_equal(result, expected2) + tm.assert_numpy_array_equal(result, expected) From 2f8e15645b9155292d5affdc25b04a7e1406f27d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 7 Dec 2022 19:07:46 -0500 Subject: [PATCH 6/7] fix spacing --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 82192b5f0a603..dfccd50cd136e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1416,8 +1416,8 @@ def test_to_numpy_with_defaults(data, request): if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): expected = np.array(list(data)) else: - expected = np.array(data._data) + if data._hasna: expected = expected.astype(object) expected[pd.isna(data)] = pd.NA From d6c3d57f3986b18d9aca54b1bde9ee3d2878e554 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Dec 2022 19:30:39 -0500 Subject: [PATCH 7/7] remove unused parameter in test --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dfccd50cd136e..a9577fc9ad8e6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1408,7 +1408,7 @@ def test_astype_from_non_pyarrow(data): tm.assert_extension_array_equal(result, data) -def test_to_numpy_with_defaults(data, request): +def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy()