From eeca9e3b1baadea5b93943c9f62e06cddd6c89b1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 2 Dec 2023 00:31:27 +0100 Subject: [PATCH 1/8] Convert ArrowExtensionArray to proper NumPy dtype --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/_utils.py | 41 +++++++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 16 +++++++---- pandas/core/arrays/masked.py | 34 ++-------------------- pandas/tests/extension/test_arrow.py | 16 +++++++++-- scripts/validate_unwanted_patterns.py | 1 + 6 files changed, 69 insertions(+), 41 deletions(-) create mode 100644 pandas/core/arrays/_utils.py diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ade87c4215a38..e542e2943d2b8 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -111,7 +111,7 @@ ExtensionArray.to_numpy converts to suitable NumPy dtype ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`ExtensionArray.to_numpy`` will now convert to a suitable NumPy dtype instead -of ``object`` dtype for nullable extension dtypes. +of ``object`` dtype for nullable and PyArrow backed extension dtypes. *Old behavior:* diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py new file mode 100644 index 0000000000000..26ad8026c1ebf --- /dev/null +++ b/pandas/core/arrays/_utils.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import numpy as np + +from pandas._libs import lib +from pandas.errors import LossySetitemError + +from pandas.core.dtypes.cast import np_can_hold_element +from pandas.core.dtypes.common import is_numeric_dtype + + +def _to_numpy_dtype_inference(arr, dtype, na_value, hasna): + if dtype is None and is_numeric_dtype(arr.dtype): + dtype_given = False + if hasna: + if arr.dtype.kind == "b": + dtype = object + else: + if arr.dtype.kind in "iu": + dtype = np.dtype(np.float64) + else: + dtype = arr.dtype.numpy_dtype + if na_value is lib.no_default: + na_value = np.nan + else: + dtype = arr.dtype.numpy_dtype + elif dtype is not None: + dtype = np.dtype(dtype) + dtype_given = True + else: + dtype_given = True + + if na_value is lib.no_default: + na_value = arr.dtype.na_value + + if not dtype_given and hasna: + try: + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + except LossySetitemError: + dtype = object + return dtype, na_value diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d162b66e5d369..5d5a5631db16c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -38,6 +38,7 @@ is_bool_dtype, is_integer, is_list_like, + is_numeric_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -48,8 +49,10 @@ missing, roperator, ) +from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin +from pandas.core.arrays._utils import _to_numpy_dtype_inference from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -1259,12 +1262,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - if dtype is not None: - dtype = np.dtype(dtype) - - if na_value is lib.no_default: - na_value = self.dtype.na_value - + dtype, na_value = _to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self @@ -1307,6 +1305,12 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + def map(self, mapper, na_action=None): + if is_numeric_dtype(self.dtype): + return map_array(self.to_numpy(), mapper, na_action=None) + else: + return super().map(mapper, na_action) + @doc(ExtensionArray.duplicated) def duplicated( self, keep: Literal["first", "last", False] = "first" diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 201ce44ed0163..25fa091ed9cbd 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -38,15 +38,11 @@ IS64, is_platform_windows, ) -from pandas.errors import ( - AbstractMethodError, - LossySetitemError, -) +from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.cast import np_can_hold_element from pandas.core.dtypes.common import ( is_bool, is_integer_dtype, @@ -83,6 +79,7 @@ ) from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._utils import _to_numpy_dtype_inference from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import ( array as pd_array, @@ -479,32 +476,7 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - - if dtype is None: - dtype_given = False - if hasna: - if self.dtype.kind == "b": - dtype = object - else: - if self.dtype.kind in "iu": - dtype = np.dtype(np.float64) - else: - dtype = self.dtype.numpy_dtype - if na_value is lib.no_default: - na_value = np.nan - else: - dtype = self.dtype.numpy_dtype - else: - dtype = np.dtype(dtype) - dtype_given = True - if na_value is lib.no_default: - na_value = libmissing.NA - - if not dtype_given and hasna: - try: - np_can_hold_element(dtype, na_value) # type: ignore[arg-type] - except LossySetitemError: - dtype = object + dtype, na_value = _to_numpy_dtype_inference(self, dtype, na_value, hasna) if hasna: if ( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7131a50956a7d..67fefb766ef6c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -266,6 +266,16 @@ def data_for_twos(data): class TestArrowArray(base.ExtensionTests): + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + result = data_missing.map(lambda x: x, na_action=na_action) + if data_missing.dtype == "float32[pyarrow]": + # map roundtrips through objects, which converts to float64 + expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) + else: + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): @@ -1489,7 +1499,7 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna: + if data._hasna and not is_numeric_dtype(data.dtype): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA @@ -1501,8 +1511,8 @@ def test_to_numpy_int_with_na(): data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, pd.NA], dtype=object) - assert isinstance(result[0], int) + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 89b67ddd9f5b6..a225834b6862a 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", + "_to_numpy_dtype_inference", } From 455df4891aadf35fe57b390ca0a641c93392d3bc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 2 Dec 2023 00:35:16 +0100 Subject: [PATCH 2/8] Add test --- pandas/tests/series/test_npfuncs.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py index 08950db25b282..11a51c4700d5c 100644 --- a/pandas/tests/series/test_npfuncs.py +++ b/pandas/tests/series/test_npfuncs.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Series import pandas._testing as tm @@ -33,3 +35,12 @@ def test_numpy_argwhere(index): expected = np.array([[3], [4]], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_log_arrow_backed_missing_value(): + # GH#56285 + ser = Series([1, 2, None], dtype="float64[pyarrow]") + result = np.log(ser) + expected = np.log(Series([1, 2, None], dtype="float64")) + tm.assert_series_equal(result, expected) From 3bbfab54147e4968f8e788579c1ba1107401585f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:06:03 +0100 Subject: [PATCH 3/8] Fix typing --- pandas/core/arrays/_utils.py | 2 +- pandas/core/arrays/arrow/array.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 26ad8026c1ebf..9988363646cf5 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -35,7 +35,7 @@ def _to_numpy_dtype_inference(arr, dtype, na_value, hasna): if not dtype_given and hasna: try: - np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + np_can_hold_element(dtype, na_value) except LossySetitemError: dtype = object return dtype, na_value diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5d5a5631db16c..8e5cc8496dfd6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1272,7 +1272,7 @@ def to_numpy( if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = data._maybe_convert_datelike_array() - if dtype is None or dtype.kind == "O": + if dtype is None or dtype.kind == "O": # type: ignore[union-attr] result = result.to_numpy(dtype=object, na_value=na_value) else: result = result.to_numpy(dtype=dtype) From 7b1122fb83c087bb8b5b88bc28a4a71a635aedec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 23:12:45 +0100 Subject: [PATCH 4/8] Add types --- pandas/core/arrays/_utils.py | 12 +++++++++++- pandas/core/arrays/arrow/array.py | 4 ++-- pandas/core/arrays/masked.py | 4 ++-- scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 9988363646cf5..a58f9a94406f6 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -1,15 +1,25 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import numpy as np from pandas._libs import lib + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Dtype, + ) from pandas.errors import LossySetitemError from pandas.core.dtypes.cast import np_can_hold_element from pandas.core.dtypes.common import is_numeric_dtype -def _to_numpy_dtype_inference(arr, dtype, na_value, hasna): +def to_numpy_dtype_inference( + arr: ArrayLike, dtype: Dtype | None, na_value, hasna: bool +) -> np.ndarray: if dtype is None and is_numeric_dtype(arr.dtype): dtype_given = False if hasna: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8a75cdf635f01..92657c8a3d5be 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -53,7 +53,7 @@ from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin -from pandas.core.arrays._utils import _to_numpy_dtype_inference +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -1269,7 +1269,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - dtype, na_value = _to_numpy_dtype_inference(self, dtype, na_value, self._hasna) + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 25fa091ed9cbd..6dc51ca213a66 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -79,7 +79,7 @@ ) from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.arraylike import OpsMixin -from pandas.core.arrays._utils import _to_numpy_dtype_inference +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import ( array as pd_array, @@ -476,7 +476,7 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - dtype, na_value = _to_numpy_dtype_inference(self, dtype, na_value, hasna) + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) if hasna: if ( diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index a225834b6862a..89b67ddd9f5b6 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,7 +58,6 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", - "_to_numpy_dtype_inference", } From 513ad7f7eaaf95917a808963a4b8866b7dc5e104 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 23:13:06 +0100 Subject: [PATCH 5/8] Add types --- pandas/core/arrays/_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index a58f9a94406f6..a066034ec61f6 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -5,16 +5,16 @@ import numpy as np from pandas._libs import lib +from pandas.errors import LossySetitemError + +from pandas.core.dtypes.cast import np_can_hold_element +from pandas.core.dtypes.common import is_numeric_dtype if TYPE_CHECKING: from pandas._typing import ( ArrayLike, Dtype, ) -from pandas.errors import LossySetitemError - -from pandas.core.dtypes.cast import np_can_hold_element -from pandas.core.dtypes.common import is_numeric_dtype def to_numpy_dtype_inference( From eaf42115d6a210b55534f19845e1c9a8f414ed75 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 10 Dec 2023 22:52:04 +0100 Subject: [PATCH 6/8] Fix typing --- pandas/core/arrays/_utils.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index a066034ec61f6..ba4d403202778 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -13,27 +16,27 @@ if TYPE_CHECKING: from pandas._typing import ( ArrayLike, - Dtype, + npt, ) def to_numpy_dtype_inference( - arr: ArrayLike, dtype: Dtype | None, na_value, hasna: bool -) -> np.ndarray: + arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool +) -> tuple[npt.DTypeLike, Any]: if dtype is None and is_numeric_dtype(arr.dtype): dtype_given = False if hasna: if arr.dtype.kind == "b": - dtype = object + dtype = np.dtype(np.object_) else: if arr.dtype.kind in "iu": dtype = np.dtype(np.float64) else: - dtype = arr.dtype.numpy_dtype + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] if na_value is lib.no_default: na_value = np.nan else: - dtype = arr.dtype.numpy_dtype + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) dtype_given = True @@ -43,9 +46,10 @@ def to_numpy_dtype_inference( if na_value is lib.no_default: na_value = arr.dtype.na_value + assert dtype is not None if not dtype_given and hasna: try: - np_can_hold_element(dtype, na_value) + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] except LossySetitemError: - dtype = object + dtype = np.dtype(np.object_) return dtype, na_value From a9fb2acb7f02a9a28f84de3f96eaa298b610a6f5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 10 Dec 2023 23:34:54 +0100 Subject: [PATCH 7/8] Fix typing --- pandas/core/arrays/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index ba4d403202778..c75ec7f843ed2 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -46,7 +46,6 @@ def to_numpy_dtype_inference( if na_value is lib.no_default: na_value = arr.dtype.na_value - assert dtype is not None if not dtype_given and hasna: try: np_can_hold_element(dtype, na_value) # type: ignore[arg-type] From 393cb3383c6ff154bfa6e6cfbd29a7b2c5a14583 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 22:15:57 +0100 Subject: [PATCH 8/8] Fix typing --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 38de874ffb112..e7ca7c12b71ee 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1315,7 +1315,7 @@ def to_numpy( if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = data._maybe_convert_datelike_array() if (pa.types.is_timestamp(pa_type) and pa_type.tz is not None) or ( - dtype is not None and dtype.kind == "O" + dtype is not None and dtype.kind == "O" # type: ignore[union-attr] ): dtype = object else: