Skip to content

Commit 819bcce

Browse files
phofl and cbpygit
authored and committed
Convert ArrowExtensionArray to proper NumPy dtype (pandas-dev#56290)
1 parent 0e6a4ef commit 819bcce

File tree

6 files changed

+89
-42
lines changed

6 files changed

+89
-42
lines changed

doc/source/whatsnew/v2.2.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ documentation.
194194
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
195195

196196
``to_numpy`` for NumPy nullable and Arrow types will now convert to a
197-
suitable NumPy dtype instead of ``object`` dtype for nullable extension dtypes.
197+
suitable NumPy dtype instead of ``object`` dtype for nullable and PyArrow-backed extension dtypes.
198198

199199
*Old behavior:*
200200

pandas/core/arrays/_utils.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from __future__ import annotations
2+
3+
from typing import (
4+
TYPE_CHECKING,
5+
Any,
6+
)
7+
8+
import numpy as np
9+
10+
from pandas._libs import lib
11+
from pandas.errors import LossySetitemError
12+
13+
from pandas.core.dtypes.cast import np_can_hold_element
14+
from pandas.core.dtypes.common import is_numeric_dtype
15+
16+
if TYPE_CHECKING:
17+
from pandas._typing import (
18+
ArrayLike,
19+
npt,
20+
)
21+
22+
23+
def to_numpy_dtype_inference(
    arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
    """
    Resolve the NumPy dtype and missing-value marker for a ``to_numpy`` call.

    Parameters
    ----------
    arr : ArrayLike
        Array being converted. Its ``dtype`` is read for ``kind``,
        ``numpy_dtype`` and ``na_value``.
    dtype : npt.DTypeLike or None
        Dtype requested by the caller, or ``None`` to infer one.
    na_value : object
        Fill value requested by the caller, or ``lib.no_default`` to pick one.
    hasna : bool
        Whether the caller reports that ``arr`` holds missing values.

    Returns
    -------
    tuple
        ``(dtype, na_value)``. ``dtype`` stays ``None`` when none was
        requested and ``arr.dtype`` is not numeric.

    Notes
    -----
    Inference only happens when ``dtype`` is ``None`` and ``arr.dtype`` is
    numeric:

    * without missing values, ``arr.dtype.numpy_dtype`` is used;
    * booleans with missing values become ``object``;
    * integers with missing values become ``float64``, other numeric kinds
      keep ``arr.dtype.numpy_dtype``; in both cases an unspecified
      ``na_value`` becomes ``np.nan``.

    An inferred dtype that cannot hold ``na_value`` is replaced by ``object``.
    """
    # Only an inferred dtype may later be widened to object; an explicitly
    # requested dtype (or the non-numeric passthrough) is left untouched.
    infer_dtype = dtype is None and is_numeric_dtype(arr.dtype)

    if infer_dtype:
        if not hasna:
            dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
        elif arr.dtype.kind == "b":
            dtype = np.dtype(np.object_)
        else:
            if arr.dtype.kind in "iu":
                dtype = np.dtype(np.float64)
            else:
                dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
            if na_value is lib.no_default:
                na_value = np.nan
    elif dtype is not None:
        dtype = np.dtype(dtype)

    if na_value is lib.no_default:
        na_value = arr.dtype.na_value

    if infer_dtype and hasna:
        try:
            np_can_hold_element(dtype, na_value)  # type: ignore[arg-type]
        except LossySetitemError:
            # The chosen fill value does not fit the inferred dtype.
            dtype = np.dtype(np.object_)
    return dtype, na_value

pandas/core/arrays/arrow/array.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
is_bool_dtype,
4040
is_integer,
4141
is_list_like,
42+
is_numeric_dtype,
4243
is_scalar,
4344
)
4445
from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -50,8 +51,10 @@
5051
ops,
5152
roperator,
5253
)
54+
from pandas.core.algorithms import map_array
5355
from pandas.core.arraylike import OpsMixin
5456
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
57+
from pandas.core.arrays._utils import to_numpy_dtype_inference
5558
from pandas.core.arrays.base import (
5659
ExtensionArray,
5760
ExtensionArraySupportsAnyAll,
@@ -1317,12 +1320,7 @@ def to_numpy(
13171320
copy: bool = False,
13181321
na_value: object = lib.no_default,
13191322
) -> np.ndarray:
1320-
if dtype is not None:
1321-
dtype = np.dtype(dtype)
1322-
1323-
if na_value is lib.no_default:
1324-
na_value = self.dtype.na_value
1325-
1323+
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
13261324
pa_type = self._pa_array.type
13271325
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
13281326
data = self
@@ -1366,6 +1364,12 @@ def to_numpy(
13661364
result[~mask] = data[~mask]._pa_array.to_numpy()
13671365
return result
13681366

1367+
def map(self, mapper, na_action=None):
1368+
if is_numeric_dtype(self.dtype):
1369+
return map_array(self.to_numpy(), mapper, na_action=None)
1370+
else:
1371+
return super().map(mapper, na_action)
1372+
13691373
@doc(ExtensionArray.duplicated)
13701374
def duplicated(
13711375
self, keep: Literal["first", "last", False] = "first"

pandas/core/arrays/masked.py

+3-31
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,11 @@
3535
IS64,
3636
is_platform_windows,
3737
)
38-
from pandas.errors import (
39-
AbstractMethodError,
40-
LossySetitemError,
41-
)
38+
from pandas.errors import AbstractMethodError
4239
from pandas.util._decorators import doc
4340
from pandas.util._validators import validate_fillna_kwargs
4441

4542
from pandas.core.dtypes.base import ExtensionDtype
46-
from pandas.core.dtypes.cast import np_can_hold_element
4743
from pandas.core.dtypes.common import (
4844
is_bool,
4945
is_integer_dtype,
@@ -80,6 +76,7 @@
8076
)
8177
from pandas.core.array_algos.quantile import quantile_with_mask
8278
from pandas.core.arraylike import OpsMixin
79+
from pandas.core.arrays._utils import to_numpy_dtype_inference
8380
from pandas.core.arrays.base import ExtensionArray
8481
from pandas.core.construction import (
8582
array as pd_array,
@@ -477,32 +474,7 @@ def to_numpy(
477474
array([ True, False, False])
478475
"""
479476
hasna = self._hasna
480-
481-
if dtype is None:
482-
dtype_given = False
483-
if hasna:
484-
if self.dtype.kind == "b":
485-
dtype = object
486-
else:
487-
if self.dtype.kind in "iu":
488-
dtype = np.dtype(np.float64)
489-
else:
490-
dtype = self.dtype.numpy_dtype
491-
if na_value is lib.no_default:
492-
na_value = np.nan
493-
else:
494-
dtype = self.dtype.numpy_dtype
495-
else:
496-
dtype = np.dtype(dtype)
497-
dtype_given = True
498-
if na_value is lib.no_default:
499-
na_value = libmissing.NA
500-
501-
if not dtype_given and hasna:
502-
try:
503-
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
504-
except LossySetitemError:
505-
dtype = object
477+
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
506478

507479
if hasna:
508480
if (

pandas/tests/extension/test_arrow.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,13 @@ def test_map(self, data_missing, na_action):
278278
expected = data_missing.to_numpy(dtype=object)
279279
tm.assert_numpy_array_equal(result, expected)
280280
else:
281-
super().test_map(data_missing, na_action)
281+
result = data_missing.map(lambda x: x, na_action=na_action)
282+
if data_missing.dtype == "float32[pyarrow]":
283+
# map roundtrips through objects, which converts to float64
284+
expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
285+
else:
286+
expected = data_missing.to_numpy()
287+
tm.assert_numpy_array_equal(result, expected)
282288

283289
def test_astype_str(self, data, request):
284290
pa_dtype = data.dtype.pyarrow_dtype
@@ -1585,7 +1591,7 @@ def test_to_numpy_with_defaults(data):
15851591
else:
15861592
expected = np.array(data._pa_array)
15871593

1588-
if data._hasna:
1594+
if data._hasna and not is_numeric_dtype(data.dtype):
15891595
expected = expected.astype(object)
15901596
expected[pd.isna(data)] = pd.NA
15911597

@@ -1597,8 +1603,8 @@ def test_to_numpy_int_with_na():
15971603
data = [1, None]
15981604
arr = pd.array(data, dtype="int64[pyarrow]")
15991605
result = arr.to_numpy()
1600-
expected = np.array([1, pd.NA], dtype=object)
1601-
assert isinstance(result[0], int)
1606+
expected = np.array([1, np.nan])
1607+
assert isinstance(result[0], float)
16021608
tm.assert_numpy_array_equal(result, expected)
16031609

16041610

pandas/tests/series/test_npfuncs.py

+11
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import numpy as np
66
import pytest
77

8+
import pandas.util._test_decorators as td
9+
810
from pandas import Series
911
import pandas._testing as tm
1012

@@ -33,3 +35,12 @@ def test_numpy_argwhere(index):
3335
expected = np.array([[3], [4]], dtype=np.int64)
3436

3537
tm.assert_numpy_array_equal(result, expected)
38+
39+
40+
@td.skip_if_no("pyarrow")
def test_log_arrow_backed_missing_value():
    """Check ``np.log`` on a PyArrow-backed float Series holding a missing value."""
    # GH#56285
    values = [1, 2, None]
    arrow_ser = Series(values, dtype="float64[pyarrow]")
    numpy_ser = Series(values, dtype="float64")
    # The ufunc result must equal the one computed on a NumPy float64 Series.
    tm.assert_series_equal(np.log(arrow_ser), np.log(numpy_ser))

0 commit comments

Comments
 (0)