Skip to content

Commit 563fa95

Browse files
lukemanley authored and mroeschke committed
BUG: ArrowExtensionArray.to_numpy avoid object dtype when na_value provided (pandas-dev#54843)
* ENH: ArrowExtensionArray.to_numpy to avoid object dtype when na_value provided
* refactor
* cleanup
* mypy
* fix
1 parent d71c2c2 commit 563fa95

File tree

3 files changed

+50
-25
lines changed

3 files changed

+50
-25
lines changed

pandas/core/arrays/arrow/array.py

+27-23
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,12 @@
2929
from pandas.util._decorators import doc
3030
from pandas.util._validators import validate_fillna_kwargs
3131

32+
from pandas.core.dtypes.cast import can_hold_element
3233
from pandas.core.dtypes.common import (
3334
is_array_like,
3435
is_bool_dtype,
3536
is_integer,
3637
is_list_like,
37-
is_object_dtype,
3838
is_scalar,
3939
)
4040
from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -1240,46 +1240,50 @@ def to_numpy(
12401240
) -> np.ndarray:
12411241
if dtype is not None:
12421242
dtype = np.dtype(dtype)
1243-
elif self._hasna:
1244-
dtype = np.dtype(object)
12451243

12461244
if na_value is lib.no_default:
12471245
na_value = self.dtype.na_value
12481246

12491247
pa_type = self._pa_array.type
1248+
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
1249+
data = self
1250+
else:
1251+
data = self.fillna(na_value)
1252+
copy = False
1253+
12501254
if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
1251-
result = self._maybe_convert_datelike_array()
1255+
result = data._maybe_convert_datelike_array()
12521256
if dtype is None or dtype.kind == "O":
12531257
result = result.to_numpy(dtype=object, na_value=na_value)
12541258
else:
12551259
result = result.to_numpy(dtype=dtype)
1256-
return result
12571260
elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type):
12581261
# convert to list of python datetime.time objects before
12591262
# wrapping in ndarray
1260-
result = np.array(list(self), dtype=dtype)
1261-
elif is_object_dtype(dtype) and self._hasna:
1262-
result = np.empty(len(self), dtype=object)
1263-
mask = ~self.isna()
1264-
result[mask] = np.asarray(self[mask]._pa_array)
1265-
elif pa.types.is_null(self._pa_array.type):
1266-
fill_value = None if isna(na_value) else na_value
1267-
return np.full(len(self), fill_value=fill_value, dtype=dtype)
1268-
elif self._hasna:
1269-
data = self.fillna(na_value)
1263+
result = np.array(list(data), dtype=dtype)
1264+
if data._hasna:
1265+
result[data.isna()] = na_value
1266+
elif pa.types.is_null(pa_type):
1267+
if dtype is not None and isna(na_value):
1268+
na_value = None
1269+
result = np.full(len(data), fill_value=na_value, dtype=dtype)
1270+
elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan):
12701271
result = data._pa_array.to_numpy()
1271-
if dtype is not None:
1272-
result = result.astype(dtype, copy=False)
1273-
return result
1274-
else:
1275-
result = self._pa_array.to_numpy()
12761272
if dtype is not None:
12771273
result = result.astype(dtype, copy=False)
12781274
if copy:
12791275
result = result.copy()
1280-
return result
1281-
if self._hasna:
1282-
result[self.isna()] = na_value
1276+
else:
1277+
if dtype is None:
1278+
empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False)
1279+
if can_hold_element(empty, na_value):
1280+
dtype = empty.dtype
1281+
else:
1282+
dtype = np.object_
1283+
result = np.empty(len(data), dtype=dtype)
1284+
mask = data.isna()
1285+
result[mask] = na_value
1286+
result[~mask] = data[~mask]._pa_array.to_numpy()
12831287
return result
12841288

12851289
def unique(self) -> Self:

pandas/tests/arrays/string_/test_string.py

+10-2
Original file line number | Diff line number | Diff line change
@@ -377,8 +377,16 @@ def test_astype_int(dtype):
377377
tm.assert_numpy_array_equal(result, expected)
378378

379379
arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
380-
msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number"
381-
with pytest.raises(TypeError, match=msg):
380+
if dtype.storage == "pyarrow_numpy":
381+
err = ValueError
382+
msg = "cannot convert float NaN to integer"
383+
else:
384+
err = TypeError
385+
msg = (
386+
r"int\(\) argument must be a string, a bytes-like "
387+
r"object or a( real)? number"
388+
)
389+
with pytest.raises(err, match=msg):
382390
arr.astype("int64")
383391

384392

pandas/tests/extension/test_arrow.py

+13
Original file line number | Diff line number | Diff line change
@@ -1595,6 +1595,19 @@ def test_to_numpy_null_array_no_dtype():
15951595
tm.assert_numpy_array_equal(result, expected)
15961596

15971597

1598+
def test_to_numpy_without_dtype():
1599+
# GH 54808
1600+
arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]")
1601+
result = arr.to_numpy(na_value=False)
1602+
expected = np.array([True, False], dtype=np.bool_)
1603+
tm.assert_numpy_array_equal(result, expected)
1604+
1605+
arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]")
1606+
result = arr.to_numpy(na_value=0.0)
1607+
expected = np.array([1.0, 0.0], dtype=np.float32)
1608+
tm.assert_numpy_array_equal(result, expected)
1609+
1610+
15981611
def test_setitem_null_slice(data):
15991612
# GH50248
16001613
orig = data.copy()

0 commit comments

Comments
 (0)