Skip to content

BUG: ArrowExtensionArray.to_numpy avoid object dtype when na_value provided #54843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 27 additions & 23 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import can_hold_element
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer,
is_list_like,
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
Expand Down Expand Up @@ -1240,46 +1240,50 @@ def to_numpy(
) -> np.ndarray:
if dtype is not None:
dtype = np.dtype(dtype)
elif self._hasna:
dtype = np.dtype(object)

if na_value is lib.no_default:
na_value = self.dtype.na_value

pa_type = self._pa_array.type
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
data = self
else:
data = self.fillna(na_value)
copy = False

if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
result = self._maybe_convert_datelike_array()
result = data._maybe_convert_datelike_array()
if dtype is None or dtype.kind == "O":
result = result.to_numpy(dtype=object, na_value=na_value)
else:
result = result.to_numpy(dtype=dtype)
return result
elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type):
# convert to list of python datetime.time objects before
# wrapping in ndarray
result = np.array(list(self), dtype=dtype)
elif is_object_dtype(dtype) and self._hasna:
result = np.empty(len(self), dtype=object)
mask = ~self.isna()
result[mask] = np.asarray(self[mask]._pa_array)
elif pa.types.is_null(self._pa_array.type):
fill_value = None if isna(na_value) else na_value
return np.full(len(self), fill_value=fill_value, dtype=dtype)
elif self._hasna:
data = self.fillna(na_value)
result = np.array(list(data), dtype=dtype)
if data._hasna:
result[data.isna()] = na_value
elif pa.types.is_null(pa_type):
if dtype is not None and isna(na_value):
na_value = None
result = np.full(len(data), fill_value=na_value, dtype=dtype)
elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan):
result = data._pa_array.to_numpy()
if dtype is not None:
result = result.astype(dtype, copy=False)
return result
else:
result = self._pa_array.to_numpy()
if dtype is not None:
result = result.astype(dtype, copy=False)
if copy:
result = result.copy()
return result
if self._hasna:
result[self.isna()] = na_value
else:
if dtype is None:
empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False)
if can_hold_element(empty, na_value):
dtype = empty.dtype
else:
dtype = np.object_
result = np.empty(len(data), dtype=dtype)
mask = data.isna()
result[mask] = na_value
result[~mask] = data[~mask]._pa_array.to_numpy()
return result

def unique(self) -> Self:
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,16 @@ def test_astype_int(dtype):
tm.assert_numpy_array_equal(result, expected)

arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number"
with pytest.raises(TypeError, match=msg):
if dtype.storage == "pyarrow_numpy":
err = ValueError
msg = "cannot convert float NaN to integer"
else:
err = TypeError
msg = (
r"int\(\) argument must be a string, a bytes-like "
r"object or a( real)? number"
)
with pytest.raises(err, match=msg):
arr.astype("int64")


Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1595,6 +1595,19 @@ def test_to_numpy_null_array_no_dtype():
tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_without_dtype():
# GH 54808
arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]")
result = arr.to_numpy(na_value=False)
expected = np.array([True, False], dtype=np.bool_)
tm.assert_numpy_array_equal(result, expected)

arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]")
result = arr.to_numpy(na_value=0.0)
expected = np.array([1.0, 0.0], dtype=np.float32)
tm.assert_numpy_array_equal(result, expected)


def test_setitem_null_slice(data):
# GH50248
orig = data.copy()
Expand Down