Skip to content

REF: simplify sanitize_array #49347

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 49 additions & 28 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,8 +545,25 @@ def sanitize_array(
data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
return data

elif isinstance(data, ABCExtensionArray):
# it is already ensured above this is not a PandasArray
# Until GH#49309 is fixed this check needs to come before the
# ExtensionDtype check
if dtype is not None:
subarr = data.astype(dtype, copy=copy)
elif copy:
subarr = data.copy()
else:
subarr = data

elif isinstance(dtype, ExtensionDtype):
# create an extension array from its dtype
_sanitize_non_ordered(data)
cls = dtype.construct_array_type()
subarr = cls._from_sequence(data, dtype=dtype, copy=copy)

# GH#846
if isinstance(data, np.ndarray):
elif isinstance(data, np.ndarray):
if isinstance(data, np.matrix):
data = data.A

Expand All @@ -556,7 +573,10 @@ def sanitize_array(
# GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
# casting aligning with IntCastingNaNError below
with np.errstate(invalid="ignore"):
subarr = _try_cast(data, dtype, copy)
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
subarr = maybe_cast_to_integer_array(data, dtype)

except IntCastingNaNError:
warnings.warn(
"In a future version, passing float-dtype values containing NaN "
Expand All @@ -582,28 +602,27 @@ def sanitize_array(
# we will try to copy by-definition here
subarr = _try_cast(data, dtype, copy)

elif isinstance(data, ABCExtensionArray):
# it is already ensured above this is not a PandasArray
subarr = data

if dtype is not None:
subarr = subarr.astype(dtype, copy=copy)
elif copy:
subarr = subarr.copy()
elif hasattr(data, "__array__"):
# e.g. dask array GH#38645
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke one thought: here we could check for e.g. a pyarrow array and avoid the cast to ndarray. Most of the tests that get here do so with a ChunkedArray and eventually cast back to ArrowArray.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So publicly this is the case when one calls e.g. pd.Series(pa.chunked_array([[1, 2]]))?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay cool. Will put up a PR in the next few days

data = np.array(data, copy=copy)
return sanitize_array(
data,
index=index,
dtype=dtype,
copy=False,
allow_2d=allow_2d,
)

else:
if isinstance(data, (set, frozenset)):
# Raise only for unordered sets, e.g., not for dict_keys
raise TypeError(f"'{type(data).__name__}' type is unordered")

_sanitize_non_ordered(data)
# materialize e.g. generators, convert e.g. tuples, abc.ValueView
if hasattr(data, "__array__"):
# e.g. dask array GH#38645
data = np.array(data, copy=copy)
else:
data = list(data)
data = list(data)

if dtype is not None or len(data) == 0:
if len(data) == 0 and dtype is None:
# We default to float64, matching numpy
subarr = np.array([], dtype=np.float64)

elif dtype is not None:
try:
subarr = _try_cast(data, dtype, copy)
except ValueError:
Expand Down Expand Up @@ -658,6 +677,14 @@ def range_to_ndarray(rng: range) -> np.ndarray:
return arr


def _sanitize_non_ordered(data) -> None:
"""
Raise only for unordered sets, e.g., not for dict_keys
"""
if isinstance(data, (set, frozenset)):
raise TypeError(f"'{type(data).__name__}' type is unordered")


def _sanitize_ndim(
result: ArrayLike,
data,
Expand Down Expand Up @@ -728,7 +755,7 @@ def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:

def _try_cast(
arr: list | np.ndarray,
dtype: DtypeObj | None,
dtype: np.dtype | None,
copy: bool,
) -> ArrayLike:
"""
Expand All @@ -738,7 +765,7 @@ def _try_cast(
----------
arr : ndarray or list
Excludes: ExtensionArray, Series, Index.
dtype : np.dtype, ExtensionDtype or None
dtype : np.dtype or None
copy : bool
If False, don't copy the data if not needed.

Expand Down Expand Up @@ -771,12 +798,6 @@ def _try_cast(
return varr
return maybe_infer_to_datetimelike(varr)

elif isinstance(dtype, ExtensionDtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()._from_sequence
subarr = array_type(arr, dtype=dtype, copy=copy)
return subarr

elif is_object_dtype(dtype):
if not is_ndarray:
subarr = construct_1d_object_array_from_listlike(arr)
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
ArrayLike,
DtypeObj,
Manager,
npt,
)
from pandas.util._exceptions import find_stack_level

Expand Down Expand Up @@ -1032,7 +1033,7 @@ def _validate_or_indexify_columns(


def _convert_object_array(
content: list[np.ndarray], dtype: DtypeObj | None
content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None
) -> list[ArrayLike]:
"""
Internal function to convert object array.
Expand All @@ -1059,6 +1060,7 @@ def convert(arr):
arr = cls._from_sequence(arr, dtype=dtype, copy=False)
else:
arr = maybe_cast_to_datetime(arr, dtype)

return arr

arrays = [convert(arr) for arr in content]
Expand Down