-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: use sanitize_array in Index.__new__ #49718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
253d2ef
REF: Index.__new__ use sanitize_array
jbrockmendel a99c52c
REF: _wrapped_sanitize
jbrockmendel d1129f6
re-use wrapped_sanitize
jbrockmendel d33da48
cln
jbrockmendel af07989
REF: share
jbrockmendel 3b4e72e
avoid extra copy
jbrockmendel f3610f3
Merge branch 'main' into ref-index_new
jbrockmendel 98c0020
troubleshoot CI
jbrockmendel bb4f1db
pylint fixup
jbrockmendel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,7 +81,6 @@ | |
find_common_type, | ||
infer_dtype_from, | ||
maybe_cast_pointwise_result, | ||
maybe_infer_to_datetimelike, | ||
np_can_hold_element, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
|
@@ -116,7 +115,6 @@ | |
DatetimeTZDtype, | ||
ExtensionDtype, | ||
IntervalDtype, | ||
PandasDtype, | ||
PeriodDtype, | ||
) | ||
from pandas.core.dtypes.generic import ( | ||
|
@@ -208,6 +206,22 @@ | |
_dtype_obj = np.dtype("object") | ||
|
||
|
||
def _wrapped_sanitize(cls, data, dtype: DtypeObj | None, copy: bool):
    """
    Run ``sanitize_array`` on ``data`` and adapt its errors for Index use.

    ``sanitize_array`` is invoked with no index (``None``), the given
    ``dtype`` and ``copy``, and ``strict_ints=True``.  A ``ValueError``
    raised by that call is translated based on its message:

    * a message containing "index must be specified when data is not
      list-like" is replaced by whatever ``cls._raise_scalar_data_error``
      produces for ``data``;
    * a message containing "Data must be 1-dimensional" is replaced by a
      ``ValueError`` reading "Index data must be 1-dimensional";
    * any other ``ValueError`` propagates unchanged.

    The sanitized array is passed through
    ``ensure_wrapped_if_datetimelike`` before being returned.
    """
    try:
        sanitized = sanitize_array(
            data, None, dtype=dtype, copy=copy, strict_ints=True
        )
    except ValueError as err:
        # The messages are worded for Series; match on them so the caller
        # sees an Index-specific error instead.
        message = str(err)
        if "index must be specified when data is not list-like" in message:
            raise cls._raise_scalar_data_error(data) from err
        if "Data must be 1-dimensional" in message:
            raise ValueError("Index data must be 1-dimensional") from err
        raise
    return ensure_wrapped_if_datetimelike(sanitized)
|
||
|
||
def _maybe_return_indexers(meth: F) -> F: | ||
""" | ||
Decorator to simplify 'return_indexers' checks in Index.join. | ||
|
@@ -422,21 +436,13 @@ def __new__( | |
tupleize_cols: bool = True, | ||
) -> Index: | ||
|
||
from pandas.core.arrays import PandasArray | ||
from pandas.core.indexes.range import RangeIndex | ||
|
||
name = maybe_extract_name(name, data, cls) | ||
|
||
if dtype is not None: | ||
dtype = pandas_dtype(dtype) | ||
|
||
if type(data) is PandasArray: | ||
# ensure users don't accidentally put a PandasArray in an index, | ||
# but don't unpack StringArray | ||
data = data.to_numpy() | ||
if isinstance(dtype, PandasDtype): | ||
dtype = dtype.numpy_dtype | ||
|
||
data_dtype = getattr(data, "dtype", None) | ||
|
||
# range | ||
|
@@ -448,28 +454,10 @@ def __new__( | |
|
||
elif is_ea_or_datetimelike_dtype(dtype): | ||
# non-EA dtype indexes have special casting logic, so we punt here | ||
klass = cls._dtype_to_subclass(dtype) | ||
if klass is not Index: | ||
return klass(data, dtype=dtype, copy=copy, name=name) | ||
|
||
ea_cls = dtype.construct_array_type() | ||
data = ea_cls._from_sequence(data, dtype=dtype, copy=copy) | ||
return Index._simple_new(data, name=name) | ||
pass | ||
|
||
elif is_ea_or_datetimelike_dtype(data_dtype): | ||
data_dtype = cast(DtypeObj, data_dtype) | ||
klass = cls._dtype_to_subclass(data_dtype) | ||
if klass is not Index: | ||
result = klass(data, copy=copy, name=name) | ||
if dtype is not None: | ||
return result.astype(dtype, copy=False) | ||
return result | ||
elif dtype is not None: | ||
# GH#45206 | ||
data = data.astype(dtype, copy=False) | ||
|
||
data = extract_array(data, extract_numpy=True) | ||
return Index._simple_new(data, name=name) | ||
pass | ||
|
||
# index-like | ||
elif ( | ||
|
@@ -483,42 +471,25 @@ def __new__( | |
if isinstance(data, ABCMultiIndex): | ||
data = data._values | ||
|
||
if dtype is not None: | ||
# we need to avoid having numpy coerce | ||
if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]: | ||
# GH#11836 we need to avoid having numpy coerce | ||
# things that look like ints/floats to ints unless | ||
# they are actually ints, e.g. '0' and 0.0 | ||
# should not be coerced | ||
# GH 11836 | ||
data = sanitize_array(data, None, dtype=dtype, copy=copy) | ||
|
||
dtype = data.dtype | ||
|
||
if data.dtype.kind in ["i", "u", "f"]: | ||
# maybe coerce to a sub-class | ||
arr = data | ||
elif data.dtype.kind in ["b", "c"]: | ||
# No special subclass, and Index._ensure_array won't do this | ||
# for us. | ||
arr = np.asarray(data) | ||
else: | ||
arr = com.asarray_tuplesafe(data, dtype=_dtype_obj) | ||
|
||
if dtype is None: | ||
arr = maybe_infer_to_datetimelike(arr) | ||
arr = ensure_wrapped_if_datetimelike(arr) | ||
dtype = arr.dtype | ||
|
||
klass = cls._dtype_to_subclass(arr.dtype) | ||
arr = klass._ensure_array(arr, dtype, copy) | ||
return klass._simple_new(arr, name) | ||
data = com.asarray_tuplesafe(data, dtype=_dtype_obj) | ||
|
||
elif is_scalar(data): | ||
raise cls._raise_scalar_data_error(data) | ||
elif hasattr(data, "__array__"): | ||
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name) | ||
elif not is_list_like(data) and not isinstance(data, memoryview): | ||
# 2022-11-16 the memoryview check is only necessary on some CI | ||
# builds, not clear why | ||
raise cls._raise_scalar_data_error(data) | ||
|
||
else: | ||
|
||
if tupleize_cols and is_list_like(data): | ||
if tupleize_cols: | ||
# GH21470: convert iterable to list before determining if empty | ||
if is_iterator(data): | ||
data = list(data) | ||
|
@@ -531,12 +502,24 @@ def __new__( | |
return MultiIndex.from_tuples(data, names=name) | ||
# other iterable of some kind | ||
|
||
subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj) | ||
if dtype is None: | ||
# with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated | ||
subarr = _maybe_cast_data_without_dtype(subarr) | ||
dtype = subarr.dtype | ||
return Index(subarr, dtype=dtype, copy=copy, name=name) | ||
if not isinstance(data, (list, tuple)): | ||
# we allow set/frozenset, which Series/sanitize_array does not, so | ||
# cast to list here | ||
data = list(data) | ||
if len(data) == 0: | ||
# unlike Series, we default to object dtype: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would we want to align this in the future? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have no strong opinion on this. |
||
data = np.array(data, dtype=object) | ||
|
||
if len(data) and isinstance(data[0], tuple): | ||
# Ensure we get 1-D array of tuples instead of 2D array. | ||
data = com.asarray_tuplesafe(data, dtype=_dtype_obj) | ||
|
||
arr = _wrapped_sanitize(cls, data, dtype, copy) | ||
klass = cls._dtype_to_subclass(arr.dtype) | ||
|
||
# _ensure_array _may_ be unnecessary once Int64Index etc are gone | ||
arr = klass._ensure_array(arr, arr.dtype, copy=False) | ||
return klass._simple_new(arr, name) | ||
|
||
@classmethod | ||
def _ensure_array(cls, data, dtype, copy: bool): | ||
|
@@ -7056,32 +7039,6 @@ def maybe_extract_name(name, obj, cls) -> Hashable: | |
return name | ||
|
||
|
||
def _maybe_cast_data_without_dtype(subarr: npt.NDArray[np.object_]) -> ArrayLike:
    """
    Try to infer a supported dtype for array-like input given without a dtype.

    The input is handed to ``lib.maybe_convert_objects`` with datetime,
    timedelta, period and interval conversion all enabled, using
    ``datetime64[ns]`` as the dtype when every value is NaT.  The converted
    result is then passed through ``ensure_wrapped_if_datetimelike``.

    Parameters
    ----------
    subarr : np.ndarray[object]

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    converted = lib.maybe_convert_objects(
        subarr,
        convert_datetime=True,
        convert_timedelta=True,
        convert_period=True,
        convert_interval=True,
        dtype_if_all_nat=np.dtype("datetime64[ns]"),
    )
    return ensure_wrapped_if_datetimelike(converted)
|
||
|
||
def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: | ||
""" | ||
Return common name if all indices agree, otherwise None (level-by-level). | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So ideally we wouldn't need this in the future once there's more alignment between Series and Index?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The strict_ints keyword we could get rid of after #49609. The rest of this wrapping is mostly about giving Index-specific exception messages, so it is pretty benign.
(Actually, at the time I refactored this function out I was calling it 3ish times within Index.__new__. Now that it's only called once, we could inline it.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inlining can happen in a follow up