Skip to content

Commit 8020bf1

Browse files
authored
REF: use sanitize_array in Index.__new__ (#49718)
* REF: Index.__new__ use sanitize_array
* REF: _wrapped_sanitize
* re-use wrapped_sanitize
* cln
* REF: share
* avoid extra copy
* troubleshoot CI
* pylint fixup
1 parent 68e2c2a commit 8020bf1

File tree

5 files changed

+55
-89
lines changed

5 files changed

+55
-89
lines changed

pandas/core/construction.py

+7
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ def sanitize_array(
499499
copy: bool = False,
500500
*,
501501
allow_2d: bool = False,
502+
strict_ints: bool = False,
502503
) -> ArrayLike:
503504
"""
504505
Sanitize input data to an ndarray or ExtensionArray, copy if specified,
@@ -512,6 +513,8 @@ def sanitize_array(
512513
copy : bool, default False
513514
allow_2d : bool, default False
514515
If False, raise if we have a 2D Arraylike.
516+
strict_ints : bool, default False
517+
If False, silently ignore failures to cast float data to int dtype.
515518
516519
Returns
517520
-------
@@ -581,6 +584,8 @@ def sanitize_array(
581584
# DataFrame would call np.array(data, dtype=dtype, copy=copy),
582585
# which would cast to the integer dtype even if the cast is lossy.
583586
# See GH#40110.
587+
if strict_ints:
588+
raise
584589

585590
# We ignore the dtype arg and return floating values,
586591
# e.g. test_constructor_floating_data_int_dtype
@@ -624,6 +629,8 @@ def sanitize_array(
624629
subarr = _try_cast(data, dtype, copy)
625630
except ValueError:
626631
if is_integer_dtype(dtype):
632+
if strict_ints:
633+
raise
627634
casted = np.array(data, copy=False)
628635
if casted.dtype.kind == "f":
629636
# GH#40110 match the behavior we have if we passed

pandas/core/indexes/base.py

+45-88
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@
8181
find_common_type,
8282
infer_dtype_from,
8383
maybe_cast_pointwise_result,
84-
maybe_infer_to_datetimelike,
8584
np_can_hold_element,
8685
)
8786
from pandas.core.dtypes.common import (
@@ -116,7 +115,6 @@
116115
DatetimeTZDtype,
117116
ExtensionDtype,
118117
IntervalDtype,
119-
PandasDtype,
120118
PeriodDtype,
121119
)
122120
from pandas.core.dtypes.generic import (
@@ -208,6 +206,22 @@
208206
_dtype_obj = np.dtype("object")
209207

210208

209+
def _wrapped_sanitize(cls, data, dtype: DtypeObj | None, copy: bool):
210+
"""
211+
Call sanitize_array with wrapping for differences between Index/Series.
212+
"""
213+
try:
214+
arr = sanitize_array(data, None, dtype=dtype, copy=copy, strict_ints=True)
215+
except ValueError as err:
216+
if "index must be specified when data is not list-like" in str(err):
217+
raise cls._raise_scalar_data_error(data) from err
218+
if "Data must be 1-dimensional" in str(err):
219+
raise ValueError("Index data must be 1-dimensional") from err
220+
raise
221+
arr = ensure_wrapped_if_datetimelike(arr)
222+
return arr
223+
224+
211225
def _maybe_return_indexers(meth: F) -> F:
212226
"""
213227
Decorator to simplify 'return_indexers' checks in Index.join.
@@ -422,21 +436,13 @@ def __new__(
422436
tupleize_cols: bool = True,
423437
) -> Index:
424438

425-
from pandas.core.arrays import PandasArray
426439
from pandas.core.indexes.range import RangeIndex
427440

428441
name = maybe_extract_name(name, data, cls)
429442

430443
if dtype is not None:
431444
dtype = pandas_dtype(dtype)
432445

433-
if type(data) is PandasArray:
434-
# ensure users don't accidentally put a PandasArray in an index,
435-
# but don't unpack StringArray
436-
data = data.to_numpy()
437-
if isinstance(dtype, PandasDtype):
438-
dtype = dtype.numpy_dtype
439-
440446
data_dtype = getattr(data, "dtype", None)
441447

442448
# range
@@ -448,28 +454,10 @@ def __new__(
448454

449455
elif is_ea_or_datetimelike_dtype(dtype):
450456
# non-EA dtype indexes have special casting logic, so we punt here
451-
klass = cls._dtype_to_subclass(dtype)
452-
if klass is not Index:
453-
return klass(data, dtype=dtype, copy=copy, name=name)
454-
455-
ea_cls = dtype.construct_array_type()
456-
data = ea_cls._from_sequence(data, dtype=dtype, copy=copy)
457-
return Index._simple_new(data, name=name)
457+
pass
458458

459459
elif is_ea_or_datetimelike_dtype(data_dtype):
460-
data_dtype = cast(DtypeObj, data_dtype)
461-
klass = cls._dtype_to_subclass(data_dtype)
462-
if klass is not Index:
463-
result = klass(data, copy=copy, name=name)
464-
if dtype is not None:
465-
return result.astype(dtype, copy=False)
466-
return result
467-
elif dtype is not None:
468-
# GH#45206
469-
data = data.astype(dtype, copy=False)
470-
471-
data = extract_array(data, extract_numpy=True)
472-
return Index._simple_new(data, name=name)
460+
pass
473461

474462
# index-like
475463
elif (
@@ -483,42 +471,25 @@ def __new__(
483471
if isinstance(data, ABCMultiIndex):
484472
data = data._values
485473

486-
if dtype is not None:
487-
# we need to avoid having numpy coerce
474+
if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]:
475+
# GH#11836 we need to avoid having numpy coerce
488476
# things that look like ints/floats to ints unless
489477
# they are actually ints, e.g. '0' and 0.0
490478
# should not be coerced
491-
# GH 11836
492-
data = sanitize_array(data, None, dtype=dtype, copy=copy)
493-
494-
dtype = data.dtype
495-
496-
if data.dtype.kind in ["i", "u", "f"]:
497-
# maybe coerce to a sub-class
498-
arr = data
499-
elif data.dtype.kind in ["b", "c"]:
500-
# No special subclass, and Index._ensure_array won't do this
501-
# for us.
502-
arr = np.asarray(data)
503-
else:
504-
arr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
505-
506-
if dtype is None:
507-
arr = maybe_infer_to_datetimelike(arr)
508-
arr = ensure_wrapped_if_datetimelike(arr)
509-
dtype = arr.dtype
510-
511-
klass = cls._dtype_to_subclass(arr.dtype)
512-
arr = klass._ensure_array(arr, dtype, copy)
513-
return klass._simple_new(arr, name)
479+
data = com.asarray_tuplesafe(data, dtype=_dtype_obj)
514480

515481
elif is_scalar(data):
516482
raise cls._raise_scalar_data_error(data)
517483
elif hasattr(data, "__array__"):
518484
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name)
485+
elif not is_list_like(data) and not isinstance(data, memoryview):
486+
# 2022-11-16 the memoryview check is only necessary on some CI
487+
# builds, not clear why
488+
raise cls._raise_scalar_data_error(data)
489+
519490
else:
520491

521-
if tupleize_cols and is_list_like(data):
492+
if tupleize_cols:
522493
# GH21470: convert iterable to list before determining if empty
523494
if is_iterator(data):
524495
data = list(data)
@@ -531,12 +502,24 @@ def __new__(
531502
return MultiIndex.from_tuples(data, names=name)
532503
# other iterable of some kind
533504

534-
subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
535-
if dtype is None:
536-
# with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated
537-
subarr = _maybe_cast_data_without_dtype(subarr)
538-
dtype = subarr.dtype
539-
return Index(subarr, dtype=dtype, copy=copy, name=name)
505+
if not isinstance(data, (list, tuple)):
506+
# we allow set/frozenset, which Series/sanitize_array does not, so
507+
# cast to list here
508+
data = list(data)
509+
if len(data) == 0:
510+
# unlike Series, we default to object dtype:
511+
data = np.array(data, dtype=object)
512+
513+
if len(data) and isinstance(data[0], tuple):
514+
# Ensure we get 1-D array of tuples instead of 2D array.
515+
data = com.asarray_tuplesafe(data, dtype=_dtype_obj)
516+
517+
arr = _wrapped_sanitize(cls, data, dtype, copy)
518+
klass = cls._dtype_to_subclass(arr.dtype)
519+
520+
# _ensure_array _may_ be unnecessary once Int64Index etc are gone
521+
arr = klass._ensure_array(arr, arr.dtype, copy=False)
522+
return klass._simple_new(arr, name)
540523

541524
@classmethod
542525
def _ensure_array(cls, data, dtype, copy: bool):
@@ -7048,32 +7031,6 @@ def maybe_extract_name(name, obj, cls) -> Hashable:
70487031
return name
70497032

70507033

7051-
def _maybe_cast_data_without_dtype(subarr: npt.NDArray[np.object_]) -> ArrayLike:
7052-
"""
7053-
If we have an arraylike input but no passed dtype, try to infer
7054-
a supported dtype.
7055-
7056-
Parameters
7057-
----------
7058-
subarr : np.ndarray[object]
7059-
7060-
Returns
7061-
-------
7062-
np.ndarray or ExtensionArray
7063-
"""
7064-
7065-
result = lib.maybe_convert_objects(
7066-
subarr,
7067-
convert_datetime=True,
7068-
convert_timedelta=True,
7069-
convert_period=True,
7070-
convert_interval=True,
7071-
dtype_if_all_nat=np.dtype("datetime64[ns]"),
7072-
)
7073-
result = ensure_wrapped_if_datetimelike(result)
7074-
return result
7075-
7076-
70777034
def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
70787035
"""
70797036
Return common name if all indices agree, otherwise None (level-by-level).

pandas/tests/indexes/datetimes/test_constructors.py

+1
Original file line numberDiff line numberDiff line change
@@ -912,6 +912,7 @@ def test_constructor_no_precision_raises(self):
912912
with pytest.raises(ValueError, match=msg):
913913
DatetimeIndex(["2000"], dtype="datetime64")
914914

915+
msg = "The 'datetime64' dtype has no unit. Please pass in"
915916
with pytest.raises(ValueError, match=msg):
916917
Index(["2000"], dtype="datetime64")
917918

pandas/tests/indexes/interval/test_constructors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ def test_constructor_errors(self, klass):
393393

394394
# scalar
395395
msg = (
396-
r"IntervalIndex\(...\) must be called with a collection of "
396+
r"(IntervalIndex|Index)\(...\) must be called with a collection of "
397397
"some kind, 5 was passed"
398398
)
399399
with pytest.raises(TypeError, match=msg):

pandas/tests/indexes/timedeltas/test_constructors.py

+1
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ def test_constructor_no_precision_raises(self):
246246
with pytest.raises(ValueError, match=msg):
247247
TimedeltaIndex(["2000"], dtype="timedelta64")
248248

249+
msg = "The 'timedelta64' dtype has no unit. Please pass in"
249250
with pytest.raises(ValueError, match=msg):
250251
pd.Index(["2000"], dtype="timedelta64")
251252

0 commit comments

Comments
 (0)