From 253d2ef9fb2c8534427142c2687ff3b0c7872a5b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Nov 2022 08:00:46 -0800 Subject: [PATCH 1/8] REF: Index.__new__ use sanitize_array --- pandas/core/indexes/base.py | 81 +++++++++---------- .../indexes/datetimes/test_constructors.py | 1 + .../indexes/interval/test_constructors.py | 2 +- .../indexes/timedeltas/test_constructors.py | 1 + 4 files changed, 40 insertions(+), 45 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 068ff7a0bf1c9..c21b53f8f84e9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -448,28 +448,23 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - klass = cls._dtype_to_subclass(dtype) - if klass is not Index: - return klass(data, dtype=dtype, copy=copy, name=name) - - ea_cls = dtype.construct_array_type() - data = ea_cls._from_sequence(data, dtype=dtype, copy=copy) - return Index._simple_new(data, name=name) + try: + arr = sanitize_array(data, None, dtype=dtype, copy=copy) + except ValueError as err: + if "index must be specified when data is not list-like" in str(err): + raise cls._raise_scalar_data_error(data) from err + elif "Data must be 1-dimensional" in str(err): + raise ValueError("Index data must be 1-dimensional") from err + raise + arr = ensure_wrapped_if_datetimelike(arr) + klass = cls._dtype_to_subclass(arr.dtype) + return klass._simple_new(arr, name=name) elif is_ea_or_datetimelike_dtype(data_dtype): - data_dtype = cast(DtypeObj, data_dtype) - klass = cls._dtype_to_subclass(data_dtype) - if klass is not Index: - result = klass(data, copy=copy, name=name) - if dtype is not None: - return result.astype(dtype, copy=False) - return result - elif dtype is not None: - # GH#45206 - data = data.astype(dtype, copy=False) - - data = extract_array(data, extract_numpy=True) - return Index._simple_new(data, name=name) + arr = sanitize_array(data, None, dtype=dtype, copy=copy) + arr = ensure_wrapped_if_datetimelike(arr) + klass = cls._dtype_to_subclass(arr.dtype) + return klass._simple_new(arr, name=name) # index-like elif ( @@ -483,42 +478,40 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if dtype is not None: - # we need to avoid having numpy coerce - # things that look like ints/floats to ints unless - # they are actually ints, e.g. '0' and 0.0 - # should not be coerced - # GH 11836 + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]: + data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + try: data = sanitize_array(data, None, dtype=dtype, copy=copy) + except ValueError as err: + if "index must be specified when data is not list-like" in str(err): + raise cls._raise_scalar_data_error(data) from err + elif "Data must be 1-dimensional" in str(err): + raise ValueError("Index data must be 1-dimensional") from err + raise - dtype = data.dtype - - if data.dtype.kind in ["i", "u", "f"]: - # maybe coerce to a sub-class - arr = data - elif data.dtype.kind in ["b", "c"]: - # No special subclass, and Index._ensure_array won't do this - # for us. - arr = np.asarray(data) - else: - arr = com.asarray_tuplesafe(data, dtype=_dtype_obj) - - if dtype is None: - arr = maybe_infer_to_datetimelike(arr) - arr = ensure_wrapped_if_datetimelike(arr) - dtype = arr.dtype + arr = ensure_wrapped_if_datetimelike(data) klass = cls._dtype_to_subclass(arr.dtype) - arr = klass._ensure_array(arr, dtype, copy) + + # _ensure_array _may_ be unnecessary once Int64Index etc are gone + arr = klass._ensure_array(arr, data.dtype, copy) return klass._simple_new(arr, name) elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name) + elif not is_list_like(data): + raise cls._raise_scalar_data_error(data) + else: - if tupleize_cols and is_list_like(data): + if tupleize_cols: # GH21470: convert iterable to list before determining if empty if is_iterator(data): data = list(data) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 4aaa2b694102d..d67bc8a132c0c 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -912,6 +912,7 @@ def test_constructor_no_precision_raises(self): with pytest.raises(ValueError, match=msg): DatetimeIndex(["2000"], dtype="datetime64") + msg = "The 'datetime64' dtype has no unit. Please pass in" with pytest.raises(ValueError, match=msg): Index(["2000"], dtype="datetime64") diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index f8c6042c5007d..ce0de97befec3 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -393,7 +393,7 @@ def test_constructor_errors(self, klass): # scalar msg = ( - r"IntervalIndex\(...\) must be called with a collection of " + r"(IntervalIndex|Index)\(...\) must be called with a collection of " "some kind, 5 was passed" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 1447e9080313f..5c23d1dfd83c8 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -246,6 +246,7 @@ def test_constructor_no_precision_raises(self): with pytest.raises(ValueError, match=msg): TimedeltaIndex(["2000"], dtype="timedelta64") + msg = "The 'timedelta64' dtype has no unit. Please pass in" with pytest.raises(ValueError, match=msg): pd.Index(["2000"], dtype="timedelta64") From a99c52cffec2dea00c4b50534396b67a8d866c18 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 10:06:47 -0800 Subject: [PATCH 2/8] REF: _wrapped_sanitize --- pandas/core/indexes/base.py | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c21b53f8f84e9..b72bc50740c42 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -81,7 +81,6 @@ find_common_type, infer_dtype_from, maybe_cast_pointwise_result, - maybe_infer_to_datetimelike, np_can_hold_element, ) from pandas.core.dtypes.common import ( @@ -208,6 +207,22 @@ _dtype_obj = np.dtype("object") +def _wrapped_sanitize(cls, data, dtype: DtypeObj | None, copy: bool): + """ + Call sanitize_array with wrapping for differences between Index/Series. + """ + try: + arr = sanitize_array(data, None, dtype=dtype, copy=copy) + except ValueError as err: + if "index must be specified when data is not list-like" in str(err): + raise cls._raise_scalar_data_error(data) from err + elif "Data must be 1-dimensional" in str(err): + raise ValueError("Index data must be 1-dimensional") from err + raise + arr = ensure_wrapped_if_datetimelike(arr) + return arr + + def _maybe_return_indexers(meth: F) -> F: """ Decorator to simplify 'return_indexers' checks in Index.join. @@ -448,21 +463,12 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - try: - arr = sanitize_array(data, None, dtype=dtype, copy=copy) - except ValueError as err: - if "index must be specified when data is not list-like" in str(err): - raise cls._raise_scalar_data_error(data) from err - elif "Data must be 1-dimensional" in str(err): - raise ValueError("Index data must be 1-dimensional") from err - raise - arr = ensure_wrapped_if_datetimelike(arr) + arr = _wrapped_sanitize(cls, data, dtype, copy) klass = cls._dtype_to_subclass(arr.dtype) return klass._simple_new(arr, name=name) elif is_ea_or_datetimelike_dtype(data_dtype): - arr = sanitize_array(data, None, dtype=dtype, copy=copy) - arr = ensure_wrapped_if_datetimelike(arr) + arr = _wrapped_sanitize(cls, data, dtype, copy) klass = cls._dtype_to_subclass(arr.dtype) return klass._simple_new(arr, name=name) @@ -485,21 +491,12 @@ def __new__( # GH 11836 if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]: data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - try: - data = sanitize_array(data, None, dtype=dtype, copy=copy) - except ValueError as err: - if "index must be specified when data is not list-like" in str(err): - raise cls._raise_scalar_data_error(data) from err - elif "Data must be 1-dimensional" in str(err): - raise ValueError("Index data must be 1-dimensional") from err - raise - - arr = ensure_wrapped_if_datetimelike(data) + arr = _wrapped_sanitize(cls, data, dtype, copy) klass = cls._dtype_to_subclass(arr.dtype) # _ensure_array _may_ be unnecessary once Int64Index etc are gone - arr = klass._ensure_array(arr, data.dtype, copy) + arr = klass._ensure_array(arr, arr.dtype, copy) return klass._simple_new(arr, name) elif is_scalar(data): From d1129f6127ebc5549f484ae203e6f46658c34229 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 12:56:29 -0800 Subject: [PATCH 3/8] re-use wrapped_sanitize --- pandas/core/construction.py | 7 ++++++ pandas/core/indexes/base.py | 45 +++++++++++-------------------------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d40c334ab1840..28af62bd9a0d7 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -499,6 +499,7 @@ def sanitize_array( copy: bool = False, *, allow_2d: bool = False, + strict_ints: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, @@ -512,6 +513,8 @@ def sanitize_array( copy : bool, default False allow_2d : bool, default False If False, raise if we have a 2D Arraylike. + strict_ints : bool, default False + If False, silently ignore failures to cast float data to int dtype. Returns ------- @@ -581,6 +584,8 @@ def sanitize_array( # DataFrame would call np.array(data, dtype=dtype, copy=copy), # which would cast to the integer dtype even if the cast is lossy. # See GH#40110. + if strict_ints: + raise # We ignore the dtype arg and return floating values, # e.g. test_constructor_floating_data_int_dtype @@ -624,6 +629,8 @@ def sanitize_array( subarr = _try_cast(data, dtype, copy) except ValueError: if is_integer_dtype(dtype): + if strict_ints: + raise casted = np.array(data, copy=False) if casted.dtype.kind == "f": # GH#40110 match the behavior we have if we passed diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b72bc50740c42..01f27ed7e5150 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -212,7 +212,7 @@ def _wrapped_sanitize(cls, data, dtype: DtypeObj | None, copy: bool): Call sanitize_array with wrapping for differences between Index/Series. """ try: - arr = sanitize_array(data, None, dtype=dtype, copy=copy) + arr = sanitize_array(data, None, dtype=dtype, copy=copy, strict_ints=True) except ValueError as err: if "index must be specified when data is not list-like" in str(err): raise cls._raise_scalar_data_error(data) from err @@ -521,11 +521,18 @@ def __new__( return MultiIndex.from_tuples(data, names=name) # other iterable of some kind - subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj) - if dtype is None: - # with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated - subarr = _maybe_cast_data_without_dtype(subarr) - dtype = subarr.dtype + # we allow set/frozenset, which Series/sanitize_array does not, so + # cast to list here + data = list(data) + if len(data) == 0: + # unlike Series, we default to object dtype: + data = np.array(data, dtype=object) + + if len(data) and isinstance(data[0], tuple): + # Ensure we get 1-D array of tuples instead of 2D array. + data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + subarr = _wrapped_sanitize(cls, data, dtype, copy) + dtype = subarr.dtype return Index(subarr, dtype=dtype, copy=copy, name=name) @classmethod @@ -7046,32 +7053,6 @@ def maybe_extract_name(name, obj, cls) -> Hashable: return name -def _maybe_cast_data_without_dtype(subarr: npt.NDArray[np.object_]) -> ArrayLike: - """ - If we have an arraylike input but no passed dtype, try to infer - a supported dtype. - - Parameters - ---------- - subarr : np.ndarray[object] - - Returns - ------- - np.ndarray or ExtensionArray - """ - - result = lib.maybe_convert_objects( - subarr, - convert_datetime=True, - convert_timedelta=True, - convert_period=True, - convert_interval=True, - dtype_if_all_nat=np.dtype("datetime64[ns]"), - ) - result = ensure_wrapped_if_datetimelike(result) - return result - - def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: """ Return common name if all indices agree, otherwise None (level-by-level). From d33da480165fc0bcc311e0c9ca23ed9b1f51f304 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 13:05:18 -0800 Subject: [PATCH 4/8] cln --- pandas/core/indexes/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 01f27ed7e5150..430fdba144136 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -484,12 +484,11 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - # we need to avoid having numpy coerce - # things that look like ints/floats to ints unless - # they are actually ints, e.g. '0' and 0.0 - # should not be coerced - # GH 11836 if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]: + # GH#11836 we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) arr = _wrapped_sanitize(cls, data, dtype, copy) @@ -521,9 +520,10 @@ def __new__( return MultiIndex.from_tuples(data, names=name) # other iterable of some kind - # we allow set/frozenset, which Series/sanitize_array does not, so - # cast to list here - data = list(data) + if not isinstance(data, (list, tuple)): + # we allow set/frozenset, which Series/sanitize_array does not, so + # cast to list here + data = list(data) if len(data) == 0: # unlike Series, we default to object dtype: data = np.array(data, dtype=object) From af07989fc6d280d88b83241b81be045211ca6e5f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 16:40:27 -0800 Subject: [PATCH 5/8] REF: share --- pandas/core/indexes/base.py | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 430fdba144136..73ce75309af0e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -115,7 +115,6 @@ DatetimeTZDtype, ExtensionDtype, IntervalDtype, - PandasDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( @@ -437,7 +436,6 @@ def __new__( tupleize_cols: bool = True, ) -> Index: - from pandas.core.arrays import PandasArray from pandas.core.indexes.range import RangeIndex name = maybe_extract_name(name, data, cls) @@ -445,13 +443,6 @@ def __new__( if dtype is not None: dtype = pandas_dtype(dtype) - if type(data) is PandasArray: - # ensure users don't accidentally put a PandasArray in an index, - # but don't unpack StringArray - data = data.to_numpy() - if isinstance(dtype, PandasDtype): - dtype = dtype.numpy_dtype - data_dtype = getattr(data, "dtype", None) # range @@ -463,14 +454,10 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - arr = _wrapped_sanitize(cls, data, dtype, copy) - klass = cls._dtype_to_subclass(arr.dtype) - return klass._simple_new(arr, name=name) + pass elif is_ea_or_datetimelike_dtype(data_dtype): - arr = _wrapped_sanitize(cls, data, dtype, copy) - klass = cls._dtype_to_subclass(arr.dtype) - return klass._simple_new(arr, name=name) + pass # index-like elif ( @@ -490,13 +477,6 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - arr = _wrapped_sanitize(cls, data, dtype, copy) - - klass = cls._dtype_to_subclass(arr.dtype) - - # _ensure_array _may_ be unnecessary once Int64Index etc are gone - arr = klass._ensure_array(arr, arr.dtype, copy) - return klass._simple_new(arr, name) elif is_scalar(data): raise cls._raise_scalar_data_error(data) @@ -531,9 +511,13 @@ def __new__( if len(data) and isinstance(data[0], tuple): # Ensure we get 1-D array of tuples instead of 2D array. data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - subarr = _wrapped_sanitize(cls, data, dtype, copy) - dtype = subarr.dtype - return Index(subarr, dtype=dtype, copy=copy, name=name) + + arr = _wrapped_sanitize(cls, data, dtype, copy) + klass = cls._dtype_to_subclass(arr.dtype) + + # _ensure_array _may_ be unnecessary once Int64Index etc are gone + arr = klass._ensure_array(arr, arr.dtype, copy) + return klass._simple_new(arr, name) @classmethod def _ensure_array(cls, data, dtype, copy: bool): From 3b4e72ec1b0eceb2c85f1d9947b4a789f65bc79a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Nov 2022 16:15:05 -0800 Subject: [PATCH 6/8] avoid extra copy --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73ce75309af0e..5a280c04fb2fe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -516,7 +516,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) # _ensure_array _may_ be unnecessary once Int64Index etc are gone - arr = klass._ensure_array(arr, arr.dtype, copy) + arr = klass._ensure_array(arr, arr.dtype, copy=False) return klass._simple_new(arr, name) @classmethod From 98c0020dd74e1723e85c78164cc5e1e551fb2d30 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Nov 2022 08:58:01 -0800 Subject: [PATCH 7/8] troubleshoot CI --- pandas/core/indexes/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5a280c04fb2fe..c78bdb670dd45 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -482,7 +482,9 @@ def __new__( raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name) - elif not is_list_like(data): + elif not is_list_like(data) and not isinstance(data, memoryview): + # 2022-11-16 the memoryview check is only necessary on some CI + # builds, not clear why raise cls._raise_scalar_data_error(data) else: From bb4f1db2f850e8cd70b84fd047a3e67cfdfb41f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Nov 2022 10:16:05 -0800 Subject: [PATCH 8/8] pylint fixup --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c78bdb670dd45..275d54a2ea800 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -215,7 +215,7 @@ def _wrapped_sanitize(cls, data, dtype: DtypeObj | None, copy: bool): except ValueError as err: if "index must be specified when data is not list-like" in str(err): raise cls._raise_scalar_data_error(data) from err - elif "Data must be 1-dimensional" in str(err): + if "Data must be 1-dimensional" in str(err): raise ValueError("Index data must be 1-dimensional") from err raise arr = ensure_wrapped_if_datetimelike(arr)