From fb63ade2877b92b5321f2c6248791cab0bfb8c34 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Jan 2021 09:53:24 -0800 Subject: [PATCH 1/4] REF: simplify Index.__new__ --- pandas/core/dtypes/common.py | 13 ++++ pandas/core/indexes/base.py | 132 ++++++++++++----------------------- 2 files changed, 59 insertions(+), 86 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5869b2cf22516..46aff11835cec 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1529,6 +1529,19 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None +def is_ea_or_datetimelike_dtype(dtype: Optional[DtypeObj]) -> bool: + """ + Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype. + + Notes + ----- + Checks only for dtype objects, not dtype-castable strings or types. + """ + return isinstance(dtype, ExtensionDtype) or ( + isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"] + ) + + def is_complex_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a complex dtype. 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 17ed42a188b4e..78242b3ac908d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -44,8 +44,8 @@ ensure_platform_int, is_bool_dtype, is_categorical_dtype, - is_datetime64_any_dtype, is_dtype_equal, + is_ea_or_datetimelike_dtype, is_extension_array_dtype, is_float, is_float_dtype, @@ -56,10 +56,8 @@ is_iterator, is_list_like, is_object_dtype, - is_period_dtype, is_scalar, is_signed_integer_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, @@ -69,6 +67,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, + ExtensionDtype, IntervalDtype, PeriodDtype, ) @@ -87,6 +86,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype +from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -290,40 +290,22 @@ def __new__( elif isinstance(data, range): return RangeIndex.from_range(data, dtype=dtype, name=name) - # categorical - elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype): - # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 - from pandas.core.indexes.category import CategoricalIndex - - return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs) - - # interval - elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype): - # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 - from pandas.core.indexes.interval import IntervalIndex - - return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs) - - elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype): - # Delay import for perf. 
https://github.com/pandas-dev/pandas/pull/31423 - from pandas import DatetimeIndex - - return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs) - - elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype): - # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 - from pandas import TimedeltaIndex - - return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs) - - elif is_period_dtype(data_dtype) or is_period_dtype(dtype): - # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 - from pandas import PeriodIndex - - return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs) + if is_ea_or_datetimelike_dtype(dtype): + # non-EA dtype indexes have special casting logic, so we punt here + klass = cls._dtype_to_subclass(dtype) + if klass is not Index: + return klass(data, dtype=dtype, copy=copy, name=name, **kwargs) + + if is_ea_or_datetimelike_dtype(data_dtype): + klass = cls._dtype_to_subclass(data_dtype) + if klass is not Index: + result = klass(data, copy=copy, name=name, **kwargs) + if dtype is not None: + return result.astype(dtype, copy=False) + return result # extension dtype - elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype): + if is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype): if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype ea_cls = dtype.construct_array_type() @@ -407,26 +389,38 @@ def _ensure_array(cls, data, dtype, copy: bool): def _dtype_to_subclass(cls, dtype: DtypeObj): # Delay import for perf. 
https://github.com/pandas-dev/pandas/pull/31423 - if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"): + if isinstance(dtype, ExtensionDtype): + if isinstance(dtype, DatetimeTZDtype): + from pandas import DatetimeIndex + + return DatetimeIndex + elif isinstance(dtype, CategoricalDtype): + from pandas import CategoricalIndex + + return CategoricalIndex + elif isinstance(dtype, IntervalDtype): + from pandas import IntervalIndex + + return IntervalIndex + elif isinstance(dtype, PeriodDtype): + from pandas import PeriodIndex + + return PeriodIndex + + elif isinstance(dtype, SparseDtype): + return cls._dtype_to_subclass(dtype.subtype) + + return Index + + if dtype.kind == "M": from pandas import DatetimeIndex return DatetimeIndex - elif dtype == "m8[ns]": + + elif dtype.kind == "m": from pandas import TimedeltaIndex return TimedeltaIndex - elif isinstance(dtype, CategoricalDtype): - from pandas import CategoricalIndex - - return CategoricalIndex - elif isinstance(dtype, IntervalDtype): - from pandas import IntervalIndex - - return IntervalIndex - elif isinstance(dtype, PeriodDtype): - from pandas import PeriodIndex - - return PeriodIndex elif is_float_dtype(dtype): from pandas import Float64Index @@ -445,6 +439,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): # NB: assuming away MultiIndex return Index + elif issubclass(dtype.type, (str, bool, np.bool_)): + return Index + raise NotImplementedError(dtype) """ @@ -6253,43 +6250,6 @@ def _try_convert_to_int_array( raise ValueError -def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs): - """ - If an object dtype was specified, create the non-object Index - and then convert it to object. - - Parameters - ---------- - dtype : np.dtype, ExtensionDtype, str - klass : Index subclass - data : list-like - copy : bool - name : hashable - **kwargs - - Returns - ------- - Index - - Notes - ----- - We assume that calling .astype(object) on this klass will make a copy. 
- """ - - # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, - # will raise in the where `data` is already tz-aware. So - # we leave it out of this step and cast to object-dtype after - # the DatetimeIndex construction. - - if is_dtype_equal(_o_dtype, dtype): - # Note we can pass copy=False because the .astype below - # will always make a copy - index = klass(data, copy=False, name=name, **kwargs) - return index.astype(object) - - return klass(data, dtype=dtype, copy=copy, name=name, **kwargs) - - def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]: """ Return common name if all indices agree, otherwise None (level-by-level). From e3a55088c6b832dacca8632d6dac0b27a57a925f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Jan 2021 10:33:20 -0800 Subject: [PATCH 2/4] TST: dont silently ignore dtype in Index.__new__ --- pandas/core/indexes/base.py | 10 +++- pandas/tests/indexes/test_index_new.py | 78 ++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 78242b3ac908d..b0c89000a53a9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -286,9 +286,15 @@ def __new__( # range if isinstance(data, RangeIndex): - return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) + result = RangeIndex(start=data, copy=copy, name=name) + if dtype is not None: + return result.astype(dtype, copy=False) + return result elif isinstance(data, range): - return RangeIndex.from_range(data, dtype=dtype, name=name) + result = RangeIndex.from_range(data, name=name) + if dtype is not None: + return result.astype(dtype, copy=False) + return result if is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index c8f580babc0b2..de0850d37034d 100644 --- a/pandas/tests/indexes/test_index_new.py +++ 
b/pandas/tests/indexes/test_index_new.py @@ -8,10 +8,12 @@ from pandas import ( NA, + Categorical, CategoricalIndex, DatetimeIndex, Index, Int64Index, + IntervalIndex, MultiIndex, NaT, PeriodIndex, @@ -19,7 +21,9 @@ TimedeltaIndex, Timestamp, UInt64Index, + date_range, period_range, + timedelta_range, ) import pandas._testing as tm @@ -122,6 +126,80 @@ def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) +class TestDtypeEnforced: + # check we don't silently ignore the dtype keyword + + @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"]) + def test_constructor_range_values_mismatched_dtype(self, dtype): + rng = Index(range(5)) + + result = Index(rng, dtype=dtype) + assert result.dtype == dtype + + result = Index(range(5), dtype=dtype) + assert result.dtype == dtype + + @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"]) + def test_constructor_categorical_values_mismatched_non_ea_dtype(self, dtype): + cat = Categorical([1, 2, 3]) + + result = Index(cat, dtype=dtype) + assert result.dtype == dtype + + def test_constructor_categorical_values_mismatched_dtype(self): + dti = date_range("2016-01-01", periods=3) + cat = Categorical(dti) + result = Index(cat, dti.dtype) + tm.assert_index_equal(result, dti) + + dti2 = dti.tz_localize("Asia/Tokyo") + cat2 = Categorical(dti2) + result = Index(cat2, dti2.dtype) + tm.assert_index_equal(result, dti2) + + ii = IntervalIndex.from_breaks(range(5)) + cat3 = Categorical(ii) + result = Index(cat3, dtype=ii.dtype) + tm.assert_index_equal(result, ii) + + def test_constructor_ea_values_mismatched_categorical_dtype(self): + dti = date_range("2016-01-01", periods=3) + result = Index(dti, dtype="category") + expected = CategoricalIndex(dti) + tm.assert_index_equal(result, expected) + + dti2 = date_range("2016-01-01", periods=3, tz="US/Pacific") + result = Index(dti2, dtype="category") + expected = 
CategoricalIndex(dti2) + tm.assert_index_equal(result, expected) + + def test_constructor_period_values_mismatched_dtype(self): + pi = period_range("2016-01-01", periods=3, freq="D") + result = Index(pi, dtype="category") + expected = CategoricalIndex(pi) + tm.assert_index_equal(result, expected) + + def test_constructor_timedelta64_values_mismatched_dtype(self): + # check we don't silently ignore the dtype keyword + tdi = timedelta_range("4 Days", periods=5) + result = Index(tdi, dtype="category") + expected = CategoricalIndex(tdi) + tm.assert_index_equal(result, expected) + + def test_constructor_interval_values_mismatched_dtype(self): + dti = date_range("2016-01-01", periods=3) + ii = IntervalIndex.from_breaks(dti) + result = Index(ii, dtype="category") + expected = CategoricalIndex(ii) + tm.assert_index_equal(result, expected) + + def test_constructor_datetime64_values_mismatched_period_dtype(self): + dti = date_range("2016-01-01", periods=3) + result = Index(dti, dtype="Period[D]") + expected = dti.to_period("D") + tm.assert_index_equal(result, expected) + + class TestIndexConstructorUnwrapping: # Test passing different arraylike values to pd.Index From 070465809452dc91fdb51c558e70fa49d392bcca Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Jan 2021 10:38:44 -0800 Subject: [PATCH 3/4] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 931ec895cc73f..9a5ef8421cf56 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -309,7 +309,7 @@ ExtensionArray Other ^^^^^ - +- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) - - From 679ef2f06a333d98e02b3cf575247acbb3c056e3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Jan 2021 13:41:32 -0800 Subject: [PATCH 4/4] remove no-longer-correct test --- pandas/tests/indexes/ranges/test_constructors.py | 5 ----- 1
file changed, 5 deletions(-) diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index 7dd893bd16720..f83c885a7850b 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -114,11 +114,6 @@ def test_constructor_range(self): expected = RangeIndex(1, 5, 2) tm.assert_index_equal(result, expected, exact=True) - with pytest.raises( - ValueError, - match="Incorrect `dtype` passed: expected signed integer, received float64", - ): - Index(range(1, 5, 2), dtype="float64") msg = r"^from_range\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): RangeIndex.from_range(range(10), copy=True)