From 7924d6c7cfb669597bfc0729b714f8b427258423 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 May 2021 13:35:30 -0700 Subject: [PATCH 1/2] BUG: lib.infer_dtype with incompatible intervals --- pandas/_libs/lib.pyx | 54 +++++++++++++++++++++++++-- pandas/core/construction.py | 7 +--- pandas/core/indexes/base.py | 8 +--- pandas/tests/dtypes/test_inference.py | 41 +++++++++++++++++++- 4 files changed, 93 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d7e15bb2ad197..6f3f69e541f9f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2035,10 +2035,58 @@ cdef class IntervalValidator(Validator): cpdef bint is_interval_array(ndarray values): + """ + Is this an ndarray of Interval (or np.nan) with a single dtype? + """ + cdef: - IntervalValidator validator = IntervalValidator(len(values), - skipna=True) - return validator.validate(values) + Py_ssize_t i, n = len(values) + str closed = None + bint numeric = False + bint dt64 = False + bint td64 = False + object val + + if len(values) == 0: + return False + + for val in values: + if is_interval(val): + if closed is None: + closed = val.closed + numeric = ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ) + td64 = is_timedelta(val.left) + dt64 = PyDateTime_Check(val.left) + elif val.closed != closed: + # mismatched closedness + return False + elif numeric: + if not ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ): + # i.e. 
datetime64 or timedelta64 + return False + elif td64: + if not is_timedelta(val.left): + return False + elif dt64: + if not PyDateTime_Check(val.left): + return False + else: + raise ValueError(val) + elif util.is_nan(val) or val is None: + pass + else: + return False + + if closed is None: + # we saw all-NaTs, no actual Intervals + return False + return True @cython.boundscheck(False) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dd9a81a58d36e..e072a3d7c6595 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -322,12 +322,7 @@ def array( # We choose to return an ndarray, rather than raising. pass elif inferred_dtype == "interval": - try: - return IntervalArray(data, copy=copy) - except ValueError: - # We may have a mixture of `closed` here. - # We choose to return an ndarray, rather than raising. - pass + return IntervalArray(data, copy=copy) elif inferred_dtype.startswith("datetime"): # datetime, datetime64 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2a50ebd959ace..199cdc7196bca 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6444,12 +6444,8 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: return data elif inferred == "interval": - try: - ia_data = IntervalArray._from_sequence(subarr, copy=False) - return ia_data - except (ValueError, TypeError): - # GH27172: mixed closed Intervals --> object dtype - pass + ia_data = IntervalArray._from_sequence(subarr, copy=False) + return ia_data elif inferred == "boolean": # don't support boolean explicitly ATM pass diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 09efa97871fae..073a1ff28815b 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1458,17 +1458,54 @@ def test_categorical(self): result = lib.infer_dtype(Series(arr), skipna=True) assert result == "categorical" - def 
test_interval(self): + @pytest.mark.parametrize("asobject", [True, False]) + def test_interval(self, asobject): idx = pd.IntervalIndex.from_breaks(range(5), closed="both") + if asobject: + idx = idx.astype(object) + inferred = lib.infer_dtype(idx, skipna=False) assert inferred == "interval" inferred = lib.infer_dtype(idx._data, skipna=False) assert inferred == "interval" - inferred = lib.infer_dtype(Series(idx), skipna=False) + inferred = lib.infer_dtype(Series(idx, dtype=idx.dtype), skipna=False) assert inferred == "interval" + @pytest.mark.parametrize("value", [Timestamp(0), Timedelta(0), 0, 0.0]) + def test_interval_mismatched_closed(self, value): + + first = Interval(value, value, closed="left") + second = Interval(value, value, closed="right") + + # if closed match, we should infer "interval" + arr = np.array([first, first], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "interval" + + # if closed don't match, we should _not_ get "interval" + arr2 = np.array([first, second], dtype=object) + assert lib.infer_dtype(arr2, skipna=False) == "mixed" + + def test_interval_mismatched_subtype(self): + first = Interval(0, 1, closed="left") + second = Interval(Timestamp(0), Timestamp(1), closed="left") + third = Interval(Timedelta(0), Timedelta(1), closed="left") + + arr = np.array([first, second]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([second, third]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([first, third]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + # float vs int subdtypes are compatible + flt_interval = Interval(1.5, 2.5, closed="left") + arr = np.array([first, flt_interval], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "interval" + @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) From 1d031f1c6aa1eac59fc55ef0d532571fc1702d90 Mon Sep 
17 00:00:00 2001 From: Brock Date: Mon, 31 May 2021 22:35:31 -0700 Subject: [PATCH 2/2] update per comments --- pandas/_libs/lib.pyx | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6f3f69e541f9f..ba82a7840c4f9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2029,11 +2029,6 @@ cdef bint is_period_array(ndarray[object] values): return True -cdef class IntervalValidator(Validator): - cdef inline bint is_value_typed(self, object value) except -1: - return is_interval(value) - - cpdef bint is_interval_array(ndarray values): """ Is this an ndarray of Interval (or np.nan) with a single dtype? @@ -2084,7 +2079,7 @@ cpdef bint is_interval_array(ndarray values): return False if closed is None: - # we saw all-NaTs, no actual Intervals + # we saw all-NAs, no actual Intervals return False return True