Skip to content

BUG: lib.infer_dtype with incompatible intervals #41749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 51 additions & 8 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2029,16 +2029,59 @@ cdef bint is_period_array(ndarray[object] values):
return True


cdef class IntervalValidator(Validator):
cdef inline bint is_value_typed(self, object value) except -1:
return is_interval(value)


cpdef bint is_interval_array(ndarray values):
"""
Is this an ndarray of Interval (or np.nan) with a single dtype?
"""

cdef:
IntervalValidator validator = IntervalValidator(len(values),
skipna=True)
return validator.validate(values)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you remove the IntervalValidator code?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, will add that to the upcoming misc branch

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk

Py_ssize_t i, n = len(values)
str closed = None
bint numeric = False
bint dt64 = False
bint td64 = False
object val

if len(values) == 0:
return False

for val in values:
if is_interval(val):
if closed is None:
closed = val.closed
numeric = (
util.is_float_object(val.left)
or util.is_integer_object(val.left)
)
td64 = is_timedelta(val.left)
dt64 = PyDateTime_Check(val.left)
elif val.closed != closed:
# mismatched closedness
return False
elif numeric:
if not (
util.is_float_object(val.left)
or util.is_integer_object(val.left)
):
# i.e. datetime64 or timedelta64
return False
elif td64:
if not is_timedelta(val.left):
return False
elif dt64:
if not PyDateTime_Check(val.left):
return False
else:
raise ValueError(val)
elif util.is_nan(val) or val is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couldn't this be a float NaN, NA, or None? I am not sure we really specify which missing values are allowed here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This matches what we do in the corresponding libinterval function.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

got it
OK, I think we should open a new issue about this (e.g. letting the missing value be NA, or NaT if it's datetimelike), as this is a bit inconsistent with what we allow for missing values in other dtypes.

but out of scope for this PR

pass
else:
return False

if closed is None:
# we saw all-NAs, no actual Intervals
return False
return True


@cython.boundscheck(False)
Expand Down
7 changes: 1 addition & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,7 @@ def array(
return PeriodArray._from_sequence(data, copy=copy)

elif inferred_dtype == "interval":
try:
return IntervalArray(data, copy=copy)
except ValueError:
# We may have a mixture of `closed` here.
# We choose to return an ndarray, rather than raising.
pass
return IntervalArray(data, copy=copy)

elif inferred_dtype.startswith("datetime"):
# datetime, datetime64
Expand Down
8 changes: 2 additions & 6 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6441,12 +6441,8 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike:
return data

elif inferred == "interval":
try:
ia_data = IntervalArray._from_sequence(subarr, copy=False)
return ia_data
except (ValueError, TypeError):
# GH27172: mixed closed Intervals --> object dtype
pass
ia_data = IntervalArray._from_sequence(subarr, copy=False)
return ia_data
elif inferred == "boolean":
# don't support boolean explicitly ATM
pass
Expand Down
41 changes: 39 additions & 2 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,17 +1458,54 @@ def test_categorical(self):
result = lib.infer_dtype(Series(arr), skipna=True)
assert result == "categorical"

def test_interval(self):
@pytest.mark.parametrize("asobject", [True, False])
def test_interval(self, asobject):
idx = pd.IntervalIndex.from_breaks(range(5), closed="both")
if asobject:
idx = idx.astype(object)

inferred = lib.infer_dtype(idx, skipna=False)
assert inferred == "interval"

inferred = lib.infer_dtype(idx._data, skipna=False)
assert inferred == "interval"

inferred = lib.infer_dtype(Series(idx), skipna=False)
inferred = lib.infer_dtype(Series(idx, dtype=idx.dtype), skipna=False)
assert inferred == "interval"

@pytest.mark.parametrize("value", [Timestamp(0), Timedelta(0), 0, 0.0])
def test_interval_mismatched_closed(self, value):

first = Interval(value, value, closed="left")
second = Interval(value, value, closed="right")

# if closed match, we should infer "interval"
arr = np.array([first, first], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == "interval"

# if closed doesn't match, we should _not_ get "interval"
arr2 = np.array([first, second], dtype=object)
assert lib.infer_dtype(arr2, skipna=False) == "mixed"

def test_interval_mismatched_subtype(self):
first = Interval(0, 1, closed="left")
second = Interval(Timestamp(0), Timestamp(1), closed="left")
third = Interval(Timedelta(0), Timedelta(1), closed="left")

arr = np.array([first, second])
assert lib.infer_dtype(arr, skipna=False) == "mixed"

arr = np.array([second, third])
assert lib.infer_dtype(arr, skipna=False) == "mixed"

arr = np.array([first, third])
assert lib.infer_dtype(arr, skipna=False) == "mixed"

# float vs int subdtype are compatible
flt_interval = Interval(1.5, 2.5, closed="left")
arr = np.array([first, flt_interval], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == "interval"

@pytest.mark.parametrize("klass", [pd.array, Series])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
Expand Down