Skip to content

API: Prohibit non-numeric dtypes in IntervalIndex #19022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ Other API Changes
- Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'`` (:issue:`18808`)
- Operations between a :class:`Series` with ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raise ``TypeError`` (:issue:`18850`)
- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`)
- :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`)
- The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`)

.. _whatsnew_0230.deprecations:
Expand Down Expand Up @@ -279,11 +280,11 @@ Performance Improvements
Documentation Changes
~~~~~~~~~~~~~~~~~~~~~

- Changed spelling of "numpy" to "NumPy", and "python" to "Python". (:issue:`19017`)
- Changed spelling of "numpy" to "NumPy", and "python" to "Python". (:issue:`19017`)
- Consistency when introducing code samples, using either colon or period.
Rewrote some sentences for greater clarity, added more dynamic references
to functions, methods and classes.
(:issue:`18941`, :issue:`18948`, :issue:`18973`, :issue:`19017`)
(:issue:`18941`, :issue:`18948`, :issue:`18973`, :issue:`19017`)
-

.. _whatsnew_0230.bug_fixes:
Expand All @@ -310,7 +311,7 @@ Conversion
- Bug in :class:`DatetimeIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`)
- Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`)
- Bug in :class:`FY5253Quarter`, :class:`LastWeekOfMonth` where rollback and rollforward behavior was inconsistent with addition and subtraction behavior (:issue:`18854`)
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`)


Indexing
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,8 @@ def __new__(cls, subtype=None):
----------
subtype : the dtype of the Interval
"""
from pandas.core.dtypes.common import (
is_categorical_dtype, is_string_dtype, pandas_dtype)

if isinstance(subtype, IntervalDtype):
return subtype
Expand All @@ -659,7 +661,6 @@ def __new__(cls, subtype=None):
if m is not None:
subtype = m.group('subtype')

from pandas.core.dtypes.common import pandas_dtype
try:
subtype = pandas_dtype(subtype)
except TypeError:
Expand All @@ -670,6 +671,12 @@ def __new__(cls, subtype=None):
u.subtype = None
return u

if is_categorical_dtype(subtype) or is_string_dtype(subtype):
# GH 19016
msg = ('category, object, and string subtypes are not supported '
'for IntervalDtype')
raise TypeError(msg)

try:
return cls._cache[str(subtype)]
except KeyError:
Expand Down
41 changes: 36 additions & 5 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
is_list_like,
is_datetime_or_timedelta_dtype,
is_datetime64tz_dtype,
is_categorical_dtype,
is_string_dtype,
is_integer_dtype,
is_float_dtype,
is_interval_dtype,
Expand Down Expand Up @@ -92,6 +94,30 @@ def _get_interval_closed_bounds(interval):
return left, right


def maybe_convert_platform_interval(values):
    """
    Try to do platform conversion, with special casing for IntervalIndex.

    Wrapper around maybe_convert_platform that alters the default return
    dtype in certain cases to be compatible with IntervalIndex. For example,
    empty lists return with integer dtype instead of object dtype, which is
    prohibited for IntervalIndex.

    Parameters
    ----------
    values : array-like

    Returns
    -------
    array
    """
    if isinstance(values, (list, tuple)) and not values:
        # GH 19016
        # empty lists/tuples get object dtype by default, but object dtype
        # is prohibited for IntervalIndex, so coerce to integer instead
        return np.array([], dtype=np.intp)
    return maybe_convert_platform(values)


def _new_IntervalIndex(cls, d):
"""
This is called upon unpickling, rather than the default which doesn't have
Expand Down Expand Up @@ -206,7 +232,7 @@ def __new__(cls, data, closed=None,
if is_scalar(data):
cls._scalar_data_error(data)

data = maybe_convert_platform(data)
data = maybe_convert_platform_interval(data)
left, right, infer_closed = intervals_to_interval_bounds(data)

if _all_not_none(closed, infer_closed) and closed != infer_closed:
Expand Down Expand Up @@ -242,6 +268,11 @@ def _simple_new(cls, left, right, closed=None, name=None,
'[{rtype}] types')
raise ValueError(msg.format(ltype=type(left).__name__,
rtype=type(right).__name__))
elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
# GH 19016
msg = ('category, object, and string subtypes are not supported '
'for IntervalIndex')
raise TypeError(msg)
elif isinstance(left, ABCPeriodIndex):
msg = 'Period dtypes are not supported, use a PeriodIndex instead'
raise ValueError(msg)
Expand Down Expand Up @@ -403,7 +434,7 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False):
IntervalIndex.from_tuples : Construct an IntervalIndex from a
list/array of tuples
"""
breaks = maybe_convert_platform(breaks)
breaks = maybe_convert_platform_interval(breaks)

return cls.from_arrays(breaks[:-1], breaks[1:], closed,
name=name, copy=copy)
Expand Down Expand Up @@ -444,8 +475,8 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False):
IntervalIndex.from_tuples : Construct an IntervalIndex from a
list/array of tuples
"""
left = maybe_convert_platform(left)
right = maybe_convert_platform(right)
left = maybe_convert_platform_interval(left)
right = maybe_convert_platform_interval(right)

return cls._simple_new(left, right, closed, name=name,
copy=copy, verify_integrity=True)
Expand Down Expand Up @@ -493,7 +524,7 @@ def from_intervals(cls, data, name=None, copy=False):
left, right, closed = data.left, data.right, data.closed
name = name or data.name
else:
data = maybe_convert_platform(data)
data = maybe_convert_platform_interval(data)
left, right, closed = intervals_to_interval_bounds(data)
return cls.from_arrays(left, right, closed, name=name, copy=False)

Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def test_update_dtype(self, dtype, new_dtype):
assert result.ordered is expected_ordered

@pytest.mark.parametrize('bad_dtype', [
'foo', object, np.int64, PeriodDtype('Q'), IntervalDtype(object)])
'foo', object, np.int64, PeriodDtype('Q')])
def test_update_dtype_errors(self, bad_dtype):
dtype = CategoricalDtype(list('abc'), False)
msg = 'a CategoricalDtype must be passed to perform an update, '
Expand Down Expand Up @@ -460,6 +460,17 @@ def test_construction(self):
assert i.subtype == np.dtype('int64')
assert is_interval_dtype(i)

@pytest.mark.parametrize('subtype', [
    CategoricalDtype(list('abc'), False),
    CategoricalDtype(list('wxyz'), True),
    object,
    str,
    '<U10',
    'interval[category]',
    'interval[object]',
])
def test_construction_not_supported(self, subtype):
    """Check IntervalDtype raises TypeError for each subtype (GH 19016)."""
    expected_msg = ('category, object, and string subtypes are not supported '
                    'for IntervalDtype')
    with tm.assert_raises_regex(TypeError, expected_msg):
        IntervalDtype(subtype)

def test_construction_generic(self):
# generic
i = IntervalDtype('interval')
Expand Down
74 changes: 46 additions & 28 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from pandas import (
Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe your test for .length should change? IIRC you were catching the exception

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.length had two tests: test_length and test_length_errors. I've removed test_length_errors in full, since we should be prohibiting dtypes that would cause exceptions. I've left test_length unchanged since it was only testing valid dtypes.

Timedelta, date_range, timedelta_range)
Timedelta, date_range, timedelta_range, Categorical)
from pandas.compat import lzip
from pandas.core.common import _asarray_tuplesafe
from pandas.tests.indexes.common import Base
Expand Down Expand Up @@ -42,7 +42,6 @@ def create_index_with_nan(self, closed='right'):

@pytest.mark.parametrize('data', [
Index([0, 1, 2, 3, 4]),
Index(list('abcde')),
date_range('2017-01-01', periods=5),
date_range('2017-01-01', periods=5, tz='US/Eastern'),
timedelta_range('1 day', periods=5)])
Expand Down Expand Up @@ -138,10 +137,10 @@ def test_constructors_nan(self, closed, data):
[],
np.array([], dtype='int64'),
np.array([], dtype='float64'),
np.array([], dtype=object)])
np.array([], dtype='datetime64[ns]')])
def test_constructors_empty(self, data, closed):
# GH 18421
expected_dtype = data.dtype if isinstance(data, np.ndarray) else object
expected_dtype = getattr(data, 'dtype', np.intp)
expected_values = np.array([], dtype=object)
expected_index = IntervalIndex(data, closed=closed)

Expand Down Expand Up @@ -223,6 +222,48 @@ def test_constructors_errors(self):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1))

# GH 19016: categorical data
data = Categorical(list('01234abcde'), ordered=True)
msg = ('category, object, and string subtypes are not supported '
'for IntervalIndex')

with tm.assert_raises_regex(TypeError, msg):
IntervalIndex.from_breaks(data)

with tm.assert_raises_regex(TypeError, msg):
IntervalIndex.from_arrays(data[:-1], data[1:])

@pytest.mark.parametrize('data', [
    tuple('0123456789'),
    list('abcdefghij'),
    np.array(list('abcdefghij'), dtype=object),
    np.array(list('abcdefghij'), dtype='<U1'),
])
def test_constructors_errors_string(self, data):
    """Check every constructor raises TypeError on string data (GH 19016)."""
    lefts = data[:-1]
    rights = data[1:]
    pairs = lzip(lefts, rights)
    intervals = [Interval(lo, hi) for lo, hi in pairs] or data
    expected_msg = ('category, object, and string subtypes are not supported '
                    'for IntervalIndex')

    # (constructor, positional args), exercised in a fixed order
    attempts = [
        (IntervalIndex, (intervals,)),
        (Index, (intervals,)),
        (IntervalIndex.from_intervals, (intervals,)),
        (IntervalIndex.from_breaks, (data,)),
        (IntervalIndex.from_arrays, (lefts, rights)),
        (IntervalIndex.from_tuples, (pairs,)),
    ]
    for constructor, args in attempts:
        with tm.assert_raises_regex(TypeError, expected_msg):
            constructor(*args)

@pytest.mark.parametrize('tz_left, tz_right', [
(None, 'UTC'), ('UTC', None), ('UTC', 'US/Eastern')])
def test_constructors_errors_tz(self, tz_left, tz_right):
Expand Down Expand Up @@ -298,18 +339,6 @@ def test_length(self, closed, breaks):
expected = Index(iv.length if notna(iv) else iv for iv in index)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('breaks', [
list('abcdefgh'),
lzip(range(10), range(1, 11)),
[['A', 'B'], ['a', 'b'], ['c', 'd'], ['e', 'f']],
[Interval(0, 1), Interval(1, 2), Interval(3, 4), Interval(4, 5)]])
def test_length_errors(self, closed, breaks):
# GH 18789
index = IntervalIndex.from_breaks(breaks)
msg = 'IntervalIndex contains Intervals without defined length'
with tm.assert_raises_regex(TypeError, msg):
index.length

def test_with_nans(self, closed):
index = self.create_index(closed=closed)
assert not index.hasnans
Expand Down Expand Up @@ -428,9 +457,7 @@ def test_delete(self, closed):
interval_range(0, periods=10, closed='neither'),
interval_range(1.7, periods=8, freq=2.5, closed='both'),
interval_range(Timestamp('20170101'), periods=12, closed='left'),
interval_range(Timedelta('1 day'), periods=6, closed='right'),
IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]),
IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])])
interval_range(Timedelta('1 day'), periods=6, closed='right')])
def test_insert(self, data):
item = data[0]
idx_item = IntervalIndex([item])
Expand Down Expand Up @@ -504,15 +531,6 @@ def test_unique(self, closed):
[(0, 1), (0, 1), (2, 3)], closed=closed)
assert not idx.is_unique

# unique mixed
idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')], closed=closed)
assert idx.is_unique

# duplicate mixed
idx = IntervalIndex.from_tuples(
[(0, 1), ('a', 'b'), (0, 1)], closed=closed)
assert not idx.is_unique

# empty
idx = IntervalIndex([], closed=closed)
assert idx.is_unique
Expand Down