Skip to content

ENH: Add IntervalDtype support to IntervalIndex.astype #19231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ Other Enhancements
- ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
- :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
- ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)

.. _whatsnew_0230.api_breaking:

Expand Down
3 changes: 2 additions & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,8 @@ def __eq__(self, other):
# None should match any subtype
return True
else:
return self.subtype == other.subtype
from pandas.core.dtypes.common import is_dtype_equal
return is_dtype_equal(self.subtype, other.subtype)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change was necessary since some subtypes don't nicely compare:

In [2]: dtype1 = IntervalDtype('float64')

In [3]: dtype2 = IntervalDtype('datetime64[ns, US/Eastern]')

In [4]: dtype1 == dtype2
---------------------------------------------------------------------------
TypeError: data type not understood

Though interestingly enough the reverse comparison was fine:

In [5]: dtype2 == dtype1
Out[5]: False

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes this is because numpy types don't handle comparisons vs pandas dtypes.


@classmethod
def is_dtype(cls, dtype):
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
is_scalar,
is_float,
is_number,
is_integer)
is_integer,
pandas_dtype)
from pandas.core.indexes.base import (
Index, _ensure_index,
default_pprint, _index_shared_docs)
Expand Down Expand Up @@ -699,8 +700,16 @@ def copy(self, deep=False, name=None):

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
if is_interval_dtype(dtype):
return self.copy() if copy else self
dtype = pandas_dtype(dtype)
if is_interval_dtype(dtype) and dtype != self.dtype:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could do: not is_dtype_equal(dtype, self.dtype)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs to be a bit more specific than not is_dtype_equal; what this is really looking for is an IntervalDtype with a different subtype than the existing dtype. Just having not is_dtype_equal would allow non-interval dtypes to pass through as well.

I could make the "different subtype" part more explicit with and not is_dtype_equal(dtype.subtype, self.dtype.subtype). Though this would be a partial dupe of the IntervalDtype.__eq__ logic. Could also just add a comment noting the "different subtype" part, if that would be preferable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, this is OK then. These are guaranteed to be II (IntervalIndex) types, and thus they are comparable.

try:
new_left = self.left.astype(dtype.subtype)
new_right = self.right.astype(dtype.subtype)
except TypeError:
msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
'incompatible')
raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
return self._shallow_copy(new_left, new_right)
return super(IntervalIndex, self).astype(dtype, copy=copy)

@cache_readonly
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,12 @@ def test_equality(self):
assert not is_dtype_equal(IntervalDtype('int64'),
IntervalDtype('float64'))

# invalid subtype comparisons do not raise when directly compared
dtype1 = IntervalDtype('float64')
dtype2 = IntervalDtype('datetime64[ns, US/Eastern]')
assert dtype1 != dtype2
assert dtype2 != dtype1

@pytest.mark.parametrize('subtype', [
None, 'interval', 'Interval', 'int64', 'uint64', 'float64',
'complex128', 'datetime64', 'timedelta64', PeriodDtype('Q')])
Expand Down
209 changes: 209 additions & 0 deletions pandas/tests/indexes/interval/test_astype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
from __future__ import division

import pytest
import numpy as np
from pandas import (
Index,
IntervalIndex,
interval_range,
CategoricalIndex,
Timestamp,
Timedelta,
NaT)
from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype
import pandas.util.testing as tm


class Base(object):
    """Shared ``astype`` checks run against an IntervalIndex of any subtype.

    Each test receives the index under test through an ``index`` argument.
    """

    def test_astype_idempotent(self, index):
        # Casting to the generic 'interval' alias or to the index's own
        # dtype must hand back an index equal to the original.
        for target in ('interval', index.dtype):
            tm.assert_index_equal(index.astype(target), index)

    def test_astype_object(self, index):
        as_object = index.astype(object)
        tm.assert_index_equal(as_object, Index(index.values, dtype='object'))
        # The object-dtype result must not compare ``equals`` to the source.
        assert not as_object.equals(index)

    def test_astype_category(self, index):
        # The string alias and a default CategoricalDtype share one
        # expected result.
        default_expected = CategoricalIndex(index.values)
        for target in ('category', CategoricalDtype()):
            tm.assert_index_equal(index.astype(target), default_expected)

        # non-default params: every unique non-NA value except the last
        # becomes a category, and the categorical is ordered
        cats = index.dropna().unique().values[:-1]
        custom_dtype = CategoricalDtype(categories=cats, ordered=True)
        custom_expected = CategoricalIndex(
            index.values, categories=cats, ordered=True)
        tm.assert_index_equal(index.astype(custom_dtype), custom_expected)

    @pytest.mark.parametrize('dtype', [
        'int64', 'uint64', 'float64', 'complex128', 'period[M]',
        'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]',
        'datetime64[ns, US/Eastern]'])
    def test_astype_cannot_cast(self, index, dtype):
        # Each of these target dtypes must be rejected with a TypeError.
        expected_msg = 'Cannot cast IntervalIndex to dtype'
        with tm.assert_raises_regex(TypeError, expected_msg):
            index.astype(dtype)

    def test_astype_invalid_dtype(self, index):
        # An unrecognised dtype string raises a TypeError as well.
        expected_msg = 'data type "fake_dtype" not understood'
        with tm.assert_raises_regex(TypeError, expected_msg):
            index.astype('fake_dtype')


class TestIntSubtype(Base):
"""Tests specific to IntervalIndex with integer-like subtype"""

indexes = [
IntervalIndex.from_breaks(np.arange(-10, 11, dtype='int64')),
IntervalIndex.from_breaks(
np.arange(100, dtype='uint64'), closed='left'),
]

@pytest.fixture(params=indexes)
def index(self, request):
return request.param

@pytest.mark.parametrize('subtype', [
'float64', 'datetime64[ns]', 'timedelta64[ns]'])
def test_subtype_conversion(self, index, subtype):
dtype = IntervalDtype(subtype)
result = index.astype(dtype)
expected = IntervalIndex.from_arrays(index.left.astype(subtype),
index.right.astype(subtype),
closed=index.closed)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('subtype_start, subtype_end', [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add some tests where we fail on int/uint sub-types that we don't support, e.g. int8/16/32?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have these for construction as well (tests I mean)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There aren't any construction related tests for unsupported int/uint. This also doesn't fail with how I've implemented IntervalIndex.astype in this PR, since it relies on the behavior of the underlying endpoint indexes. I can't find a way in which those raise when passed an unsupported int/uint, as they always appear to upcast to int64:

In [2]: pd.Index([1, 2, 3], dtype='int16')
Out[2]: Int64Index([1, 2, 3], dtype='int64')

In [3]: pd.Int64Index([1, 2, 3]).astype('int8')
Out[3]: Int64Index([1, 2, 3], dtype='int64')

In [4]: pd.Index(np.arange(5, dtype='int16'))
Out[4]: Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [5]: pd.Int64Index([1, 2, 3]).astype('uint8')
Out[5]: UInt64Index([1, 2, 3], dtype='uint64')

Seems like this would need to fail at the level shown above for it to fail on IntervalIndex? If so, probably best done in a separate PR, since my initial guess is that it'd require a bit of work/changes. I could write xfailing tests here though. Or am I misinterpreting what you mean?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, these are all valid (this is how indexes are constructed); maybe just add some tests to confirm the same behavior for II (IntervalIndex).

('int64', 'uint64'), ('uint64', 'int64')])
def test_subtype_integer(self, subtype_start, subtype_end):
index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start))
dtype = IntervalDtype(subtype_end)
result = index.astype(dtype)
expected = IntervalIndex.from_arrays(index.left.astype(subtype_end),
index.right.astype(subtype_end),
closed=index.closed)
tm.assert_index_equal(result, expected)

@pytest.mark.xfail(reason='GH 15832')
def test_subtype_integer_errors(self):
# int64 -> uint64 fails with negative values
index = interval_range(-10, 10)
dtype = IntervalDtype('uint64')
with pytest.raises(ValueError):
index.astype(dtype)


class TestFloatSubtype(Base):
"""Tests specific to IntervalIndex with float subtype"""

indexes = [
interval_range(-10.0, 10.0, closed='neither'),
IntervalIndex.from_arrays([-1.5, np.nan, 0., 0., 1.5],
[-0.5, np.nan, 1., 1., 3.],
closed='both'),
]

@pytest.fixture(params=indexes)
def index(self, request):
return request.param

@pytest.mark.parametrize('subtype', ['int64', 'uint64'])
def test_subtype_integer(self, subtype):
index = interval_range(0.0, 10.0)
dtype = IntervalDtype(subtype)
result = index.astype(dtype)
expected = IntervalIndex.from_arrays(index.left.astype(subtype),
index.right.astype(subtype),
closed=index.closed)
tm.assert_index_equal(result, expected)

# raises with NA
msg = 'Cannot convert NA to integer'
with tm.assert_raises_regex(ValueError, msg):
index.insert(0, np.nan).astype(dtype)

@pytest.mark.xfail(reason='GH 15832')
def test_subtype_integer_errors(self):
# float64 -> uint64 fails with negative values
index = interval_range(-10.0, 10.0)
dtype = IntervalDtype('uint64')
with pytest.raises(ValueError):
index.astype(dtype)

# float64 -> integer-like fails with non-integer valued floats
index = interval_range(0.0, 10.0, freq=0.25)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numpy actually allows things like this (IOW truncation of floats). Are you failing this directly?

I agree that it should fail though, it's just a can of worms.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is being xfailed with @pytest.mark.xfail(reason='GH 15832'). I think fixing #15832 should resolve this: the changes should flow through to IntervalIndex as well, since it relies on Float64Index to hold the endpoints.

The current behavior within this PR is truncation of floats, identical to the existing Float64Index behavior:

In [2]: ii = pd.interval_range(0.25, 1.5, freq=0.25)

In [3]: ii
Out[3]:
IntervalIndex([(0.25, 0.5], (0.5, 0.75], (0.75, 1.0], (1.0, 1.25], (1.25, 1.5]]
              closed='right',
              dtype='interval[float64]')

In [4]: ii.astype('interval[int64]')
Out[4]:
IntervalIndex([(0, 0], (0, 0], (0, 1], (1, 1], (1, 1]]
              closed='right',
              dtype='interval[int64]')

In [5]: pd.Float64Index([1.1, 2.2, 3.3]).astype('int64')
Out[5]: Int64Index([1, 2, 3], dtype='int64')

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that's fine

dtype = IntervalDtype('int64')
with pytest.raises(ValueError):
index.astype(dtype)

dtype = IntervalDtype('uint64')
with pytest.raises(ValueError):
index.astype(dtype)

@pytest.mark.parametrize('subtype', ['datetime64[ns]', 'timedelta64[ns]'])
def test_subtype_datetimelike(self, index, subtype):
dtype = IntervalDtype(subtype)
msg = 'Cannot convert .* to .*; subtypes are incompatible'
with tm.assert_raises_regex(TypeError, msg):
index.astype(dtype)


class TestDatetimelikeSubtype(Base):
    """Tests specific to IntervalIndex with datetime-like subtype"""

    # Fixture parameters: naive and tz-aware Timestamp intervals plus
    # Timedelta intervals, two of them with a NaT entry inserted.
    indexes = [
        interval_range(Timestamp('2018-01-01'), periods=10, closed='neither'),
        interval_range(Timestamp('2018-01-01'), periods=10).insert(2, NaT),
        interval_range(Timestamp('2018-01-01', tz='US/Eastern'), periods=10),
        interval_range(Timedelta('0 days'), periods=10, closed='both'),
        interval_range(Timedelta('0 days'), periods=10).insert(2, NaT),
    ]

    @pytest.fixture(params=indexes)
    def index(self, request):
        return request.param

    @pytest.mark.parametrize('subtype', ['int64', 'uint64'])
    def test_subtype_integer(self, index, subtype):
        # The expected index is built by converting each endpoint index
        # on its own and reassembling with the same ``closed`` side.
        left = index.left.astype(subtype)
        right = index.right.astype(subtype)
        expected = IntervalIndex.from_arrays(left, right, closed=index.closed)
        converted = index.astype(IntervalDtype(subtype))
        tm.assert_index_equal(converted, expected)

    def test_subtype_float(self, index):
        float_dtype = IntervalDtype('float64')
        expected_msg = 'Cannot convert .* to .*; subtypes are incompatible'
        with tm.assert_raises_regex(TypeError, expected_msg):
            index.astype(float_dtype)

    def test_subtype_datetimelike(self):
        expected_msg = 'Cannot convert .* to .*; subtypes are incompatible'
        cases = [
            # datetime -> timedelta raises
            (interval_range(Timestamp('2018-01-01'), periods=10),
             'timedelta64[ns]'),
            (interval_range(Timestamp('2018-01-01', tz='CET'), periods=10),
             'timedelta64[ns]'),
            # timedelta -> datetime raises
            (interval_range(Timedelta('0 days'), periods=10),
             'datetime64[ns]'),
        ]
        for source, subtype in cases:
            target = IntervalDtype(subtype)
            with tm.assert_raises_regex(TypeError, expected_msg):
                source.astype(target)
20 changes: 0 additions & 20 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,26 +415,6 @@ def test_equals(self, closed):
np.arange(5), closed=other_closed)
assert not expected.equals(expected_other_closed)

def test_astype(self, closed):
idx = self.create_index(closed=closed)
result = idx.astype(object)
tm.assert_index_equal(result, Index(idx.values, dtype='object'))
assert not idx.equals(result)
assert idx.equals(IntervalIndex.from_intervals(result))

result = idx.astype('interval')
tm.assert_index_equal(result, idx)
assert result.equals(idx)

@pytest.mark.parametrize('dtype', [
np.int64, np.float64, 'period[M]', 'timedelta64', 'datetime64[ns]',
'datetime64[ns, US/Eastern]'])
def test_astype_errors(self, closed, dtype):
idx = self.create_index(closed=closed)
msg = 'Cannot cast IntervalIndex to dtype'
with tm.assert_raises_regex(TypeError, msg):
idx.astype(dtype)

@pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series])
def test_where(self, closed, klass):
idx = self.create_index(closed=closed)
Expand Down