-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add IntervalDtype support to IntervalIndex.astype #19231
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,8 @@ | |
is_scalar, | ||
is_float, | ||
is_number, | ||
is_integer) | ||
is_integer, | ||
pandas_dtype) | ||
from pandas.core.indexes.base import ( | ||
Index, _ensure_index, | ||
default_pprint, _index_shared_docs) | ||
|
@@ -699,8 +700,16 @@ def copy(self, deep=False, name=None): | |
|
||
@Appender(_index_shared_docs['astype']) | ||
def astype(self, dtype, copy=True): | ||
if is_interval_dtype(dtype): | ||
return self.copy() if copy else self | ||
dtype = pandas_dtype(dtype) | ||
if is_interval_dtype(dtype) and dtype != self.dtype: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. you could do: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this needs to be a bit more specific than I could make the "different subtype" part more explicit with There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see, this is OK then. These are guaranteed to be IntervalIndex (II) types and thus they are comparable |
||
try: | ||
new_left = self.left.astype(dtype.subtype) | ||
new_right = self.right.astype(dtype.subtype) | ||
except TypeError: | ||
msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' | ||
'incompatible') | ||
raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) | ||
return self._shallow_copy(new_left, new_right) | ||
return super(IntervalIndex, self).astype(dtype, copy=copy) | ||
|
||
@cache_readonly | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
from __future__ import division | ||
|
||
import pytest | ||
import numpy as np | ||
from pandas import ( | ||
Index, | ||
IntervalIndex, | ||
interval_range, | ||
CategoricalIndex, | ||
Timestamp, | ||
Timedelta, | ||
NaT) | ||
from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype | ||
import pandas.util.testing as tm | ||
|
||
|
||
class Base(object): | ||
"""Tests common to IntervalIndex with any subtype""" | ||
|
||
def test_astype_idempotent(self, index): | ||
result = index.astype('interval') | ||
tm.assert_index_equal(result, index) | ||
|
||
result = index.astype(index.dtype) | ||
tm.assert_index_equal(result, index) | ||
|
||
def test_astype_object(self, index): | ||
result = index.astype(object) | ||
expected = Index(index.values, dtype='object') | ||
tm.assert_index_equal(result, expected) | ||
assert not result.equals(index) | ||
|
||
def test_astype_category(self, index): | ||
result = index.astype('category') | ||
expected = CategoricalIndex(index.values) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = index.astype(CategoricalDtype()) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# non-default params | ||
categories = index.dropna().unique().values[:-1] | ||
dtype = CategoricalDtype(categories=categories, ordered=True) | ||
result = index.astype(dtype) | ||
expected = CategoricalIndex( | ||
index.values, categories=categories, ordered=True) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('dtype', [ | ||
'int64', 'uint64', 'float64', 'complex128', 'period[M]', | ||
'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]', | ||
'datetime64[ns, US/Eastern]']) | ||
def test_astype_cannot_cast(self, index, dtype): | ||
msg = 'Cannot cast IntervalIndex to dtype' | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) | ||
|
||
def test_astype_invalid_dtype(self, index): | ||
msg = 'data type "fake_dtype" not understood' | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype('fake_dtype') | ||
|
||
|
||
class TestIntSubtype(Base): | ||
"""Tests specific to IntervalIndex with integer-like subtype""" | ||
|
||
indexes = [ | ||
IntervalIndex.from_breaks(np.arange(-10, 11, dtype='int64')), | ||
IntervalIndex.from_breaks( | ||
np.arange(100, dtype='uint64'), closed='left'), | ||
] | ||
|
||
@pytest.fixture(params=indexes) | ||
def index(self, request): | ||
return request.param | ||
|
||
@pytest.mark.parametrize('subtype', [ | ||
'float64', 'datetime64[ns]', 'timedelta64[ns]']) | ||
def test_subtype_conversion(self, index, subtype): | ||
dtype = IntervalDtype(subtype) | ||
result = index.astype(dtype) | ||
expected = IntervalIndex.from_arrays(index.left.astype(subtype), | ||
index.right.astype(subtype), | ||
closed=index.closed) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('subtype_start, subtype_end', [ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you add some tests where we fail on int/uint sub-types that we don't support e.g. (int8,16,32) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. do we have these for construction as well (tests I mean)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There aren't any construction related tests for unsupported int/uint. This also doesn't fail with how I've implemented In [2]: pd.Index([1, 2, 3], dtype='int16')
Out[2]: Int64Index([1, 2, 3], dtype='int64')
In [3]: pd.Int64Index([1, 2, 3]).astype('int8')
Out[3]: Int64Index([1, 2, 3], dtype='int64')
In [4]: pd.Index(np.arange(5, dtype='int16'))
Out[4]: Int64Index([0, 1, 2, 3, 4], dtype='int64')
In [5]: pd.Int64Index([1, 2, 3]).astype('uint8')
Out[5]: UInt64Index([1, 2, 3], dtype='uint64') Seems like this would need to fail at the level shown above for it to fail on There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, these are all valid (as this is how indexes are constructed); maybe just add some tests to confirm for II |
||
('int64', 'uint64'), ('uint64', 'int64')]) | ||
def test_subtype_integer(self, subtype_start, subtype_end): | ||
index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) | ||
dtype = IntervalDtype(subtype_end) | ||
result = index.astype(dtype) | ||
expected = IntervalIndex.from_arrays(index.left.astype(subtype_end), | ||
index.right.astype(subtype_end), | ||
closed=index.closed) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.xfail(reason='GH 15832') | ||
def test_subtype_integer_errors(self): | ||
# int64 -> uint64 fails with negative values | ||
index = interval_range(-10, 10) | ||
dtype = IntervalDtype('uint64') | ||
with pytest.raises(ValueError): | ||
index.astype(dtype) | ||
|
||
|
||
class TestFloatSubtype(Base): | ||
"""Tests specific to IntervalIndex with float subtype""" | ||
|
||
indexes = [ | ||
interval_range(-10.0, 10.0, closed='neither'), | ||
IntervalIndex.from_arrays([-1.5, np.nan, 0., 0., 1.5], | ||
[-0.5, np.nan, 1., 1., 3.], | ||
closed='both'), | ||
] | ||
|
||
@pytest.fixture(params=indexes) | ||
def index(self, request): | ||
return request.param | ||
|
||
@pytest.mark.parametrize('subtype', ['int64', 'uint64']) | ||
def test_subtype_integer(self, subtype): | ||
index = interval_range(0.0, 10.0) | ||
dtype = IntervalDtype(subtype) | ||
result = index.astype(dtype) | ||
expected = IntervalIndex.from_arrays(index.left.astype(subtype), | ||
index.right.astype(subtype), | ||
closed=index.closed) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# raises with NA | ||
msg = 'Cannot convert NA to integer' | ||
with tm.assert_raises_regex(ValueError, msg): | ||
index.insert(0, np.nan).astype(dtype) | ||
|
||
@pytest.mark.xfail(reason='GH 15832') | ||
def test_subtype_integer_errors(self): | ||
# float64 -> uint64 fails with negative values | ||
index = interval_range(-10.0, 10.0) | ||
dtype = IntervalDtype('uint64') | ||
with pytest.raises(ValueError): | ||
index.astype(dtype) | ||
|
||
# float64 -> integer-like fails with non-integer valued floats | ||
index = interval_range(0.0, 10.0, freq=0.25) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. numpy actually allows things like this (IOW truncation of floats). Are you failing this directly? I agree that it should fail though, it's just a can of worms. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is being xfailed with reason 'GH 15832'. The current behavior within this PR is truncation of floats, identical to the existing In [2]: ii = pd.interval_range(0.25, 1.5, freq=0.25)
In [3]: ii
Out[3]:
IntervalIndex([(0.25, 0.5], (0.5, 0.75], (0.75, 1.0], (1.0, 1.25], (1.25, 1.5]]
closed='right',
dtype='interval[float64]')
In [4]: ii.astype('interval[int64]')
Out[4]:
IntervalIndex([(0, 0], (0, 0], (0, 1], (1, 1], (1, 1]]
closed='right',
dtype='interval[int64]')
In [5]: pd.Float64Index([1.1, 2.2, 3.3]).astype('int64')
Out[5]: Int64Index([1, 2, 3], dtype='int64') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok that's fine |
||
dtype = IntervalDtype('int64') | ||
with pytest.raises(ValueError): | ||
index.astype(dtype) | ||
|
||
dtype = IntervalDtype('uint64') | ||
with pytest.raises(ValueError): | ||
index.astype(dtype) | ||
|
||
@pytest.mark.parametrize('subtype', ['datetime64[ns]', 'timedelta64[ns]']) | ||
def test_subtype_datetimelike(self, index, subtype): | ||
dtype = IntervalDtype(subtype) | ||
msg = 'Cannot convert .* to .*; subtypes are incompatible' | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) | ||
|
||
|
||
class TestDatetimelikeSubtype(Base): | ||
"""Tests specific to IntervalIndex with datetime-like subtype""" | ||
|
||
indexes = [ | ||
interval_range(Timestamp('2018-01-01'), periods=10, closed='neither'), | ||
interval_range(Timestamp('2018-01-01'), periods=10).insert(2, NaT), | ||
interval_range(Timestamp('2018-01-01', tz='US/Eastern'), periods=10), | ||
interval_range(Timedelta('0 days'), periods=10, closed='both'), | ||
interval_range(Timedelta('0 days'), periods=10).insert(2, NaT), | ||
] | ||
|
||
@pytest.fixture(params=indexes) | ||
def index(self, request): | ||
return request.param | ||
|
||
@pytest.mark.parametrize('subtype', ['int64', 'uint64']) | ||
def test_subtype_integer(self, index, subtype): | ||
dtype = IntervalDtype(subtype) | ||
result = index.astype(dtype) | ||
expected = IntervalIndex.from_arrays(index.left.astype(subtype), | ||
index.right.astype(subtype), | ||
closed=index.closed) | ||
tm.assert_index_equal(result, expected) | ||
|
||
def test_subtype_float(self, index): | ||
dtype = IntervalDtype('float64') | ||
msg = 'Cannot convert .* to .*; subtypes are incompatible' | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) | ||
|
||
def test_subtype_datetimelike(self): | ||
# datetime -> timedelta raises | ||
dtype = IntervalDtype('timedelta64[ns]') | ||
msg = 'Cannot convert .* to .*; subtypes are incompatible' | ||
|
||
index = interval_range(Timestamp('2018-01-01'), periods=10) | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) | ||
|
||
index = interval_range(Timestamp('2018-01-01', tz='CET'), periods=10) | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) | ||
|
||
# timedelta -> datetime raises | ||
dtype = IntervalDtype('datetime64[ns]') | ||
index = interval_range(Timedelta('0 days'), periods=10) | ||
with tm.assert_raises_regex(TypeError, msg): | ||
index.astype(dtype) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change was necessary since some subtypes don't nicely compare:
Though interestingly enough the reverse comparison was fine:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this is because NumPy dtypes don't handle comparisons against pandas dtypes.