Skip to content

Commit 8226cc3

Browse files
committed
IntervalDtype
1 parent ec0b97b commit 8226cc3

File tree

12 files changed

+329
-55
lines changed

12 files changed

+329
-55
lines changed

doc/source/whatsnew/v0.20.0.txt

+34
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Highlights include:
1111

1212
- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
1313
- The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
14+
- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
1415

1516
Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations <whatsnew_0200.deprecations>` before updating.
1617

@@ -96,6 +97,9 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
9697

9798
.. _whatsnew_0200.enhancements.uint64_support:
9899

100+
Enhanced UInt64 Support
101+
^^^^^^^^^^^^^^^^^^^^^^^
102+
99103
Pandas has significantly improved support for operations involving unsigned,
100104
or purely non-negative, integers. Previously, handling these integers would
101105
result in improper rounding or data-type casting, leading to incorrect results.
@@ -114,6 +118,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
114118
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
115119
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
116120

121+
.. _whatsnew_0200.enhancements.intervalindex:
122+
123+
IntervalIndex
124+
^^^^^^^^^^^^^
125+
126+
pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
127+
notation, specifically as the return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)
128+
129+
**Previous behavior**:
130+
131+
.. code-block:: ipython
132+
133+
In [2]: pd.cut(range(3), 2)
134+
Out[2]:
135+
[(-0.002, 1], (-0.002, 1], (1, 2]]
136+
Categories (2, object): [(-0.002, 1] < (1, 2]]
137+
138+
# the returned categories are strings, representing Intervals
139+
In [3]: pd.cut(range(3), 2).categories
140+
Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')
141+
142+
**New behavior**:
143+
144+
.. ipython:: python
145+
146+
c = pd.cut(range(3), 2)
147+
c
148+
c.categories
149+
pd.api.types.is_interval_dtype(c.categories)
150+
117151
.. _whatsnew_0200.enhancements.other:
118152

119153
Other enhancements

pandas/api/tests/test_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ class TestTypes(Base, tm.TestCase):
156156
'is_string_dtype', 'is_signed_integer_dtype',
157157
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
158158
'is_unsigned_integer_dtype', 'is_period',
159+
'is_interval', 'is_interval_dtype',
159160
'is_period_dtype', 'is_re', 'is_re_compilable',
160161
'is_dict_like', 'is_iterator',
161162
'is_list_like', 'is_hashable',

pandas/core/algorithms.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88

99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
11-
from pandas.types.generic import ABCSeries, ABCIndex, ABCIntervalIndex
11+
from pandas.types.generic import ABCSeries, ABCIndex
1212
from pandas.types.common import (is_unsigned_integer_dtype,
1313
is_signed_integer_dtype,
1414
is_integer_dtype,
1515
is_int64_dtype,
1616
is_categorical_dtype,
1717
is_extension_type,
18+
is_interval_dtype,
1819
is_datetimetz,
1920
is_period_dtype,
2021
is_period_arraylike,
@@ -1253,10 +1254,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12531254
allow_fill=allow_fill)
12541255
elif is_datetimetz(arr):
12551256
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
1256-
elif isinstance(arr, ABCIntervalIndex):
1257-
# TODO: we need to be sure we are taking on an actual IntervalIndex
1258-
# this is 'hacky' until we have a first class dtype
1259-
# ideally will use is_interval_dtype here
1257+
elif is_interval_dtype(arr):
12601258
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
12611259

12621260
if indexer is None:

pandas/indexes/base.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
165165
from .category import CategoricalIndex
166166
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
167167

168+
# interval
169+
if is_interval_dtype(data):
170+
from .interval import IntervalIndex
171+
return IntervalIndex.from_intervals(data, name=name,
172+
copy=copy)
173+
168174
# index-like
169175
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
170176

@@ -3516,10 +3522,8 @@ def _evaluate_compare(self, other):
35163522
if needs_i8_conversion(self) and needs_i8_conversion(other):
35173523
return self._evaluate_compare(other, op)
35183524

3519-
# TODO: super hack
35203525
if (is_object_dtype(self) and
3521-
self.nlevels == 1 and not
3522-
is_interval_dtype(self)):
3526+
self.nlevels == 1):
35233527

35243528
# don't pass MultiIndex
35253529
with np.errstate(all='ignore'):

pandas/indexes/interval.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44

55
from pandas.types.missing import notnull, isnull
6+
from pandas.types.dtypes import IntervalDtype
67
from pandas.types.common import (_ensure_platform_int,
78
is_datetime_or_timedelta_dtype,
89
is_integer_dtype,
@@ -93,7 +94,7 @@ class IntervalIndex(IntervalMixin, Index):
9394
_engine = None # disable it
9495

9596
def __new__(cls, left, right=None, closed='right', mask=None,
96-
name=None, copy=False, fastpath=False):
97+
name=None, copy=False, dtype=None, fastpath=False):
9798

9899
if right is None:
99100

@@ -215,6 +216,7 @@ def from_breaks(cls, breaks, closed='right', name=None):
215216
right=[1, 2, 3],
216217
closed='right')
217218
"""
219+
breaks = np.asarray(breaks)
218220
mask = isnull(breaks[:-1])
219221
return cls(breaks[:-1], breaks[1:], closed, mask=mask, name=name)
220222

@@ -357,7 +359,7 @@ def astype(self, dtype, copy=True):
357359

358360
@cache_readonly
359361
def dtype(self):
360-
return np.dtype('O')
362+
return IntervalDtype.construct_from_string(str(self.left.dtype))
361363

362364
@property
363365
def inferred_type(self):
@@ -645,6 +647,7 @@ def _format_attrs(self):
645647
('closed', repr(self.closed))]
646648
if self.name is not None:
647649
attrs.append(('name', default_pprint(self.name)))
650+
attrs.append(('dtype', "'%s'" % self.dtype))
648651
return attrs
649652

650653
def _format_space(self):

pandas/tests/indexes/test_interval.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,10 @@ def test_equals(self):
155155
idx = self.index
156156
self.assertTrue(idx.equals(idx))
157157
self.assertTrue(idx.equals(idx.copy()))
158-
self.assertTrue(idx.equals(idx.astype(object)))
159-
self.assertTrue(idx.equals(np.array(idx)))
160-
self.assertTrue(idx.equals(list(idx)))
158+
159+
self.assertFalse(idx.equals(idx.astype(object)))
160+
self.assertFalse(idx.equals(np.array(idx)))
161+
self.assertFalse(idx.equals(list(idx)))
161162

162163
self.assertFalse(idx.equals([1, 2]))
163164
self.assertFalse(idx.equals(np.array([1, 2])))
@@ -175,7 +176,8 @@ def test_astype(self):
175176

176177
result = idx.astype(object)
177178
tm.assert_index_equal(result, Index(idx.values, dtype='object'))
178-
self.assertTrue(idx.equals(result))
179+
self.assertFalse(idx.equals(result))
180+
self.assertTrue(idx.equals(IntervalIndex.from_intervals(result)))
179181

180182
result = idx.astype('interval')
181183
tm.assert_index_equal(result, idx)
@@ -229,8 +231,10 @@ def test_monotonic_and_unique(self):
229231
self.assertTrue(idx.is_monotonic)
230232

231233
def test_repr(self):
232-
expected = ("IntervalIndex(left=[0, 1],\n right=[1, 2],"
233-
"\n closed='right')")
234+
expected = ("IntervalIndex(left=[0, 1],"
235+
"\n right=[1, 2],"
236+
"\n closed='right',"
237+
"\n dtype='interval[int64]')")
234238
IntervalIndex((0, 1), (1, 2), closed='right')
235239
self.assertEqual(repr(self.index), expected)
236240

pandas/tests/types/test_dtypes.py

+96-6
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import pandas as pd
77
from pandas import Series, Categorical, IntervalIndex, date_range
88

9-
from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype
9+
from pandas.types.dtypes import (DatetimeTZDtype, PeriodDtype,
10+
IntervalDtype, CategoricalDtype)
1011
from pandas.types.common import (is_categorical_dtype, is_categorical,
1112
is_datetime64tz_dtype, is_datetimetz,
1213
is_period_dtype, is_period,
@@ -16,7 +17,7 @@
1617
_coerce_to_dtype)
1718
import pandas.util.testing as tm
1819

19-
_multiprocess_can_split_ = True
20+
_multiprocess_can_split_ = False
2021

2122

2223
class Base(object):
@@ -355,16 +356,104 @@ def test_not_string(self):
355356
self.assertFalse(is_string_dtype(PeriodDtype('D')))
356357

357358

358-
# TODO: inherit from Base
359-
class TestIntervalDtype(tm.TestCase):
359+
class TestIntervalDtype(Base, tm.TestCase):
360360

361361
# TODO: placeholder
362362
def setUp(self):
363-
pass
363+
self.dtype = IntervalDtype('int64')
364+
365+
def test_construction(self):
366+
with tm.assertRaises(ValueError):
367+
IntervalDtype('xx')
368+
369+
for s in ['interval[int64]', 'Interval[int64]', 'int64']:
370+
i = IntervalDtype(s)
371+
self.assertEqual(i.subtype, np.dtype('int64'))
372+
self.assertTrue(is_interval_dtype(i))
373+
374+
def test_construction_generic(self):
375+
# generic
376+
i = IntervalDtype('interval')
377+
self.assertIs(i.subtype, None)
378+
self.assertTrue(is_interval_dtype(i))
379+
self.assertTrue(str(i) == 'interval')
380+
381+
i = IntervalDtype()
382+
self.assertIs(i.subtype, None)
383+
self.assertTrue(is_interval_dtype(i))
384+
self.assertTrue(str(i) == 'interval')
385+
386+
def test_subclass(self):
387+
a = IntervalDtype('interval[int64]')
388+
b = IntervalDtype('interval[int64]')
389+
390+
self.assertTrue(issubclass(type(a), type(a)))
391+
self.assertTrue(issubclass(type(a), type(b)))
364392

365393
def test_is_dtype(self):
394+
self.assertTrue(IntervalDtype.is_dtype(self.dtype))
395+
self.assertTrue(IntervalDtype.is_dtype('interval'))
396+
self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('float64')))
397+
self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('int64')))
398+
self.assertTrue(IntervalDtype.is_dtype(IntervalDtype(np.int64)))
399+
400+
self.assertFalse(IntervalDtype.is_dtype('D'))
401+
self.assertFalse(IntervalDtype.is_dtype('3D'))
402+
self.assertFalse(IntervalDtype.is_dtype('U'))
403+
self.assertFalse(IntervalDtype.is_dtype('S'))
404+
self.assertFalse(IntervalDtype.is_dtype('foo'))
405+
self.assertFalse(IntervalDtype.is_dtype(np.object_))
406+
self.assertFalse(IntervalDtype.is_dtype(np.int64))
407+
self.assertFalse(IntervalDtype.is_dtype(np.float64))
408+
409+
def test_identity(self):
410+
self.assertEqual(IntervalDtype('interval[int64]'),
411+
IntervalDtype('interval[int64]'))
412+
413+
def test_coerce_to_dtype(self):
414+
self.assertEqual(_coerce_to_dtype('interval[int64]'),
415+
IntervalDtype('interval[int64]'))
416+
417+
def test_construction_from_string(self):
418+
result = IntervalDtype('interval[int64]')
419+
self.assertTrue(is_dtype_equal(self.dtype, result))
420+
result = IntervalDtype.construct_from_string('interval[int64]')
421+
self.assertTrue(is_dtype_equal(self.dtype, result))
422+
with tm.assertRaises(TypeError):
423+
IntervalDtype.construct_from_string('foo')
424+
with tm.assertRaises(TypeError):
425+
IntervalDtype.construct_from_string('interval[foo]')
426+
with tm.assertRaises(TypeError):
427+
IntervalDtype.construct_from_string('foo[int64]')
366428

367-
self.assertTrue(is_interval_dtype('interval'))
429+
def test_equality(self):
430+
self.assertTrue(is_dtype_equal(self.dtype, 'interval[int64]'))
431+
self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64')))
432+
self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64')))
433+
self.assertTrue(is_dtype_equal(IntervalDtype('int64'),
434+
IntervalDtype('int64')))
435+
436+
self.assertFalse(is_dtype_equal(self.dtype, 'int64'))
437+
self.assertFalse(is_dtype_equal(IntervalDtype('int64'),
438+
IntervalDtype('float64')))
439+
440+
def test_basic(self):
441+
self.assertTrue(is_interval_dtype(self.dtype))
442+
443+
ii = IntervalIndex.from_breaks(range(3))
444+
445+
self.assertTrue(is_interval_dtype(ii.dtype))
446+
self.assertTrue(is_interval_dtype(ii))
447+
448+
s = Series(ii, name='A')
449+
450+
# dtypes
451+
# a Series built from an IntervalIndex currently results in object dtype
452+
self.assertFalse(is_interval_dtype(s.dtype))
453+
self.assertFalse(is_interval_dtype(s))
454+
455+
def test_basic_dtype(self):
456+
self.assertTrue(is_interval_dtype('interval[int64]'))
368457
self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)])))
369458
self.assertTrue(is_interval_dtype
370459
(IntervalIndex.from_breaks(np.arange(4))))
@@ -377,6 +466,7 @@ def test_is_dtype(self):
377466
self.assertFalse(is_interval_dtype(np.int64))
378467
self.assertFalse(is_interval_dtype(np.float64))
379468

469+
380470
if __name__ == '__main__':
381471
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
382472
exit=False)

pandas/tools/tests/test_tile.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pandas import (Series, Index, isnull,
88
to_datetime, DatetimeIndex, Timestamp,
99
Interval, IntervalIndex, Categorical,
10-
cut, qcut)
10+
cut, qcut, date_range)
1111
import pandas.util.testing as tm
1212

1313
from pandas.core.algorithms import quantile
@@ -379,6 +379,20 @@ def test_datetime_bin(self):
379379
result = cut(data, bins=bin_pydatetime)
380380
tm.assert_series_equal(Series(result), expected)
381381

382+
def test_datetime_nan(self):
383+
384+
def f():
385+
cut(date_range('20130101', periods=3), bins=[0, 2, 4])
386+
self.assertRaises(ValueError, f)
387+
388+
result = cut(date_range('20130102', periods=5),
389+
bins=date_range('20130101', periods=2))
390+
mask = result.categories.isnull()
391+
self.assert_numpy_array_equal(mask, np.array([False]))
392+
mask = result.isnull()
393+
self.assert_numpy_array_equal(
394+
mask, np.array([False, True, True, True, True]))
395+
382396

383397
def curpath():
384398
pth, _ = os.path.split(os.path.abspath(__file__))

0 commit comments

Comments
 (0)