Skip to content

Commit 9865000

Browse files
committed
CLN/COMPAT: IntervalIndex
1 parent 31b4c69 commit 9865000

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1834
-2434
lines changed

doc/source/whatsnew/v0.20.0.txt

+31
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Highlights include:
1111

1212
- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
1313
- The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
14+
- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
1415

1516
Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations <whatsnew_0200.deprecations>` before updating.
1617

@@ -117,6 +118,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
117118
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
118119
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
119120

121+
.. _whatsnew_0200.enhancements.intervalindex:
122+
123+
IntervalIndex
124+
^^^^^^^^^^^^^
125+
126+
pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
127+
notation, specifically as a return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)
128+
129+
**Previous behavior**:
130+
131+
.. code-block:: ipython
132+
133+
In [2]: pd.cut(range(3), 2)
134+
Out[2]:
135+
[(-0.002, 1], (-0.002, 1], (1, 2]]
136+
Categories (2, object): [(-0.002, 1] < (1, 2]]
137+
138+
# the returned categories are strings, representing Intervals
139+
In [3]: pd.cut(range(3), 2).categories
140+
Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')
141+
142+
**New behavior**:
143+
144+
.. ipython:: python
145+
146+
c = pd.cut(range(3), 2)
147+
c
148+
c.categories
149+
pd.api.types.is_interval_dtype(c.categories)
150+
120151
.. _whatsnew_0200.enhancements.other:
121152

122153
Other enhancements

pandas/api/tests/test_api.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class TestPDApi(Base, tm.TestCase):
5656
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
59-
'TimedeltaIndex', 'Timestamp']
59+
'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex']
6060

6161
# these are already deprecated; awaiting removal
6262
deprecated_classes = ['TimeSeries', 'WidePanel',
@@ -74,7 +74,7 @@ class TestPDApi(Base, tm.TestCase):
7474

7575
# top-level functions
7676
funcs = ['bdate_range', 'concat', 'crosstab', 'cut',
77-
'date_range', 'eval',
77+
'date_range', 'interval_range', 'eval',
7878
'factorize', 'get_dummies', 'get_store',
7979
'infer_freq', 'isnull', 'lreshape',
8080
'match', 'melt', 'notnull', 'offsets',
@@ -156,6 +156,7 @@ class TestTypes(Base, tm.TestCase):
156156
'is_string_dtype', 'is_signed_integer_dtype',
157157
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
158158
'is_unsigned_integer_dtype', 'is_period',
159+
'is_interval', 'is_interval_dtype',
159160
'is_period_dtype', 'is_re', 'is_re_compilable',
160161
'is_dict_like', 'is_iterator',
161162
'is_list_like', 'is_hashable',

pandas/core/algorithms.py

+31-19
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
is_int64_dtype,
1616
is_categorical_dtype,
1717
is_extension_type,
18+
is_interval_dtype,
1819
is_datetimetz,
1920
is_period_dtype,
2021
is_period_arraylike,
@@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
401402
if bins is not None:
402403
try:
403404
from pandas.tools.tile import cut
404-
values = Series(values).values
405-
cat, bins = cut(values, bins, retbins=True)
405+
values = Series(values)
406+
ii = cut(values, bins, include_lowest=True)
406407
except TypeError:
407408
raise TypeError("bins argument only works with numeric data.")
408409

409-
if is_extension_type(values) and not is_datetimetz(values):
410-
# handle Categorical and sparse,
411-
# datetime tz can be handeled in ndarray path
412-
result = Series(values).values.value_counts(dropna=dropna)
413-
result.name = name
414-
counts = result.values
410+
# count, remove nulls (from the index), and sort the bins
411+
result = ii.value_counts(dropna=dropna)
412+
result = result[result.index.notnull()]
413+
result.index = result.index.astype('interval')
414+
result = result.sort_index()
415+
416+
# if we are dropna and we have NO values
417+
if dropna and (result.values == 0).all():
418+
result = result.iloc[0:0]
419+
420+
# normalizing is by len of all (regardless of dropna)
421+
counts = np.array([len(ii)])
422+
415423
else:
416-
# ndarray path. pass original to handle DatetimeTzBlock
417-
keys, counts = _value_counts_arraylike(values, dropna=dropna)
418424

419-
from pandas import Index, Series
420-
if not isinstance(keys, Index):
421-
keys = Index(keys)
422-
result = Series(counts, index=keys, name=name)
425+
if is_extension_type(values) and not is_datetimetz(values):
426+
# handle Categorical and sparse,
427+
# datetime tz can be handled in ndarray path
428+
result = Series(values).values.value_counts(dropna=dropna)
429+
result.name = name
430+
counts = result.values
431+
else:
432+
# ndarray path. pass original to handle DatetimeTzBlock
433+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
423434

424-
if bins is not None:
425-
# TODO: This next line should be more efficient
426-
result = result.reindex(np.arange(len(cat.categories)),
427-
fill_value=0)
428-
result.index = bins[:-1]
435+
from pandas import Index, Series
436+
if not isinstance(keys, Index):
437+
keys = Index(keys)
438+
result = Series(counts, index=keys, name=name)
429439

430440
if sort:
431441
result = result.sort_values(ascending=ascending)
@@ -1244,6 +1254,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12441254
allow_fill=allow_fill)
12451255
elif is_datetimetz(arr):
12461256
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
1257+
elif is_interval_dtype(arr):
1258+
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
12471259

12481260
if indexer is None:
12491261
indexer = np.arange(arr.shape[axis], dtype=np.int64)

pandas/core/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1313
UInt64Index, RangeIndex, Float64Index,
14-
MultiIndex)
15-
from pandas.core.interval import Interval, IntervalIndex
14+
MultiIndex, IntervalIndex)
15+
from pandas.indexes.interval import Interval, interval_range
1616

1717
from pandas.core.series import Series, TimeSeries
1818
from pandas.core.frame import DataFrame

pandas/core/groupby.py

+21-19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.common import (is_numeric_dtype,
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
20+
is_interval_dtype,
2021
is_datetimelike,
2122
is_datetime64_any_dtype,
2223
is_bool, is_integer_dtype,
@@ -39,10 +40,11 @@
3940

4041
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4142
DataError, SpecificationError)
43+
from pandas.core.index import (Index, MultiIndex,
44+
CategoricalIndex, _ensure_index)
4245
from pandas.core.categorical import Categorical
4346
from pandas.core.frame import DataFrame
4447
from pandas.core.generic import NDFrame
45-
from pandas.core.interval import IntervalIndex
4648
from pandas.core.internals import BlockManager, make_block
4749
from pandas.core.series import Series
4850
from pandas.core.panel import Panel
@@ -2579,7 +2581,7 @@ def _convert_grouper(axis, grouper):
25792581
return grouper.reindex(axis)._values
25802582
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
25812583
if len(grouper) != len(axis):
2582-
raise AssertionError('Grouper and axis must be same length')
2584+
raise ValueError('Grouper and axis must be same length')
25832585
return grouper
25842586
else:
25852587
return grouper
@@ -3063,36 +3065,37 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30633065

30643066
if bins is None:
30653067
lab, lev = algos.factorize(val, sort=True)
3068+
llab = lambda lab, inc: lab[inc]
30663069
else:
3067-
raise NotImplementedError('this is broken')
3068-
lab, bins = cut(val, bins, retbins=True)
3069-
# bins[:-1] for backward compat;
3070-
# o.w. cat.categories could be better
3071-
# cat = Categorical(cat)
3072-
# lab, lev, dropna = cat.codes, bins[:-1], False
3073-
3074-
if (lab.dtype == object
3075-
and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
3076-
lab_index = Index(lab)
3077-
assert isinstance(lab, IntervalIndex)
3078-
sorter = np.lexsort((lab_index.left, lab_index.right, ids))
3070+
3071+
# lab is a Categorical with categories an IntervalIndex
3072+
lab = cut(Series(val), bins, include_lowest=True)
3073+
lev = lab.cat.categories
3074+
lab = lev.take(lab.cat.codes)
3075+
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
3076+
3077+
if is_interval_dtype(lab):
3078+
# TODO: should we do this inside II?
3079+
sorter = np.lexsort((lab.left, lab.right, ids))
30793080
else:
30803081
sorter = np.lexsort((lab, ids))
3082+
30813083
ids, lab = ids[sorter], lab[sorter]
30823084

30833085
# group boundaries are where group ids change
30843086
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
30853087

30863088
# new values are where sorted labels change
3087-
inc = np.r_[True, lab[1:] != lab[:-1]]
3089+
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
3090+
inc = np.r_[True, lchanges]
30883091
inc[idx] = True # group boundaries are also new values
30893092
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
30903093

30913094
# num. of times each group should be repeated
30923095
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
30933096

30943097
# multi-index components
3095-
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
3098+
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
30963099
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
30973100
names = self.grouper.names + [self.name]
30983101

@@ -3118,13 +3121,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
31183121
acc = rep(d)
31193122
out /= acc
31203123

3121-
if sort: # and bins is None:
3124+
if sort and bins is None:
31223125
cat = ids[inc][mask] if dropna else ids[inc]
31233126
sorter = np.lexsort((out if ascending else -out, cat))
31243127
out, labels[-1] = out[sorter], labels[-1][sorter]
31253128

3126-
# if bins is None:
3127-
if True:
3129+
if bins is None:
31283130
mi = MultiIndex(levels=levels, labels=labels, names=names,
31293131
verify_integrity=False)
31303132

pandas/hashtable.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ PyDateTime_IMPORT
3939
cdef extern from "Python.h":
4040
int PySlice_Check(object)
4141

42+
cdef size_t _INIT_VEC_CAP = 128
43+
4244
include "hashtable_class_helper.pxi"
4345
include "hashtable_func_helper.pxi"
4446

pandas/indexes/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
InvalidIndexError)
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
6+
from pandas.indexes.interval import IntervalIndex # noqa
67
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
78
Int64Index, UInt64Index)
89
from pandas.indexes.range import RangeIndex # noqa
@@ -13,7 +14,7 @@
1314
# TODO: there are many places that rely on these private methods existing in
1415
# pandas.core.index
1516
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
17+
'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
1718
'InvalidIndexError',
1819
'_new_Index',
1920
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_dtype_equal,
2727
is_object_dtype,
2828
is_categorical_dtype,
29+
is_interval_dtype,
2930
is_bool_dtype,
3031
is_signed_integer_dtype,
3132
is_unsigned_integer_dtype,
@@ -164,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
164165
from .category import CategoricalIndex
165166
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
166167

168+
# interval
169+
if is_interval_dtype(data):
170+
from .interval import IntervalIndex
171+
return IntervalIndex.from_intervals(data, name=name,
172+
copy=copy)
173+
167174
# index-like
168175
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
169176

@@ -268,6 +275,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
268275
elif inferred in ['floating', 'mixed-integer-float']:
269276
from .numeric import Float64Index
270277
return Float64Index(subarr, copy=copy, name=name)
278+
elif inferred == 'interval':
279+
from .interval import IntervalIndex
280+
return IntervalIndex.from_intervals(subarr, name=name,
281+
copy=copy)
271282
elif inferred == 'boolean':
272283
# don't support boolean explicitly ATM
273284
pass
@@ -1180,6 +1191,9 @@ def is_object(self):
11801191
def is_categorical(self):
11811192
return self.inferred_type in ['categorical']
11821193

1194+
def is_interval(self):
1195+
return self.inferred_type in ['interval']
1196+
11831197
def is_mixed(self):
11841198
return self.inferred_type in ['mixed']
11851199

@@ -3232,6 +3246,13 @@ def _searchsorted_monotonic(self, label, side='left'):
32323246

32333247
raise ValueError('index must be monotonic increasing or decreasing')
32343248

3249+
def _get_loc_only_exact_matches(self, key):
3250+
"""
3251+
This is overridden in subclasses (namely, IntervalIndex) to control
3252+
get_slice_bound.
3253+
"""
3254+
return self.get_loc(key)
3255+
32353256
def get_slice_bound(self, label, side, kind):
32363257
"""
32373258
Calculate slice bound that corresponds to given label.
@@ -3261,7 +3282,7 @@ def get_slice_bound(self, label, side, kind):
32613282

32623283
# we need to look up the label
32633284
try:
3264-
slc = self.get_loc(label)
3285+
slc = self._get_loc_only_exact_matches(label)
32653286
except KeyError as err:
32663287
try:
32673288
return self._searchsorted_monotonic(label, side)
@@ -3501,7 +3522,9 @@ def _evaluate_compare(self, other):
35013522
if needs_i8_conversion(self) and needs_i8_conversion(other):
35023523
return self._evaluate_compare(other, op)
35033524

3504-
if is_object_dtype(self) and self.nlevels == 1:
3525+
if (is_object_dtype(self) and
3526+
self.nlevels == 1):
3527+
35053528
# don't pass MultiIndex
35063529
with np.errstate(all='ignore'):
35073530
result = _comp_method_OBJECT_ARRAY(
@@ -3813,6 +3836,9 @@ def _ensure_index(index_like, copy=False):
38133836

38143837

38153838
def _get_na_value(dtype):
3839+
if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype):
3840+
return tslib.NaT
3841+
38163842
return {np.datetime64: tslib.NaT,
38173843
np.timedelta64: tslib.NaT}.get(dtype, np.nan)
38183844

pandas/indexes/category.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas.types.common import (is_categorical_dtype,
88
_ensure_platform_int,
99
is_list_like,
10+
is_interval_dtype,
1011
is_scalar)
1112
from pandas.types.missing import array_equivalent
1213

@@ -266,6 +267,13 @@ def __array__(self, dtype=None):
266267
""" the array interface, return my values """
267268
return np.array(self._data, dtype=dtype)
268269

270+
@Appender(_index_shared_docs['astype'])
271+
def astype(self, dtype, copy=True):
272+
if is_interval_dtype(dtype):
273+
from pandas import IntervalIndex
274+
return IntervalIndex.from_intervals(np.array(self))
275+
return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
276+
269277
@cache_readonly
270278
def _isnan(self):
271279
""" return if each value is nan"""
@@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
508516
na_value=-1)
509517
return self._create_from_codes(taken)
510518

519+
take_nd = take
520+
511521
def map(self, mapper):
512522
"""Apply mapper function to its categories (not codes).
513523

0 commit comments

Comments
 (0)