Skip to content

Commit 2d768f4

Browse files
committed
CLN/COMPAT: IntervalIndex
1 parent 0193f57 commit 2d768f4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1763
-2280
lines changed

doc/source/whatsnew/v0.20.0.txt

+31
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Highlights include:
1111

1212
- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
1313
- The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
14+
- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
1415
- Switched the test framework to `pytest`_ (:issue:`13097`)
1516

1617
.. _pytest: http://doc.pytest.org/en/latest/
@@ -120,6 +121,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
120121
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
121122
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
122123

124+
.. _whatsnew_0200.enhancements.intervalindex:
125+
126+
IntervalIndex
127+
^^^^^^^^^^^^^
128+
129+
pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
130+
notation, specifically as a return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)
131+
132+
**Previous behavior**:
133+
134+
.. code-block:: ipython
135+
136+
In [2]: pd.cut(range(3), 2)
137+
Out[2]:
138+
[(-0.002, 1], (-0.002, 1], (1, 2]]
139+
Categories (2, object): [(-0.002, 1] < (1, 2]]
140+
141+
# the returned categories are strings, representing Intervals
142+
In [3]: pd.cut(range(3), 2).categories
143+
Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')
144+
145+
**New behavior**:
146+
147+
.. ipython:: python
148+
149+
c = pd.cut(range(3), 2)
150+
c
151+
c.categories
152+
pd.api.types.is_interval_dtype(c.categories)
153+
123154
.. _whatsnew_0200.enhancements.other:
124155

125156
Other enhancements

pandas/core/algorithms.py

+31-19
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
is_int64_dtype,
1616
is_categorical_dtype,
1717
is_extension_type,
18+
is_interval_dtype,
1819
is_datetimetz,
1920
is_period_dtype,
2021
is_period_arraylike,
@@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
401402
if bins is not None:
402403
try:
403404
from pandas.tools.tile import cut
404-
values = Series(values).values
405-
cat, bins = cut(values, bins, retbins=True)
405+
values = Series(values)
406+
ii = cut(values, bins, include_lowest=True)
406407
except TypeError:
407408
raise TypeError("bins argument only works with numeric data.")
408409

409-
if is_extension_type(values) and not is_datetimetz(values):
410-
# handle Categorical and sparse,
411-
# datetime tz can be handled in ndarray path
412-
result = Series(values).values.value_counts(dropna=dropna)
413-
result.name = name
414-
counts = result.values
410+
# count, remove nulls (from the index), and but the bins
411+
result = ii.value_counts(dropna=dropna)
412+
result = result[result.index.notnull()]
413+
result.index = result.index.astype('interval')
414+
result = result.sort_index()
415+
416+
# if we are dropna and we have NO values
417+
if dropna and (result.values == 0).all():
418+
result = result.iloc[0:0]
419+
420+
# normalizing is by len of all (regardless of dropna)
421+
counts = np.array([len(ii)])
422+
415423
else:
416-
# ndarray path. pass original to handle DatetimeTzBlock
417-
keys, counts = _value_counts_arraylike(values, dropna=dropna)
418424

419-
from pandas import Index, Series
420-
if not isinstance(keys, Index):
421-
keys = Index(keys)
422-
result = Series(counts, index=keys, name=name)
425+
if is_extension_type(values) and not is_datetimetz(values):
426+
# handle Categorical and sparse,
427+
# datetime tz can be handled in ndarray path
428+
result = Series(values).values.value_counts(dropna=dropna)
429+
result.name = name
430+
counts = result.values
431+
else:
432+
# ndarray path. pass original to handle DatetimeTzBlock
433+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
423434

424-
if bins is not None:
425-
# TODO: This next line should be more efficient
426-
result = result.reindex(np.arange(len(cat.categories)),
427-
fill_value=0)
428-
result.index = bins[:-1]
435+
from pandas import Index, Series
436+
if not isinstance(keys, Index):
437+
keys = Index(keys)
438+
result = Series(counts, index=keys, name=name)
429439

430440
if sort:
431441
result = result.sort_values(ascending=ascending)
@@ -1246,6 +1256,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12461256
allow_fill=allow_fill)
12471257
elif is_datetimetz(arr):
12481258
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
1259+
elif is_interval_dtype(arr):
1260+
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
12491261

12501262
if indexer is None:
12511263
indexer = np.arange(arr.shape[axis], dtype=np.int64)

pandas/core/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1313
UInt64Index, RangeIndex, Float64Index,
14-
MultiIndex)
15-
from pandas.core.interval import Interval, IntervalIndex
14+
MultiIndex, IntervalIndex)
15+
from pandas.indexes.interval import Interval, interval_range
1616

1717
from pandas.core.series import Series, TimeSeries
1818
from pandas.core.frame import DataFrame

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
_possibly_downcast_to_dtype,
3131
_invalidate_string_dtypes,
3232
_coerce_to_dtypes,
33+
_coerce_extension_to_embed,
3334
_maybe_upcast_putmask,
3435
_find_common_type)
3536
from pandas.types.common import (is_categorical_dtype,
@@ -2648,7 +2649,7 @@ def reindexer(value):
26482649

26492650
# return internal types directly
26502651
if is_extension_type(value):
2651-
return value
2652+
return _coerce_extension_to_embed(value)
26522653

26532654
# broadcast across multiple columns if necessary
26542655
if broadcast and key in self.columns and value.ndim == 1:

pandas/core/groupby.py

+21-19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.common import (is_numeric_dtype,
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
20+
is_interval_dtype,
2021
is_datetimelike,
2122
is_datetime64_any_dtype,
2223
is_bool, is_integer_dtype,
@@ -39,10 +40,11 @@
3940

4041
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4142
DataError, SpecificationError)
43+
from pandas.core.index import (Index, MultiIndex,
44+
CategoricalIndex, _ensure_index)
4245
from pandas.core.categorical import Categorical
4346
from pandas.core.frame import DataFrame
4447
from pandas.core.generic import NDFrame
45-
from pandas.core.interval import IntervalIndex
4648
from pandas.core.internals import BlockManager, make_block
4749
from pandas.core.series import Series
4850
from pandas.core.panel import Panel
@@ -2592,7 +2594,7 @@ def _convert_grouper(axis, grouper):
25922594
return grouper.reindex(axis)._values
25932595
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
25942596
if len(grouper) != len(axis):
2595-
raise AssertionError('Grouper and axis must be same length')
2597+
raise ValueError('Grouper and axis must be same length')
25962598
return grouper
25972599
else:
25982600
return grouper
@@ -3084,36 +3086,37 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30843086

30853087
if bins is None:
30863088
lab, lev = algos.factorize(val, sort=True)
3089+
llab = lambda lab, inc: lab[inc]
30873090
else:
3088-
raise NotImplementedError('this is broken')
3089-
lab, bins = cut(val, bins, retbins=True)
3090-
# bins[:-1] for backward compat;
3091-
# o.w. cat.categories could be better
3092-
# cat = Categorical(cat)
3093-
# lab, lev, dropna = cat.codes, bins[:-1], False
3094-
3095-
if (lab.dtype == object
3096-
and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
3097-
lab_index = Index(lab)
3098-
assert isinstance(lab, IntervalIndex)
3099-
sorter = np.lexsort((lab_index.left, lab_index.right, ids))
3091+
3092+
# lab is a Categorical with categories an IntervalIndex
3093+
lab = cut(Series(val), bins, include_lowest=True)
3094+
lev = lab.cat.categories
3095+
lab = lev.take(lab.cat.codes)
3096+
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
3097+
3098+
if is_interval_dtype(lab):
3099+
# TODO: should we do this inside II?
3100+
sorter = np.lexsort((lab.left, lab.right, ids))
31003101
else:
31013102
sorter = np.lexsort((lab, ids))
3103+
31023104
ids, lab = ids[sorter], lab[sorter]
31033105

31043106
# group boundaries are where group ids change
31053107
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
31063108

31073109
# new values are where sorted labels change
3108-
inc = np.r_[True, lab[1:] != lab[:-1]]
3110+
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
3111+
inc = np.r_[True, lchanges]
31093112
inc[idx] = True # group boundaries are also new values
31103113
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
31113114

31123115
# num. of times each group should be repeated
31133116
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
31143117

31153118
# multi-index components
3116-
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
3119+
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
31173120
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
31183121
names = self.grouper.names + [self.name]
31193122

@@ -3139,13 +3142,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
31393142
acc = rep(d)
31403143
out /= acc
31413144

3142-
if sort: # and bins is None:
3145+
if sort and bins is None:
31433146
cat = ids[inc][mask] if dropna else ids[inc]
31443147
sorter = np.lexsort((out if ascending else -out, cat))
31453148
out, labels[-1] = out[sorter], labels[-1][sorter]
31463149

3147-
# if bins is None:
3148-
if True:
3150+
if bins is None:
31493151
mi = MultiIndex(levels=levels, labels=labels, names=names,
31503152
verify_integrity=False)
31513153

pandas/hashtable.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ PyDateTime_IMPORT
3939
cdef extern from "Python.h":
4040
int PySlice_Check(object)
4141

42+
cdef size_t _INIT_VEC_CAP = 128
43+
4244
include "hashtable_class_helper.pxi"
4345
include "hashtable_func_helper.pxi"
4446

pandas/indexes/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
InvalidIndexError)
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
6+
from pandas.indexes.interval import IntervalIndex # noqa
67
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
78
Int64Index, UInt64Index)
89
from pandas.indexes.range import RangeIndex # noqa
@@ -13,7 +14,7 @@
1314
# TODO: there are many places that rely on these private methods existing in
1415
# pandas.core.index
1516
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
17+
'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
1718
'InvalidIndexError',
1819
'_new_Index',
1920
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_dtype_equal,
2727
is_object_dtype,
2828
is_categorical_dtype,
29+
is_interval_dtype,
2930
is_bool_dtype,
3031
is_signed_integer_dtype,
3132
is_unsigned_integer_dtype,
@@ -165,6 +166,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
165166
from .category import CategoricalIndex
166167
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
167168

169+
# interval
170+
if is_interval_dtype(data):
171+
from .interval import IntervalIndex
172+
return IntervalIndex.from_intervals(data, name=name,
173+
copy=copy)
174+
168175
# index-like
169176
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
170177

@@ -269,6 +276,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
269276
elif inferred in ['floating', 'mixed-integer-float']:
270277
from .numeric import Float64Index
271278
return Float64Index(subarr, copy=copy, name=name)
279+
elif inferred == 'interval':
280+
from .interval import IntervalIndex
281+
return IntervalIndex.from_intervals(subarr, name=name,
282+
copy=copy)
272283
elif inferred == 'boolean':
273284
# don't support boolean explicitly ATM
274285
pass
@@ -1181,6 +1192,9 @@ def is_object(self):
11811192
def is_categorical(self):
11821193
return self.inferred_type in ['categorical']
11831194

1195+
def is_interval(self):
1196+
return self.inferred_type in ['interval']
1197+
11841198
def is_mixed(self):
11851199
return self.inferred_type in ['mixed']
11861200

@@ -3258,6 +3272,13 @@ def _searchsorted_monotonic(self, label, side='left'):
32583272

32593273
raise ValueError('index must be monotonic increasing or decreasing')
32603274

3275+
def _get_loc_only_exact_matches(self, key):
3276+
"""
3277+
This is overridden in subclasses (namely, IntervalIndex) to control
3278+
get_slice_bound.
3279+
"""
3280+
return self.get_loc(key)
3281+
32613282
def get_slice_bound(self, label, side, kind):
32623283
"""
32633284
Calculate slice bound that corresponds to given label.
@@ -3287,7 +3308,7 @@ def get_slice_bound(self, label, side, kind):
32873308

32883309
# we need to look up the label
32893310
try:
3290-
slc = self.get_loc(label)
3311+
slc = self._get_loc_only_exact_matches(label)
32913312
except KeyError as err:
32923313
try:
32933314
return self._searchsorted_monotonic(label, side)
@@ -3527,7 +3548,9 @@ def _evaluate_compare(self, other):
35273548
if needs_i8_conversion(self) and needs_i8_conversion(other):
35283549
return self._evaluate_compare(other, op)
35293550

3530-
if is_object_dtype(self) and self.nlevels == 1:
3551+
if (is_object_dtype(self) and
3552+
self.nlevels == 1):
3553+
35313554
# don't pass MultiIndex
35323555
with np.errstate(all='ignore'):
35333556
result = _comp_method_OBJECT_ARRAY(
@@ -3839,6 +3862,9 @@ def _ensure_index(index_like, copy=False):
38393862

38403863

38413864
def _get_na_value(dtype):
3865+
if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype):
3866+
return tslib.NaT
3867+
38423868
return {np.datetime64: tslib.NaT,
38433869
np.timedelta64: tslib.NaT}.get(dtype, np.nan)
38443870

pandas/indexes/category.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas.types.common import (is_categorical_dtype,
88
_ensure_platform_int,
99
is_list_like,
10+
is_interval_dtype,
1011
is_scalar)
1112
from pandas.types.missing import array_equivalent
1213

@@ -268,6 +269,13 @@ def __array__(self, dtype=None):
268269
""" the array interface, return my values """
269270
return np.array(self._data, dtype=dtype)
270271

272+
@Appender(_index_shared_docs['astype'])
273+
def astype(self, dtype, copy=True):
274+
if is_interval_dtype(dtype):
275+
from pandas import IntervalIndex
276+
return IntervalIndex.from_intervals(np.array(self))
277+
return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
278+
271279
@cache_readonly
272280
def _isnan(self):
273281
""" return if each value is nan"""
@@ -482,6 +490,8 @@ def take(self, indices, axis=0, allow_fill=True,
482490
na_value=-1)
483491
return self._create_from_codes(taken)
484492

493+
take_nd = take
494+
485495
def map(self, mapper):
486496
"""Apply mapper function to its categories (not codes).
487497

0 commit comments

Comments
 (0)