Skip to content

Commit 067375c

Browse files
committed
CLN/COMPAT: IntervalIndex
1 parent b67b098 commit 067375c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1763
-2283
lines changed

doc/source/whatsnew/v0.20.0.txt

+31
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Highlights include:
1111

1212
- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
1313
- The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
14+
- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
1415
- Switched the test framework to `pytest`_ (:issue:`13097`)
1516

1617
.. _pytest: http://doc.pytest.org/en/latest/
@@ -120,6 +121,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
120121
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
121122
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
122123

124+
.. _whatsnew_0200.enhancements.intervalindex:
125+
126+
IntervalIndex
127+
^^^^^^^^^^^^^
128+
129+
pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
130+
notation, specifically as a return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)
131+
132+
**Previous behavior**:
133+
134+
.. code-block:: ipython
135+
136+
In [2]: pd.cut(range(3), 2)
137+
Out[2]:
138+
[(-0.002, 1], (-0.002, 1], (1, 2]]
139+
Categories (2, object): [(-0.002, 1] < (1, 2]]
140+
141+
# the returned categories are strings, representing Intervals
142+
In [3]: pd.cut(range(3), 2).categories
143+
Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')
144+
145+
**New behavior**:
146+
147+
.. ipython:: python
148+
149+
c = pd.cut(range(3), 2)
150+
c
151+
c.categories
152+
pd.api.types.is_interval_dtype(c.categories)
153+
123154
.. _whatsnew_0200.enhancements.other:
124155

125156
Other enhancements

pandas/core/algorithms.py

+31-19
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
is_int64_dtype,
1616
is_categorical_dtype,
1717
is_extension_type,
18+
is_interval_dtype,
1819
is_datetimetz,
1920
is_period_dtype,
2021
is_period_arraylike,
@@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
401402
if bins is not None:
402403
try:
403404
from pandas.tools.tile import cut
404-
values = Series(values).values
405-
cat, bins = cut(values, bins, retbins=True)
405+
values = Series(values)
406+
ii = cut(values, bins, include_lowest=True)
406407
except TypeError:
407408
raise TypeError("bins argument only works with numeric data.")
408409

409-
if is_extension_type(values) and not is_datetimetz(values):
410-
# handle Categorical and sparse,
411-
# datetime tz can be handeled in ndarray path
412-
result = Series(values).values.value_counts(dropna=dropna)
413-
result.name = name
414-
counts = result.values
410+
# count, remove nulls (from the index), and cast the index to interval dtype
411+
result = ii.value_counts(dropna=dropna)
412+
result = result[result.index.notnull()]
413+
result.index = result.index.astype('interval')
414+
result = result.sort_index()
415+
416+
# if we are dropna and we have NO values
417+
if dropna and (result.values == 0).all():
418+
result = result.iloc[0:0]
419+
420+
# normalizing is by len of all (regardless of dropna)
421+
counts = np.array([len(ii)])
422+
415423
else:
416-
# ndarray path. pass original to handle DatetimeTzBlock
417-
keys, counts = _value_counts_arraylike(values, dropna=dropna)
418424

419-
from pandas import Index, Series
420-
if not isinstance(keys, Index):
421-
keys = Index(keys)
422-
result = Series(counts, index=keys, name=name)
425+
if is_extension_type(values) and not is_datetimetz(values):
426+
# handle Categorical and sparse,
427+
# datetime tz can be handled in ndarray path
428+
result = Series(values).values.value_counts(dropna=dropna)
429+
result.name = name
430+
counts = result.values
431+
else:
432+
# ndarray path. pass original to handle DatetimeTzBlock
433+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
423434

424-
if bins is not None:
425-
# TODO: This next line should be more efficient
426-
result = result.reindex(np.arange(len(cat.categories)),
427-
fill_value=0)
428-
result.index = bins[:-1]
435+
from pandas import Index, Series
436+
if not isinstance(keys, Index):
437+
keys = Index(keys)
438+
result = Series(counts, index=keys, name=name)
429439

430440
if sort:
431441
result = result.sort_values(ascending=ascending)
@@ -1244,6 +1254,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12441254
allow_fill=allow_fill)
12451255
elif is_datetimetz(arr):
12461256
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
1257+
elif is_interval_dtype(arr):
1258+
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
12471259

12481260
if indexer is None:
12491261
indexer = np.arange(arr.shape[axis], dtype=np.int64)

pandas/core/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1313
UInt64Index, RangeIndex, Float64Index,
14-
MultiIndex)
15-
from pandas.core.interval import Interval, IntervalIndex
14+
MultiIndex, IntervalIndex)
15+
from pandas.indexes.interval import Interval, interval_range
1616

1717
from pandas.core.series import Series, TimeSeries
1818
from pandas.core.frame import DataFrame

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
_possibly_downcast_to_dtype,
3131
_invalidate_string_dtypes,
3232
_coerce_to_dtypes,
33+
_coerce_extension_to_embed,
3334
_maybe_upcast_putmask,
3435
_find_common_type)
3536
from pandas.types.common import (is_categorical_dtype,
@@ -2648,7 +2649,7 @@ def reindexer(value):
26482649

26492650
# return internal types directly
26502651
if is_extension_type(value):
2651-
return value
2652+
return _coerce_extension_to_embed(value)
26522653

26532654
# broadcast across multiple columns if necessary
26542655
if broadcast and key in self.columns and value.ndim == 1:

pandas/core/groupby.py

+21-19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.common import (is_numeric_dtype,
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
20+
is_interval_dtype,
2021
is_datetimelike,
2122
is_datetime64_any_dtype,
2223
is_bool, is_integer_dtype,
@@ -39,10 +40,11 @@
3940

4041
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4142
DataError, SpecificationError)
43+
from pandas.core.index import (Index, MultiIndex,
44+
CategoricalIndex, _ensure_index)
4245
from pandas.core.categorical import Categorical
4346
from pandas.core.frame import DataFrame
4447
from pandas.core.generic import NDFrame
45-
from pandas.core.interval import IntervalIndex
4648
from pandas.core.internals import BlockManager, make_block
4749
from pandas.core.series import Series
4850
from pandas.core.panel import Panel
@@ -2592,7 +2594,7 @@ def _convert_grouper(axis, grouper):
25922594
return grouper.reindex(axis)._values
25932595
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
25942596
if len(grouper) != len(axis):
2595-
raise AssertionError('Grouper and axis must be same length')
2597+
raise ValueError('Grouper and axis must be same length')
25962598
return grouper
25972599
else:
25982600
return grouper
@@ -3084,36 +3086,37 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30843086

30853087
if bins is None:
30863088
lab, lev = algos.factorize(val, sort=True)
3089+
llab = lambda lab, inc: lab[inc]
30873090
else:
3088-
raise NotImplementedError('this is broken')
3089-
lab, bins = cut(val, bins, retbins=True)
3090-
# bins[:-1] for backward compat;
3091-
# o.w. cat.categories could be better
3092-
# cat = Categorical(cat)
3093-
# lab, lev, dropna = cat.codes, bins[:-1], False
3094-
3095-
if (lab.dtype == object
3096-
and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
3097-
lab_index = Index(lab)
3098-
assert isinstance(lab, IntervalIndex)
3099-
sorter = np.lexsort((lab_index.left, lab_index.right, ids))
3091+
3092+
# lab is a Categorical with categories an IntervalIndex
3093+
lab = cut(Series(val), bins, include_lowest=True)
3094+
lev = lab.cat.categories
3095+
lab = lev.take(lab.cat.codes)
3096+
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
3097+
3098+
if is_interval_dtype(lab):
3099+
# TODO: should we do this inside II?
3100+
sorter = np.lexsort((lab.left, lab.right, ids))
31003101
else:
31013102
sorter = np.lexsort((lab, ids))
3103+
31023104
ids, lab = ids[sorter], lab[sorter]
31033105

31043106
# group boundaries are where group ids change
31053107
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
31063108

31073109
# new values are where sorted labels change
3108-
inc = np.r_[True, lab[1:] != lab[:-1]]
3110+
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
3111+
inc = np.r_[True, lchanges]
31093112
inc[idx] = True # group boundaries are also new values
31103113
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
31113114

31123115
# num. of times each group should be repeated
31133116
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
31143117

31153118
# multi-index components
3116-
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
3119+
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
31173120
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
31183121
names = self.grouper.names + [self.name]
31193122

@@ -3139,13 +3142,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
31393142
acc = rep(d)
31403143
out /= acc
31413144

3142-
if sort: # and bins is None:
3145+
if sort and bins is None:
31433146
cat = ids[inc][mask] if dropna else ids[inc]
31443147
sorter = np.lexsort((out if ascending else -out, cat))
31453148
out, labels[-1] = out[sorter], labels[-1][sorter]
31463149

3147-
# if bins is None:
3148-
if True:
3150+
if bins is None:
31493151
mi = MultiIndex(levels=levels, labels=labels, names=names,
31503152
verify_integrity=False)
31513153

pandas/hashtable.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ PyDateTime_IMPORT
3939
cdef extern from "Python.h":
4040
int PySlice_Check(object)
4141

42+
cdef size_t _INIT_VEC_CAP = 128
43+
4244
include "hashtable_class_helper.pxi"
4345
include "hashtable_func_helper.pxi"
4446

pandas/indexes/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
InvalidIndexError)
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
6+
from pandas.indexes.interval import IntervalIndex # noqa
67
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
78
Int64Index, UInt64Index)
89
from pandas.indexes.range import RangeIndex # noqa
@@ -13,7 +14,7 @@
1314
# TODO: there are many places that rely on these private methods existing in
1415
# pandas.core.index
1516
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
17+
'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
1718
'InvalidIndexError',
1819
'_new_Index',
1920
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_dtype_equal,
2727
is_object_dtype,
2828
is_categorical_dtype,
29+
is_interval_dtype,
2930
is_bool_dtype,
3031
is_signed_integer_dtype,
3132
is_unsigned_integer_dtype,
@@ -164,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
164165
from .category import CategoricalIndex
165166
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
166167

168+
# interval
169+
if is_interval_dtype(data):
170+
from .interval import IntervalIndex
171+
return IntervalIndex.from_intervals(data, name=name,
172+
copy=copy)
173+
167174
# index-like
168175
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
169176

@@ -268,6 +275,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
268275
elif inferred in ['floating', 'mixed-integer-float']:
269276
from .numeric import Float64Index
270277
return Float64Index(subarr, copy=copy, name=name)
278+
elif inferred == 'interval':
279+
from .interval import IntervalIndex
280+
return IntervalIndex.from_intervals(subarr, name=name,
281+
copy=copy)
271282
elif inferred == 'boolean':
272283
# don't support boolean explicity ATM
273284
pass
@@ -1180,6 +1191,9 @@ def is_object(self):
11801191
def is_categorical(self):
11811192
return self.inferred_type in ['categorical']
11821193

1194+
def is_interval(self):
1195+
return self.inferred_type in ['interval']
1196+
11831197
def is_mixed(self):
11841198
return self.inferred_type in ['mixed']
11851199

@@ -3235,6 +3249,13 @@ def _searchsorted_monotonic(self, label, side='left'):
32353249

32363250
raise ValueError('index must be monotonic increasing or decreasing')
32373251

3252+
def _get_loc_only_exact_matches(self, key):
3253+
"""
3254+
This is overridden in subclasses (namely, IntervalIndex) to control
3255+
get_slice_bound.
3256+
"""
3257+
return self.get_loc(key)
3258+
32383259
def get_slice_bound(self, label, side, kind):
32393260
"""
32403261
Calculate slice bound that corresponds to given label.
@@ -3264,7 +3285,7 @@ def get_slice_bound(self, label, side, kind):
32643285

32653286
# we need to look up the label
32663287
try:
3267-
slc = self.get_loc(label)
3288+
slc = self._get_loc_only_exact_matches(label)
32683289
except KeyError as err:
32693290
try:
32703291
return self._searchsorted_monotonic(label, side)
@@ -3504,7 +3525,9 @@ def _evaluate_compare(self, other):
35043525
if needs_i8_conversion(self) and needs_i8_conversion(other):
35053526
return self._evaluate_compare(other, op)
35063527

3507-
if is_object_dtype(self) and self.nlevels == 1:
3528+
if (is_object_dtype(self) and
3529+
self.nlevels == 1):
3530+
35083531
# don't pass MultiIndex
35093532
with np.errstate(all='ignore'):
35103533
result = _comp_method_OBJECT_ARRAY(
@@ -3816,6 +3839,9 @@ def _ensure_index(index_like, copy=False):
38163839

38173840

38183841
def _get_na_value(dtype):
3842+
if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype):
3843+
return tslib.NaT
3844+
38193845
return {np.datetime64: tslib.NaT,
38203846
np.timedelta64: tslib.NaT}.get(dtype, np.nan)
38213847

pandas/indexes/category.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas.types.common import (is_categorical_dtype,
88
_ensure_platform_int,
99
is_list_like,
10+
is_interval_dtype,
1011
is_scalar)
1112
from pandas.types.missing import array_equivalent
1213

@@ -266,6 +267,13 @@ def __array__(self, dtype=None):
266267
""" the array interface, return my values """
267268
return np.array(self._data, dtype=dtype)
268269

270+
@Appender(_index_shared_docs['astype'])
271+
def astype(self, dtype, copy=True):
272+
if is_interval_dtype(dtype):
273+
from pandas import IntervalIndex
274+
return IntervalIndex.from_intervals(np.array(self))
275+
return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
276+
269277
@cache_readonly
270278
def _isnan(self):
271279
""" return if each value is nan"""
@@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
508516
na_value=-1)
509517
return self._create_from_codes(taken)
510518

519+
take_nd = take
520+
511521
def map(self, mapper):
512522
"""Apply mapper function to its categories (not codes).
513523

0 commit comments

Comments
 (0)