Skip to content

Commit 37e0b98

Browse files
committed
CLN/COMPAT: IntervalIndex
1 parent ebab713 commit 37e0b98

35 files changed

+1312
-2381
lines changed

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class TestPDApi(Base, tm.TestCase):
5656
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
59-
'TimedeltaIndex', 'Timestamp']
59+
'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex']
6060

6161
# these are already deprecated; awaiting removal
6262
deprecated_classes = ['TimeSeries', 'WidePanel',

pandas/core/algorithms.py

+41-20
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
11-
from pandas.types.generic import ABCSeries, ABCIndex
11+
from pandas.types.generic import ABCSeries, ABCIndex, ABCIntervalIndex
1212
from pandas.types.common import (is_unsigned_integer_dtype,
1313
is_signed_integer_dtype,
1414
is_integer_dtype,
@@ -401,31 +401,47 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
401401
if bins is not None:
402402
try:
403403
from pandas.tools.tile import cut
404-
values = Series(values).values
405-
cat, bins = cut(values, bins, retbins=True)
404+
values = Series(values)
405+
ii, _, lev = cut(values, bins, retbins=True, include_lowest=True)
406406
except TypeError:
407407
raise TypeError("bins argument only works with numeric data.")
408408

409-
if is_extension_type(values) and not is_datetimetz(values):
410-
# handle Categorical and sparse,
411-
# datetime tz can be handeled in ndarray path
412-
result = Series(values).values.value_counts(dropna=dropna)
413-
result.name = name
414-
counts = result.values
409+
# if normalizing, we need the total (include NA's)
410+
counts = np.array([len(ii)])
411+
412+
# remove NaN ii entries
413+
if dropna:
414+
mask = ii.notnull()
415+
values = values[mask]
416+
ii = ii[mask]
417+
418+
result = values.groupby(ii).count()
419+
420+
# reindex & fill in 0's for non-represented levels
421+
# but don't if we have completely dropped everything
422+
# as it's now a missing level
423+
# this matches our groupby.value_counts behavior
424+
if dropna and not len(values) and not len(result):
425+
result.index = lev[0:0]
426+
else:
427+
result = result.reindex(lev).fillna(0).astype('i8')
428+
415429
else:
416-
# ndarray path. pass original to handle DatetimeTzBlock
417-
keys, counts = _value_counts_arraylike(values, dropna=dropna)
418430

419-
from pandas import Index, Series
420-
if not isinstance(keys, Index):
421-
keys = Index(keys)
422-
result = Series(counts, index=keys, name=name)
431+
if is_extension_type(values) and not is_datetimetz(values):
432+
# handle Categorical and sparse,
433+
# datetime tz can be handled in ndarray path
434+
result = Series(values).values.value_counts(dropna=dropna)
435+
result.name = name
436+
counts = result.values
437+
else:
438+
# ndarray path. pass original to handle DatetimeTzBlock
439+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
423440

424-
if bins is not None:
425-
# TODO: This next line should be more efficient
426-
result = result.reindex(np.arange(len(cat.categories)),
427-
fill_value=0)
428-
result.index = bins[:-1]
441+
from pandas import Index, Series
442+
if not isinstance(keys, Index):
443+
keys = Index(keys)
444+
result = Series(counts, index=keys, name=name)
429445

430446
if sort:
431447
result = result.sort_values(ascending=ascending)
@@ -1244,6 +1260,11 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12441260
allow_fill=allow_fill)
12451261
elif is_datetimetz(arr):
12461262
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
1263+
elif isinstance(arr, ABCIntervalIndex):
1264+
# TODO: we need to be sure we are taking on an actual IntervalIndex
1265+
# this is 'hacky' until we have a first class dtype
1266+
# ideally will use is_interval_dtype here
1267+
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
12471268

12481269
if indexer is None:
12491270
indexer = np.arange(arr.shape[axis], dtype=np.int64)

pandas/core/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
1313
UInt64Index, RangeIndex, Float64Index,
14-
MultiIndex)
15-
from pandas.core.interval import Interval, IntervalIndex
14+
MultiIndex, IntervalIndex)
15+
from pandas.indexes.interval import Interval
1616

1717
from pandas.core.series import Series, TimeSeries
1818
from pandas.core.frame import DataFrame

pandas/core/groupby.py

+25-19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.common import (is_numeric_dtype,
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
20+
is_interval_dtype,
2021
is_datetimelike,
2122
is_datetime64_any_dtype,
2223
is_bool, is_integer_dtype,
@@ -39,10 +40,11 @@
3940

4041
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4142
DataError, SpecificationError)
43+
from pandas.core.index import (Index, MultiIndex,
44+
CategoricalIndex, _ensure_index)
4245
from pandas.core.categorical import Categorical
4346
from pandas.core.frame import DataFrame
4447
from pandas.core.generic import NDFrame
45-
from pandas.core.interval import IntervalIndex
4648
from pandas.core.internals import BlockManager, make_block
4749
from pandas.core.series import Series
4850
from pandas.core.panel import Panel
@@ -2579,7 +2581,7 @@ def _convert_grouper(axis, grouper):
25792581
return grouper.reindex(axis)._values
25802582
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
25812583
if len(grouper) != len(axis):
2582-
raise AssertionError('Grouper and axis must be same length')
2584+
raise ValueError('Grouper and axis must be same length')
25832585
return grouper
25842586
else:
25852587
return grouper
@@ -3063,36 +3065,41 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30633065

30643066
if bins is None:
30653067
lab, lev = algos.factorize(val, sort=True)
3068+
llab = lambda lab, inc: lab[inc]
30663069
else:
3067-
raise NotImplementedError('this is broken')
3068-
lab, bins = cut(val, bins, retbins=True)
3069-
# bins[:-1] for backward compat;
3070-
# o.w. cat.categories could be better
3071-
# cat = Categorical(cat)
3072-
# lab, lev, dropna = cat.codes, bins[:-1], False
3073-
3074-
if (lab.dtype == object
3075-
and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
3076-
lab_index = Index(lab)
3077-
assert isinstance(lab, IntervalIndex)
3078-
sorter = np.lexsort((lab_index.left, lab_index.right, ids))
3070+
3071+
# lab is an IntervalIndex
3072+
# we get our last level of labels from the
3073+
# II indexer
3074+
# TODO: make this a method on II
3075+
lab, _, lev = cut(val, bins, retbins=True, include_lowest=True)
3076+
3077+
# we compute the levels here rather than use the bins
3078+
# because we may have adjusted them with include_lowest
3079+
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
3080+
3081+
if is_interval_dtype(lab):
3082+
# TODO: should we do this inside II?
3083+
sorter = np.lexsort((lab.left, lab.right, ids))
30793084
else:
30803085
sorter = np.lexsort((lab, ids))
3086+
30813087
ids, lab = ids[sorter], lab[sorter]
30823088

30833089
# group boundaries are where group ids change
30843090
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
30853091

30863092
# new values are where sorted labels change
3087-
inc = np.r_[True, lab[1:] != lab[:-1]]
3093+
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
3094+
inc = np.r_[True, lchanges]
30883095
inc[idx] = True # group boundaries are also new values
30893096
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
30903097

30913098
# num. of times each group should be repeated
30923099
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
30933100

30943101
# multi-index components
3095-
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
3102+
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
30963103
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
30973104
names = self.grouper.names + [self.name]
30983105

@@ -3118,13 +3125,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
31183125
acc = rep(d)
31193126
out /= acc
31203127

3121-
if sort: # and bins is None:
3128+
if sort and bins is None:
31223129
cat = ids[inc][mask] if dropna else ids[inc]
31233130
sorter = np.lexsort((out if ascending else -out, cat))
31243131
out, labels[-1] = out[sorter], labels[-1][sorter]
31253132

3126-
# if bins is None:
3127-
if True:
3133+
if bins is None:
31283134
mi = MultiIndex(levels=levels, labels=labels, names=names,
31293135
verify_integrity=False)
31303136

pandas/hashtable.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ PyDateTime_IMPORT
3939
cdef extern from "Python.h":
4040
int PySlice_Check(object)
4141

42+
cdef size_t _INIT_VEC_CAP = 128
43+
4244
include "hashtable_class_helper.pxi"
4345
include "hashtable_func_helper.pxi"
4446

pandas/indexes/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
InvalidIndexError)
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
6+
from pandas.indexes.interval import IntervalIndex # noqa
67
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
78
Int64Index, UInt64Index)
89
from pandas.indexes.range import RangeIndex # noqa
@@ -13,7 +14,7 @@
1314
# TODO: there are many places that rely on these private methods existing in
1415
# pandas.core.index
1516
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
17+
'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
1718
'InvalidIndexError',
1819
'_new_Index',
1920
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_dtype_equal,
2727
is_object_dtype,
2828
is_categorical_dtype,
29+
is_interval_dtype,
2930
is_bool_dtype,
3031
is_signed_integer_dtype,
3132
is_unsigned_integer_dtype,
@@ -268,6 +269,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
268269
elif inferred in ['floating', 'mixed-integer-float']:
269270
from .numeric import Float64Index
270271
return Float64Index(subarr, copy=copy, name=name)
272+
elif inferred == 'interval':
273+
from .interval import IntervalIndex
274+
return IntervalIndex.from_intervals(subarr, name=name,
275+
copy=copy)
271276
elif inferred == 'boolean':
272277
# don't support boolean explicitly ATM
273278
pass
@@ -1180,6 +1185,9 @@ def is_object(self):
11801185
def is_categorical(self):
11811186
return self.inferred_type in ['categorical']
11821187

1188+
def is_interval(self):
1189+
return self.inferred_type in ['interval']
1190+
11831191
def is_mixed(self):
11841192
return self.inferred_type in ['mixed']
11851193

@@ -3232,6 +3240,13 @@ def _searchsorted_monotonic(self, label, side='left'):
32323240

32333241
raise ValueError('index must be monotonic increasing or decreasing')
32343242

3243+
def _get_loc_only_exact_matches(self, key):
3244+
"""
3245+
This is overridden in subclasses (namely, IntervalIndex) to control
3246+
get_slice_bound.
3247+
"""
3248+
return self.get_loc(key)
3249+
32353250
def get_slice_bound(self, label, side, kind):
32363251
"""
32373252
Calculate slice bound that corresponds to given label.
@@ -3261,7 +3276,7 @@ def get_slice_bound(self, label, side, kind):
32613276

32623277
# we need to look up the label
32633278
try:
3264-
slc = self.get_loc(label)
3279+
slc = self._get_loc_only_exact_matches(label)
32653280
except KeyError as err:
32663281
try:
32673282
return self._searchsorted_monotonic(label, side)
@@ -3501,7 +3516,11 @@ def _evaluate_compare(self, other):
35013516
if needs_i8_conversion(self) and needs_i8_conversion(other):
35023517
return self._evaluate_compare(other, op)
35033518

3504-
if is_object_dtype(self) and self.nlevels == 1:
3519+
# TODO: super hack
3520+
if (is_object_dtype(self) and
3521+
self.nlevels == 1 and not
3522+
is_interval_dtype(self)):
3523+
35053524
# don't pass MultiIndex
35063525
with np.errstate(all='ignore'):
35073526
result = _comp_method_OBJECT_ARRAY(
@@ -3813,6 +3832,9 @@ def _ensure_index(index_like, copy=False):
38133832

38143833

38153834
def _get_na_value(dtype):
3835+
if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype):
3836+
return tslib.NaT
3837+
38163838
return {np.datetime64: tslib.NaT,
38173839
np.timedelta64: tslib.NaT}.get(dtype, np.nan)
38183840

0 commit comments

Comments
 (0)