From 74162aab065f480a4b779351475cf7a60812ec9f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 2 Nov 2014 03:01:15 -0500 Subject: [PATCH 01/12] API/ENH: IntervalIndex closes #7640 closes #8625 --- pandas/_libs/hashtable.pyx | 3 - pandas/_libs/lib.pyx | 6 +- pandas/_libs/src/inference.pyx | 21 + pandas/core/algorithms.py | 1 - pandas/core/api.py | 1 + pandas/core/groupby.py | 24 +- pandas/core/interval.py | 521 ++++++++++ pandas/src/generate_intervaltree.py | 395 +++++++ pandas/src/interval.pyx | 171 +++ pandas/src/intervaltree.pyx | 1444 ++++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 43 + pandas/tests/test_algos.py | 16 +- pandas/tests/test_base.py | 19 +- pandas/tests/test_categorical.py | 20 +- pandas/tests/test_interval.py | 591 +++++++++++ pandas/tests/tools/test_tile.py | 148 +-- pandas/tools/tile.py | 102 +- pandas/util/testing.py | 11 +- 18 files changed, 3377 insertions(+), 160 deletions(-) create mode 100644 pandas/core/interval.py create mode 100644 pandas/src/generate_intervaltree.py create mode 100644 pandas/src/interval.pyx create mode 100644 pandas/src/intervaltree.pyx create mode 100644 pandas/tests/test_interval.py diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index eee287b2c157b..a4e5bee9a8746 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -39,9 +39,6 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) -cdef size_t _INIT_VEC_CAP = 128 - - include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f902422b0916d..f90fd1e5bb44b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -245,6 +245,7 @@ cpdef bint isscalar(object val): - instances of datetime.timedelta - Period - instances of decimal.Decimal + - Interval """ @@ -258,7 +259,8 @@ cpdef bint isscalar(object val): or PyDelta_Check(val) or PyTime_Check(val) or util.is_period_object(val) - or is_decimal(val)) + 
or is_decimal(val)
CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, MultiIndex) +from pandas.core.interval import Interval, IntervalIndex from pandas.core.series import Series from pandas.core.frame import DataFrame diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5591ce4b0d4aa..69c90d8cc9efd 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -43,8 +43,7 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import (Index, MultiIndex, CategoricalIndex, - _ensure_index) +from pandas.core.interval import IntervalIndex from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -3146,12 +3145,20 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is None: lab, lev = algorithms.factorize(val, sort=True) else: - cat, bins = cut(val, bins, retbins=True) + raise NotImplementedError('this is broken') + lab, bins = cut(val, bins, retbins=True) # bins[:-1] for backward compat; # o.w. 
cat.categories could be better - lab, lev, dropna = cat.codes, bins[:-1], False - - sorter = np.lexsort((lab, ids)) + # cat = Categorical(cat) + # lab, lev, dropna = cat.codes, bins[:-1], False + + if (lab.dtype == object + and lib.is_interval_array_fixed_closed(lab[notnull(lab)])): + lab_index = Index(lab) + assert isinstance(lab, IntervalIndex) + sorter = np.lexsort((lab_index.left, lab_index.right, ids)) + else: + sorter = np.lexsort((lab, ids)) ids, lab = ids[sorter], lab[sorter] # group boundaries are where group ids change @@ -3192,12 +3199,13 @@ def value_counts(self, normalize=False, sort=True, ascending=False, acc = rep(d) out /= acc - if sort and bins is None: + if sort: # and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) out, labels[-1] = out[sorter], labels[-1][sorter] - if bins is None: + # if bins is None: + if True: mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) diff --git a/pandas/core/interval.py b/pandas/core/interval.py new file mode 100644 index 0000000000000..68e07f21367a0 --- /dev/null +++ b/pandas/core/interval.py @@ -0,0 +1,521 @@ +import operator + +import numpy as np +import pandas as pd + +from pandas.core.base import PandasObject, IndexOpsMixin +from pandas.core.common import (_values_from_object, _ensure_platform_int, + notnull, is_datetime_or_timedelta_dtype, + is_integer_dtype, is_float_dtype) +from pandas.core.index import (Index, _ensure_index, default_pprint, + InvalidIndexError, MultiIndex) +from pandas.lib import (Interval, IntervalMixin, IntervalTree, + interval_bounds_to_intervals, + intervals_to_interval_bounds) +from pandas.util.decorators import cache_readonly +import pandas.core.common as com + + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + + +def _get_next_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (pd.Timestamp, pd.Timedelta)): + dtype = 'datetime64' + if 
is_datetime_or_timedelta_dtype(dtype): + return label + np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label + 1 + elif is_float_dtype(dtype): + return np.nextafter(label, np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_prev_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (pd.Timestamp, pd.Timedelta)): + dtype = 'datetime64' + if is_datetime_or_timedelta_dtype(dtype): + return label - np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label - 1 + elif is_float_dtype(dtype): + return np.nextafter(label, -np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_interval_closed_bounds(interval): + """ + Given an Interval or IntervalIndex, return the corresponding interval with + closed bounds. + """ + left, right = interval.left, interval.right + if interval.open_left: + left = _get_next_label(left) + if interval.open_right: + right = _get_prev_label(right) + return left, right + + +class IntervalIndex(IntervalMixin, Index): + """ + Immutable Index implementing an ordered, sliceable set. IntervalIndex + represents an Index of intervals that are all closed on the same side. + + .. versionadded:: 0.18 + + Properties + ---------- + left, right : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both or + neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+ """ + _typ = 'intervalindex' + _comparables = ['name'] + _attributes = ['name', 'closed'] + _allow_index_ops = True + _engine = None # disable it + + def __new__(cls, left, right, closed='right', name=None, fastpath=False): + # TODO: validation + result = IntervalMixin.__new__(cls) + result._left = _ensure_index(left) + result._right = _ensure_index(right) + result._closed = closed + result.name = name + if not fastpath: + result._validate() + result._reset_identity() + return result + + def _validate(self): + """Verify that the IntervalIndex is valid. + """ + # TODO: exclude periods? + if self.closed not in _VALID_CLOSED: + raise ValueError("invalid options for 'closed': %s" % self.closed) + if len(self.left) != len(self.right): + raise ValueError('left and right must have the same length') + left_valid = notnull(self.left) + right_valid = notnull(self.right) + if not (left_valid == right_valid).all(): + raise ValueError('missing values must be missing in the same ' + 'location both left and right sides') + if not (self.left[left_valid] <= self.right[left_valid]).all(): + raise ValueError('left side of interval must be <= right side') + + def _simple_new(cls, values, name=None, **kwargs): + # ensure we don't end up here (this is a superclass method) + raise NotImplementedError + + def _cleanup(self): + pass + + @property + def _engine(self): + raise NotImplementedError + + @cache_readonly + def _tree(self): + return IntervalTree(self.left, self.right, closed=self.closed) + + @property + def _constructor(self): + return type(self).from_intervals + + @classmethod + def from_breaks(cls, breaks, closed='right', name=None): + """ + Construct an IntervalIndex from an array of splits + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. 
+ name : object, optional + Name to be stored in the index. + + Examples + -------- + + >>> IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex(left=[0, 1, 2], + right=[1, 2, 3], + closed='right') + """ + return cls(breaks[:-1], breaks[1:], closed, name) + + @classmethod + def from_intervals(cls, data, name=None): + """ + Construct an IntervalIndex from a 1d array of Interval objects + + Parameters + ---------- + data : array-like (1-dimensional) + Array of Interval objects. All intervals must be closed on the same + sides. + name : object, optional + Name to be stored in the index. + + Examples + -------- + + >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) + IntervalIndex(left=[0, 1], + right=[1, 2], + closed='right') + + The generic Index constructor work identically when it infers an array + of all intervals: + + >>> Index([Interval(0, 1), Interval(1, 2)]) + IntervalIndex(left=[0, 1], + right=[1, 2], + closed='right') + """ + data = np.asarray(data) + left, right, closed = intervals_to_interval_bounds(data) + return cls(left, right, closed, name) + + @classmethod + def from_tuples(cls, data, closed='right', name=None): + left = [] + right = [] + for l, r in data: + left.append(l) + right.append(r) + return cls(np.array(left), np.array(right), closed, name) + + def to_tuples(self): + return Index(com._asarray_tuplesafe(zip(self.left, self.right))) + + @cache_readonly + def _multiindex(self): + return MultiIndex.from_arrays([self.left, self.right], + names=['left', 'right']) + + @property + def left(self): + return self._left + + @property + def right(self): + return self._right + + @property + def closed(self): + return self._closed + + def __len__(self): + return len(self.left) + + @cache_readonly + def values(self): + """Returns the IntervalIndex's data as a numpy array of Interval + objects (with dtype='object') + """ + left = np.asarray(self.left) + right = np.asarray(self.right) + return interval_bounds_to_intervals(left, right, 
self.closed) + + def __array__(self, result=None): + """ the array interface, return my values """ + return self.values + + def __array_wrap__(self, result, context=None): + # we don't want the superclass implementation + return result + + def _array_values(self): + return self.values + + def __reduce__(self): + return self.__class__, (self.left, self.right, self.closed, self.name) + + def _shallow_copy(self, values=None, name=None): + name = name if name is not None else self.name + if values is not None: + return type(self).from_intervals(values, name=name) + else: + return self.copy(name=name) + + def copy(self, deep=False, name=None): + left = self.left.copy(deep=True) if deep else self.left + right = self.right.copy(deep=True) if deep else self.right + name = name if name is not None else self.name + return type(self)(left, right, closed=self.closed, name=name, + fastpath=True) + + @cache_readonly + def dtype(self): + return np.dtype('O') + + @cache_readonly + def mid(self): + """Returns the mid-point of each interval in the index as an array + """ + try: + return Index(0.5 * (self.left.values + self.right.values)) + except TypeError: + # datetime safe version + delta = self.right.values - self.left.values + return Index(self.left.values + 0.5 * delta) + + @cache_readonly + def is_monotonic_increasing(self): + return self._multiindex.is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self): + return self._multiindex.is_monotonic_decreasing + + @cache_readonly + def is_unique(self): + return self._multiindex.is_unique + + @cache_readonly + def is_non_overlapping_monotonic(self): + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) 
+ # we already require left <= right + return ((self.right[:-1] <= self.left[1:]).all() or + (self.left[:-1] >= self.right[1:]).all()) + + def _convert_scalar_indexer(self, key, kind=None): + return key + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list indexer. + Return our indexer or raise if all of the values are not included in the categories + """ + locs = self.get_indexer(keyarr) + # TODO: handle keyarr if it includes intervals + if (locs == -1).any(): + raise KeyError("a list-indexer must only include existing intervals") + + return locs + + def _check_method(self, method): + if method is not None: + raise NotImplementedError( + 'method %r not yet implemented for IntervalIndex' % method) + + def _searchsorted_monotonic(self, label, side, exclude_label=False): + if not self.is_non_overlapping_monotonic: + raise KeyError('can only get slices from an IntervalIndex if ' + 'bounds are non-overlapping and all monotonic ' + 'increasing or decreasing') + + if isinstance(label, IntervalMixin): + raise NotImplementedError + + if ((side == 'left' and self.left.is_monotonic_increasing) or + (side == 'right' and self.left.is_monotonic_decreasing)): + sub_idx = self.right + if self.open_right or exclude_label: + label = _get_next_label(label) + else: + sub_idx = self.left + if self.open_left or exclude_label: + label = _get_prev_label(label) + + return sub_idx._searchsorted_monotonic(label, side) + + def _get_loc_only_exact_matches(self, key): + return self._multiindex._tuple_index.get_loc(key) + + def _find_non_overlapping_monotonic_bounds(self, key): + if isinstance(key, IntervalMixin): + start = self._searchsorted_monotonic( + key.left, 'left', exclude_label=key.open_left) + stop = self._searchsorted_monotonic( + key.right, 'right', exclude_label=key.open_right) + else: + # scalar + start = 
self._searchsorted_monotonic(key, 'left') + stop = self._searchsorted_monotonic(key, 'right') + return start, stop + + def get_loc(self, key, method=None): + self._check_method(method) + + original_key = key + + if self.is_non_overlapping_monotonic: + if isinstance(key, Interval): + left = self._maybe_cast_slice_bound(key.left, 'left', None) + right = self._maybe_cast_slice_bound(key.right, 'right', None) + key = Interval(left, right, key.closed) + else: + key = self._maybe_cast_slice_bound(key, 'left', None) + + start, stop = self._find_non_overlapping_monotonic_bounds(key) + + if start + 1 == stop: + return start + elif start < stop: + return slice(start, stop) + else: + raise KeyError(original_key) + + else: + # use the interval tree + if isinstance(key, Interval): + left, right = _get_interval_closed_bounds(key) + return self._tree.get_loc_interval(left, right) + else: + return self._tree.get_loc(key) + + def get_value(self, series, key): + # this method seems necessary for Series.__getitem__ but I have no idea + # what it should actually do here... + loc = self.get_loc(key) # nb. 
this can't handle slice objects + return series.iloc[loc] + + def get_indexer(self, target, method=None, limit=None, tolerance=None): + self._check_method(method) + target = _ensure_index(target) + + if self.is_non_overlapping_monotonic: + start, stop = self._find_non_overlapping_monotonic_bounds(target) + + start_plus_one = start + 1 + if (start_plus_one < stop).any(): + raise ValueError('indexer corresponds to non-unique elements') + return np.where(start_plus_one == stop, start, -1) + + else: + if isinstance(target, IntervalIndex): + raise NotImplementedError( + 'have not yet implemented get_indexer ' + 'for IntervalIndex indexers') + else: + return self._tree.get_indexer(target) + + def delete(self, loc): + new_left = self.left.delete(loc) + new_right = self.right.delete(loc) + return type(self)(new_left, new_right, self.closed, self.name, + fastpath=True) + + def insert(self, loc, item): + if not isinstance(item, Interval): + raise ValueError('can only insert Interval objects into an ' + 'IntervalIndex') + if not item.closed == self.closed: + raise ValueError('inserted item must be closed on the same side ' + 'as the index') + new_left = self.left.insert(loc, item.left) + new_right = self.right.insert(loc, item.right) + return type(self)(new_left, new_right, self.closed, self.name, + fastpath=True) + + def _as_like_interval_index(self, other, error_msg): + self._assert_can_do_setop(other) + other = _ensure_index(other) + if (not isinstance(other, IntervalIndex) or + self.closed != other.closed): + raise ValueError(error_msg) + return other + + def append(self, other): + msg = ('can only append two IntervalIndex objects that are closed on ' + 'the same side') + other = self._as_like_interval_index(other, msg) + new_left = self.left.append(other.left) + new_right = self.right.append(other.right) + if other.name is not None and other.name != self.name: + name = None + else: + name = self.name + return type(self)(new_left, new_right, self.closed, name, + 
fastpath=True) + + def take(self, indexer, axis=0): + indexer = com._ensure_platform_int(indexer) + new_left = self.left.take(indexer) + new_right = self.right.take(indexer) + return type(self)(new_left, new_right, self.closed, self.name, + fastpath=True) + + def __contains__(self, key): + try: + self.get_loc(key) + return True + except KeyError: + return False + + def __getitem__(self, value): + left = self.left[value] + right = self.right[value] + if not isinstance(left, Index): + return Interval(left, right, self.closed) + else: + return type(self)(left, right, self.closed, self.name) + + # __repr__ associated methods are based on MultiIndex + + def _format_attrs(self): + attrs = [('left', default_pprint(self.left)), + ('right', default_pprint(self.right)), + ('closed', repr(self.closed))] + if self.name is not None: + attrs.append(('name', default_pprint(self.name))) + return attrs + + def _format_space(self): + return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + + def _format_data(self): + return None + + def argsort(self, *args, **kwargs): + return np.lexsort((self.right, self.left)) + + def equals(self, other): + if self.is_(other): + return True + try: + return (self.left.equals(other.left) + and self.right.equals(other.right) + and self.closed == other.closed) + except AttributeError: + return False + + def _setop(op_name): + def func(self, other): + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + other = self._as_like_interval_index(other, msg) + result = getattr(self._multiindex, op_name)(other._multiindex) + result_name = self.name if self.name == other.name else None + return type(self).from_tuples(result.values, closed=self.closed, + name=result_name) + return func + + union = _setop('union') + intersection = _setop('intersection') + difference = _setop('difference') + sym_diff = _setop('sym_diff') + + # TODO: arithmetic operations + + 
+IntervalIndex._add_logical_methods_disabled() diff --git a/pandas/src/generate_intervaltree.py b/pandas/src/generate_intervaltree.py new file mode 100644 index 0000000000000..275a0d40e2433 --- /dev/null +++ b/pandas/src/generate_intervaltree.py @@ -0,0 +1,395 @@ +""" +This file generates `intervaltree.pyx` which is then included in `../lib.pyx` +during building. To regenerate `intervaltree.pyx`, just run: + + `python generate_intervaltree.py`. +""" +from __future__ import print_function +import os +from pandas.compat import StringIO +import numpy as np + + +warning_to_new_contributors = """ +# DO NOT EDIT THIS FILE: This file was autogenerated from +# generate_intervaltree.py, so please edit that file and then run +# `python2 generate_intervaltree.py` to re-generate this file. +""" + +header = r''' +from numpy cimport int64_t, float64_t +from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take +import numpy as np + +cimport cython +cimport numpy as cnp +cnp.import_array() + +from hashtable cimport Int64Vector, Int64VectorData + + +ctypedef fused scalar64_t: + float64_t + int64_t + + +NODE_CLASSES = {} + + +cdef class IntervalTree(IntervalMixin): + """A centered interval tree + + Based off the algorithm described on Wikipedia: + http://en.wikipedia.org/wiki/Interval_tree + """ + cdef: + readonly object left, right, root + readonly str closed + object _left_sorter, _right_sorter + + def __init__(self, left, right, closed='right', leaf_size=100): + """ + Parameters + ---------- + left, right : np.ndarray[ndim=1] + Left and right bounds for each interval. Assumed to contain no + NaNs. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + leaf_size : int, optional + Parameter that controls when the tree switches from creating nodes + to brute-force search. Tune this parameter to optimize query + performance. 
+ """ + if closed not in ['left', 'right', 'both', 'neither']: + raise ValueError("invalid option for 'closed': %s" % closed) + + left = np.asarray(left) + right = np.asarray(right) + dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=dtype) + self.right = np.asarray(right, dtype=dtype) + + indices = np.arange(len(left), dtype='int64') + + self.closed = closed + + node_cls = NODE_CLASSES[str(dtype), closed] + self.root = node_cls(self.left, self.right, indices, leaf_size) + + @property + def left_sorter(self): + """How to sort the left labels; this is used for binary search + """ + if self._left_sorter is None: + self._left_sorter = np.argsort(self.left) + return self._left_sorter + + @property + def right_sorter(self): + """How to sort the right labels + """ + if self._right_sorter is None: + self._right_sorter = np.argsort(self.right) + return self._right_sorter + + def get_loc(self, scalar64_t key): + """Return all positions corresponding to intervals that overlap with + the given scalar key + """ + result = Int64Vector() + self.root.query(result, key) + if not result.data.n: + raise KeyError(key) + return result.to_array() + + def _get_partial_overlap(self, key_left, key_right, side): + """Return all positions corresponding to intervals with the given side + falling between the left and right bounds of an interval query + """ + if side == 'left': + values = self.left + sorter = self.left_sorter + else: + values = self.right + sorter = self.right_sorter + key = [key_left, key_right] + i, j = values.searchsorted(key, sorter=sorter) + return sorter[i:j] + + def get_loc_interval(self, key_left, key_right): + """Lookup the intervals enclosed in the given interval bounds + + The given interval is presumed to have closed bounds. 
+ """ + import pandas as pd + left_overlap = self._get_partial_overlap(key_left, key_right, 'left') + right_overlap = self._get_partial_overlap(key_left, key_right, 'right') + enclosing = self.get_loc(0.5 * (key_left + key_right)) + combined = np.concatenate([left_overlap, right_overlap, enclosing]) + uniques = pd.unique(combined) + return uniques + + def get_indexer(self, scalar64_t[:] target): + """Return the positions corresponding to unique intervals that overlap + with the given array of scalar targets. + """ + # TODO: write get_indexer_intervals + cdef: + int64_t old_len, i + Int64Vector result + + result = Int64Vector() + old_len = 0 + for i in range(len(target)): + self.root.query(result, target[i]) + if result.data.n == old_len: + result.append(-1) + elif result.data.n > old_len + 1: + raise KeyError( + 'indexer does not intersect a unique set of intervals') + old_len = result.data.n + return result.to_array() + + def get_indexer_non_unique(self, scalar64_t[:] target): + """Return the positions corresponding to intervals that overlap with + the given array of scalar targets. Non-unique positions are repeated. 
+ """ + cdef: + int64_t old_len, i + Int64Vector result, missing + + result = Int64Vector() + missing = Int64Vector() + old_len = 0 + for i in range(len(target)): + self.root.query(result, target[i]) + if result.data.n == old_len: + result.append(-1) + missing.append(i) + old_len = result.data.n + return result.to_array(), missing.to_array() + + def __repr__(self): + return ('' + % self.root.n_elements) + + +cdef take(ndarray source, ndarray indices): + """Take the given positions from a 1D ndarray + """ + return PyArray_Take(source, indices, 0) + + +cdef sort_values_and_indices(all_values, all_indices, subset): + indices = take(all_indices, subset) + values = take(all_values, subset) + sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) + sorted_values = take(values, sorter) + sorted_indices = take(indices, sorter) + return sorted_values, sorted_indices +''' + +# we need specialized nodes and leaves to optimize for different dtype and +# closed values +# unfortunately, fused dtypes can't parameterize attributes on extension types, +# so we're stuck using template generation. + +node_template = r''' +cdef class {dtype_title}Closed{closed_title}IntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + {dtype_title}Closed{closed_title}IntervalNode left_node, right_node + {dtype}_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + {dtype}_t min_left, max_right + readonly {dtype}_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[{dtype}_t, ndim=1] left, + ndarray[{dtype}_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, {dtype}_t[:] left, {dtype}_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] {cmp_right_converse} self.pivot: + left_ind.append(i) + elif self.pivot {cmp_left_converse} left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[{dtype}_t, ndim=1] left, + ndarray[{dtype}_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return {dtype_title}Closed{closed_title}IntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + {dtype}_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] {cmp_left} point {cmp_right} self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+ if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] {cmp_left} point: + break + result.append(indices[i]) + if point {cmp_right} self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point {cmp_right} values[i]: + break + result.append(indices[i]) + if self.right_node.min_left {cmp_left} point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + def __repr__(self): + if self.is_leaf_node: + return ('<{dtype_title}Closed{closed_title}IntervalNode: ' + '%s elements (terminal)>' % self.n_elements) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ('<{dtype_title}Closed{closed_title}IntervalNode: pivot %s, ' + '%s elements (%s left, %s right, %s overlapping)>' % + (self.pivot, self.n_elements, n_left, n_right, n_center)) + + def counts(self): + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + +NODE_CLASSES['{dtype}', '{closed}'] = {dtype_title}Closed{closed_title}IntervalNode +''' + + +def generate_node_template(): + output = StringIO() + for dtype in ['float64', 'int64']: + for closed, cmp_left, cmp_right in [ + ('left', '<=', '<'), + ('right', '<', '<='), + ('both', '<=', '<='), + ('neither', '<', '<')]: + cmp_left_converse = '<' if cmp_left == '<=' else '<=' + cmp_right_converse = '<' if cmp_right == '<=' else '<=' + classes = node_template.format(dtype=dtype, + dtype_title=dtype.title(), + closed=closed, + closed_title=closed.title(), + cmp_left=cmp_left, + cmp_right=cmp_right, + cmp_left_converse=cmp_left_converse, + cmp_right_converse=cmp_right_converse) 
+ output.write(classes) + output.write("\n") + return output.getvalue() + + +def generate_cython_file(): + # Put `intervaltree.pyx` in the same directory as this file + directory = os.path.dirname(os.path.realpath(__file__)) + filename = 'intervaltree.pyx' + path = os.path.join(directory, filename) + + with open(path, 'w') as f: + print(warning_to_new_contributors, file=f) + print(header, file=f) + print(generate_node_template(), file=f) + + +if __name__ == '__main__': + generate_cython_file() diff --git a/pandas/src/interval.pyx b/pandas/src/interval.pyx new file mode 100644 index 0000000000000..495730e0fd6a1 --- /dev/null +++ b/pandas/src/interval.pyx @@ -0,0 +1,171 @@ +cimport numpy as np +import numpy as np +import pandas as pd + +cimport cython +import cython + +from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, + PyObject_RichCompare) + +import numbers +_VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) + +cdef class IntervalMixin: + property closed_left: + def __get__(self): + return self.closed == 'left' or self.closed == 'both' + + property closed_right: + def __get__(self): + return self.closed == 'right' or self.closed == 'both' + + property open_left: + def __get__(self): + return not self.closed_left + + property open_right: + def __get__(self): + return not self.closed_right + + property mid: + def __get__(self): + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * (self.right - self.left) + + +cdef _interval_like(other): + return (hasattr(other, 'left') + and hasattr(other, 'right') + and hasattr(other, 'closed')) + + +cdef class Interval(IntervalMixin): + cdef readonly object left, right + cdef readonly str closed + + def __init__(self, left, right, str closed='right'): + # note: it is faster to just do these checks than to use a special + # constructor (__cinit__/__new__) to avoid them + if closed not in _VALID_CLOSED: + raise ValueError("invalid 
option for 'closed': %s" % closed) + if not left <= right: + raise ValueError('left side of interval must be <= right side') + self.left = left + self.right = right + self.closed = closed + + def __hash__(self): + return hash((self.left, self.right, self.closed)) + + def __contains__(self, key): + if _interval_like(key): + raise TypeError('__contains__ not defined for two intervals') + return ((self.left < key if self.open_left else self.left <= key) and + (key < self.right if self.open_right else key <= self.right)) + + def __richcmp__(self, other, int op): + if hasattr(other, 'ndim'): + # let numpy (or IntervalIndex) handle vectorization + return NotImplemented + + if _interval_like(other): + self_tuple = (self.left, self.right, self.closed) + other_tuple = (other.left, other.right, other.closed) + return PyObject_RichCompare(self_tuple, other_tuple, op) + + # nb. could just return NotImplemented now, but handling this + # explicitly allows us to opt into the Python 3 behavior, even on + # Python 2. 
+ if op == Py_EQ or op == Py_NE: + return NotImplemented + else: + op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] + raise TypeError('unorderable types: %s() %s %s()' % + (type(self).__name__, op_str, type(other).__name__)) + + def __reduce__(self): + args = (self.left, self.right, self.closed) + return (type(self), args) + + def __repr__(self): + return ('%s(%r, %r, closed=%r)' % + (type(self).__name__, self.left, self.right, self.closed)) + + def __str__(self): + start_symbol = '[' if self.closed_left else '(' + end_symbol = ']' if self.closed_right else ')' + return '%s%s, %s%s' % (start_symbol, self.left, self.right, end_symbol) + + def __add__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left + y, self.right + y) + elif isinstance(y, Interval) and isinstance(self, numbers.Number): + return Interval(y.left + self, y.right + self) + else: + raise NotImplemented + + def __sub__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left - y, self.right - y) + else: + raise NotImplemented + + def __mul__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left * y, self.right * y) + elif isinstance(y, Interval) and isinstance(self, numbers.Number): + return Interval(y.left * self, y.right * self) + else: + return NotImplemented + + def __div__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left / y, self.right / y) + else: + return NotImplemented + + def __truediv__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left / y, self.right / y) + else: + return NotImplemented + + def __floordiv__(self, y): + if isinstance(y, numbers.Number): + return Interval(self.left // y, self.right // y) + else: + return NotImplemented + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef interval_bounds_to_intervals(np.ndarray left, np.ndarray right, + str closed): + result = np.empty(len(left), dtype=object) + nulls = pd.isnull(left) | pd.isnull(right) + 
result[nulls] = np.nan + for i in np.flatnonzero(~nulls): + result[i] = Interval(left[i], right[i], closed) + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef intervals_to_interval_bounds(np.ndarray intervals): + left = np.empty(len(intervals), dtype=object) + right = np.empty(len(intervals), dtype=object) + cdef str closed = None + for i in range(len(intervals)): + interval = intervals[i] + left[i] = interval.left + right[i] = interval.right + if closed is None: + closed = interval.closed + elif closed != interval.closed: + raise ValueError('intervals must all be closed on the same side') + return left, right, closed + diff --git a/pandas/src/intervaltree.pyx b/pandas/src/intervaltree.pyx new file mode 100644 index 0000000000000..55782c930d4f8 --- /dev/null +++ b/pandas/src/intervaltree.pyx @@ -0,0 +1,1444 @@ + +# DO NOT EDIT THIS FILE: This file was autogenerated from +# generate_intervaltree.py, so please edit that file and then run +# `python2 generate_intervaltree.py` to re-generate this file. + + +from numpy cimport int64_t, float64_t +from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take +import numpy as np + +cimport cython +cimport numpy as cnp +cnp.import_array() + +from hashtable cimport Int64Vector, Int64VectorData + + +ctypedef fused scalar64_t: + float64_t + int64_t + + +NODE_CLASSES = {} + + +cdef class IntervalTree(IntervalMixin): + """A centered interval tree + + Based off the algorithm described on Wikipedia: + http://en.wikipedia.org/wiki/Interval_tree + """ + cdef: + readonly object left, right, root + readonly str closed + object _left_sorter, _right_sorter + + def __init__(self, left, right, closed='right', leaf_size=100): + """ + Parameters + ---------- + left, right : np.ndarray[ndim=1] + Left and right bounds for each interval. Assumed to contain no + NaNs. 
+ closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + leaf_size : int, optional + Parameter that controls when the tree switches from creating nodes + to brute-force search. Tune this parameter to optimize query + performance. + """ + if closed not in ['left', 'right', 'both', 'neither']: + raise ValueError("invalid option for 'closed': %s" % closed) + + left = np.asarray(left) + right = np.asarray(right) + dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=dtype) + self.right = np.asarray(right, dtype=dtype) + + indices = np.arange(len(left), dtype='int64') + + self.closed = closed + + node_cls = NODE_CLASSES[str(dtype), closed] + self.root = node_cls(self.left, self.right, indices, leaf_size) + + @property + def left_sorter(self): + """How to sort the left labels; this is used for binary search + """ + if self._left_sorter is None: + self._left_sorter = np.argsort(self.left) + return self._left_sorter + + @property + def right_sorter(self): + """How to sort the right labels + """ + if self._right_sorter is None: + self._right_sorter = np.argsort(self.right) + return self._right_sorter + + def get_loc(self, scalar64_t key): + """Return all positions corresponding to intervals that overlap with + the given scalar key + """ + result = Int64Vector() + self.root.query(result, key) + if not result.data.n: + raise KeyError(key) + return result.to_array() + + def _get_partial_overlap(self, key_left, key_right, side): + """Return all positions corresponding to intervals with the given side + falling between the left and right bounds of an interval query + """ + if side == 'left': + values = self.left + sorter = self.left_sorter + else: + values = self.right + sorter = self.right_sorter + key = [key_left, key_right] + i, j = values.searchsorted(key, sorter=sorter) + return sorter[i:j] + + def get_loc_interval(self, key_left, key_right): + 
"""Lookup the intervals enclosed in the given interval bounds + + The given interval is presumed to have closed bounds. + """ + import pandas as pd + left_overlap = self._get_partial_overlap(key_left, key_right, 'left') + right_overlap = self._get_partial_overlap(key_left, key_right, 'right') + enclosing = self.get_loc(0.5 * (key_left + key_right)) + combined = np.concatenate([left_overlap, right_overlap, enclosing]) + uniques = pd.unique(combined) + return uniques + + def get_indexer(self, scalar64_t[:] target): + """Return the positions corresponding to unique intervals that overlap + with the given array of scalar targets. + """ + # TODO: write get_indexer_intervals + cdef: + int64_t old_len, i + Int64Vector result + + result = Int64Vector() + old_len = 0 + for i in range(len(target)): + self.root.query(result, target[i]) + if result.data.n == old_len: + result.append(-1) + elif result.data.n > old_len + 1: + raise KeyError( + 'indexer does not intersect a unique set of intervals') + old_len = result.data.n + return result.to_array() + + def get_indexer_non_unique(self, scalar64_t[:] target): + """Return the positions corresponding to intervals that overlap with + the given array of scalar targets. Non-unique positions are repeated. 
+ """ + cdef: + int64_t old_len, i + Int64Vector result, missing + + result = Int64Vector() + missing = Int64Vector() + old_len = 0 + for i in range(len(target)): + self.root.query(result, target[i]) + if result.data.n == old_len: + result.append(-1) + missing.append(i) + old_len = result.data.n + return result.to_array(), missing.to_array() + + def __repr__(self): + return ('' + % self.root.n_elements) + + +cdef take(ndarray source, ndarray indices): + """Take the given positions from a 1D ndarray + """ + return PyArray_Take(source, indices, 0) + + +cdef sort_values_and_indices(all_values, all_indices, subset): + indices = take(all_indices, subset) + values = take(all_values, subset) + sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) + sorted_values = take(values, sorter) + sorted_indices = take(indices, sorter) + return sorted_values, sorted_indices + + +cdef class Float64ClosedLeftIntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + Float64ClosedLeftIntervalNode left_node, right_node + float64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + float64_t min_left, max_right + readonly float64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] <= self.pivot: + left_ind.append(i) + elif self.pivot < left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Float64ClosedLeftIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + float64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] <= point < self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+ if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] <= point: + break + result.append(indices[i]) + if point < self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point < values[i]: + break + result.append(indices[i]) + if self.right_node.min_left <= point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + def __repr__(self): + if self.is_leaf_node: + return ('' % self.n_elements) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ('' % + (self.pivot, self.n_elements, n_left, n_right, n_center)) + + def counts(self): + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + +NODE_CLASSES['float64', 'left'] = Float64ClosedLeftIntervalNode + + +cdef class Float64ClosedRightIntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + Float64ClosedRightIntervalNode left_node, right_node + float64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + float64_t min_left, max_right + readonly float64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] < self.pivot: + left_ind.append(i) + elif self.pivot <= left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Float64ClosedRightIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + float64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] < point <= self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+ if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] < point: + break + result.append(indices[i]) + if point <= self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point <= values[i]: + break + result.append(indices[i]) + if self.right_node.min_left < point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + def __repr__(self): + if self.is_leaf_node: + return ('' % self.n_elements) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ('' % + (self.pivot, self.n_elements, n_left, n_right, n_center)) + + def counts(self): + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + +NODE_CLASSES['float64', 'right'] = Float64ClosedRightIntervalNode + + +cdef class Float64ClosedBothIntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + Float64ClosedBothIntervalNode left_node, right_node + float64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + float64_t min_left, max_right + readonly float64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] < self.pivot: + left_ind.append(i) + elif self.pivot < left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Float64ClosedBothIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + float64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] <= point <= self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+ if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] <= point: + break + result.append(indices[i]) + if point <= self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point <= values[i]: + break + result.append(indices[i]) + if self.right_node.min_left <= point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + def __repr__(self): + if self.is_leaf_node: + return ('' % self.n_elements) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ('' % + (self.pivot, self.n_elements, n_left, n_right, n_center)) + + def counts(self): + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + +NODE_CLASSES['float64', 'both'] = Float64ClosedBothIntervalNode + + +cdef class Float64ClosedNeitherIntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + Float64ClosedNeitherIntervalNode left_node, right_node + float64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + float64_t min_left, max_right + readonly float64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] <= self.pivot: + left_ind.append(i) + elif self.pivot <= left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[float64_t, ndim=1] left, + ndarray[float64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Float64ClosedNeitherIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + float64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] < point < self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+ if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] < point: + break + result.append(indices[i]) + if point < self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point < values[i]: + break + result.append(indices[i]) + if self.right_node.min_left < point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + def __repr__(self): + if self.is_leaf_node: + return ('' % self.n_elements) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ('' % + (self.pivot, self.n_elements, n_left, n_right, n_center)) + + def counts(self): + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + +NODE_CLASSES['float64', 'neither'] = Float64ClosedNeitherIntervalNode + + +cdef class Int64ClosedLeftIntervalNode: + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef: + Int64ClosedLeftIntervalNode left_node, right_node + int64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + int64_t min_left, max_right + readonly int64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] <= self.pivot: + left_ind.append(i) + elif self.pivot < left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Int64ClosedLeftIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + int64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] <= point < self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+            if point < self.pivot:
+                values = self.center_left_values
+                indices = self.center_left_indices
+                for i in range(self.n_center):
+                    if not values[i] <= point:
+                        break
+                    result.append(indices[i])
+                if point < self.left_node.max_right:
+                    self.left_node.query(result, point)
+            elif point > self.pivot:
+                values = self.center_right_values
+                indices = self.center_right_indices
+                for i in range(self.n_center - 1, -1, -1):
+                    if not point < values[i]:
+                        break
+                    result.append(indices[i])
+                if self.right_node.min_left <= point:
+                    self.right_node.query(result, point)
+            else:
+                result.extend(self.center_left_indices)
+
+    def __repr__(self):
+        if self.is_leaf_node:
+            return ('<Int64ClosedLeftIntervalNode: '
+                    '%s elements (terminal)>' % self.n_elements)
+        else:
+            n_left = self.left_node.n_elements
+            n_right = self.right_node.n_elements
+            n_center = self.n_elements - n_left - n_right
+            return ('<Int64ClosedLeftIntervalNode: pivot %s, '
+                    '%s elements (%s left, %s right, %s overlapping)>' %
+                    (self.pivot, self.n_elements, n_left, n_right, n_center))
+
+    def counts(self):
+        if self.is_leaf_node:
+            return self.n_elements
+        else:
+            m = len(self.center_left_values)
+            l = self.left_node.counts()
+            r = self.right_node.counts()
+            return (m, (l, r))
+
+NODE_CLASSES['int64', 'left'] = Int64ClosedLeftIntervalNode
+
+
+cdef class Int64ClosedRightIntervalNode:
+    """Non-terminal node for an IntervalTree
+
+    Categorizes intervals by those that fall to the left, those that fall to
+    the right, and those that overlap with the pivot.
+ """ + cdef: + Int64ClosedRightIntervalNode left_node, right_node + int64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + int64_t min_left, max_right + readonly int64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] < self.pivot: + left_ind.append(i) + elif self.pivot <= left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Int64ClosedRightIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + int64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] < point <= self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+            if point < self.pivot:
+                values = self.center_left_values
+                indices = self.center_left_indices
+                for i in range(self.n_center):
+                    if not values[i] < point:
+                        break
+                    result.append(indices[i])
+                if point <= self.left_node.max_right:
+                    self.left_node.query(result, point)
+            elif point > self.pivot:
+                values = self.center_right_values
+                indices = self.center_right_indices
+                for i in range(self.n_center - 1, -1, -1):
+                    if not point <= values[i]:
+                        break
+                    result.append(indices[i])
+                if self.right_node.min_left < point:
+                    self.right_node.query(result, point)
+            else:
+                result.extend(self.center_left_indices)
+
+    def __repr__(self):
+        if self.is_leaf_node:
+            return ('<Int64ClosedRightIntervalNode: '
+                    '%s elements (terminal)>' % self.n_elements)
+        else:
+            n_left = self.left_node.n_elements
+            n_right = self.right_node.n_elements
+            n_center = self.n_elements - n_left - n_right
+            return ('<Int64ClosedRightIntervalNode: pivot %s, '
+                    '%s elements (%s left, %s right, %s overlapping)>' %
+                    (self.pivot, self.n_elements, n_left, n_right, n_center))
+
+    def counts(self):
+        if self.is_leaf_node:
+            return self.n_elements
+        else:
+            m = len(self.center_left_values)
+            l = self.left_node.counts()
+            r = self.right_node.counts()
+            return (m, (l, r))
+
+NODE_CLASSES['int64', 'right'] = Int64ClosedRightIntervalNode
+
+
+cdef class Int64ClosedBothIntervalNode:
+    """Non-terminal node for an IntervalTree
+
+    Categorizes intervals by those that fall to the left, those that fall to
+    the right, and those that overlap with the pivot.
+ """ + cdef: + Int64ClosedBothIntervalNode left_node, right_node + int64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + int64_t min_left, max_right + readonly int64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] < self.pivot: + left_ind.append(i) + elif self.pivot < left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Int64ClosedBothIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + int64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] <= point <= self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+            if point < self.pivot:
+                values = self.center_left_values
+                indices = self.center_left_indices
+                for i in range(self.n_center):
+                    if not values[i] <= point:
+                        break
+                    result.append(indices[i])
+                if point <= self.left_node.max_right:
+                    self.left_node.query(result, point)
+            elif point > self.pivot:
+                values = self.center_right_values
+                indices = self.center_right_indices
+                for i in range(self.n_center - 1, -1, -1):
+                    if not point <= values[i]:
+                        break
+                    result.append(indices[i])
+                if self.right_node.min_left <= point:
+                    self.right_node.query(result, point)
+            else:
+                result.extend(self.center_left_indices)
+
+    def __repr__(self):
+        if self.is_leaf_node:
+            return ('<Int64ClosedBothIntervalNode: '
+                    '%s elements (terminal)>' % self.n_elements)
+        else:
+            n_left = self.left_node.n_elements
+            n_right = self.right_node.n_elements
+            n_center = self.n_elements - n_left - n_right
+            return ('<Int64ClosedBothIntervalNode: pivot %s, '
+                    '%s elements (%s left, %s right, %s overlapping)>' %
+                    (self.pivot, self.n_elements, n_left, n_right, n_center))
+
+    def counts(self):
+        if self.is_leaf_node:
+            return self.n_elements
+        else:
+            m = len(self.center_left_values)
+            l = self.left_node.counts()
+            r = self.right_node.counts()
+            return (m, (l, r))
+
+NODE_CLASSES['int64', 'both'] = Int64ClosedBothIntervalNode
+
+
+cdef class Int64ClosedNeitherIntervalNode:
+    """Non-terminal node for an IntervalTree
+
+    Categorizes intervals by those that fall to the left, those that fall to
+    the right, and those that overlap with the pivot.
+ """ + cdef: + Int64ClosedNeitherIntervalNode left_node, right_node + int64_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + int64_t min_left, max_right + readonly int64_t pivot + readonly int64_t n_elements, n_center, leaf_size + readonly bint is_leaf_node + + def __init__(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left + right) / 2 + left_set, right_set, center_set = self.classify_intervals(left, right) + + self.left_node = self.new_child_node(left, right, indices, left_set) + self.right_node = self.new_child_node(left, right, indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. 
+ """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] <= self.pivot: + left_ind.append(i) + elif self.pivot <= left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[int64_t, ndim=1] left, + ndarray[int64_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return Int64ClosedNeitherIntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, scalar64_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + int64_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. + for i in range(self.n_elements): + if self.left[i] < point < self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. 
+            if point < self.pivot:
+                values = self.center_left_values
+                indices = self.center_left_indices
+                for i in range(self.n_center):
+                    if not values[i] < point:
+                        break
+                    result.append(indices[i])
+                if point < self.left_node.max_right:
+                    self.left_node.query(result, point)
+            elif point > self.pivot:
+                values = self.center_right_values
+                indices = self.center_right_indices
+                for i in range(self.n_center - 1, -1, -1):
+                    if not point < values[i]:
+                        break
+                    result.append(indices[i])
+                if self.right_node.min_left < point:
+                    self.right_node.query(result, point)
+            else:
+                result.extend(self.center_left_indices)
+
+    def __repr__(self):
+        if self.is_leaf_node:
+            return ('<Int64ClosedNeitherIntervalNode: '
+                    '%s elements (terminal)>' % self.n_elements)
+        else:
+            n_left = self.left_node.n_elements
+            n_right = self.right_node.n_elements
+            n_center = self.n_elements - n_left - n_right
+            return ('<Int64ClosedNeitherIntervalNode: pivot %s, '
+                    '%s elements (%s left, %s right, %s overlapping)>' %
+                    (self.pivot, self.n_elements, n_left, n_right, n_center))
+
+    def counts(self):
+        if self.is_leaf_node:
+            return self.n_elements
+        else:
+            m = len(self.center_left_values)
+            l = self.left_node.counts()
+            r = self.right_node.counts()
+            return (m, (l, r))
+
+NODE_CLASSES['int64', 'neither'] = Int64ClosedNeitherIntervalNode
+
+
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 8f3d8e2307f45..800e2e8aa1cc1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3866,6 +3866,49 @@ def test_transform_doesnt_clobber_ints(self):
         expected = gb2.transform('mean')
         tm.assert_frame_equal(result, expected)
+    def test_groupby_categorical_two_columns(self):
+
+        # https://github.com/pydata/pandas/issues/8138
+        d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"], ordered=True),
+             'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]}
+        test = pd.DataFrame(d)
+
+        # Grouping on a single column
+        groups_single_key = test.groupby("cat")
+        res = groups_single_key.agg('mean')
+        exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]},
+
index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat","ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], + "cat": ["a","a","b","b","c","c"], + "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) + tm.assert_frame_equal(res, exp) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6], labels=pd.Categorical(['a', 'b', 'c'])) + values.name = "cat" + groups_double_key = test.groupby([values,'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], + "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) + tm.assert_frame_equal(res, exp) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d9f81968c684d..135521f287f7c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -588,24 +588,24 @@ def test_value_counts(self): arr = np.random.randn(4) factor = cut(arr, 4) - tm.assertIsInstance(factor, Categorical) + # tm.assertIsInstance(factor, n) result = algos.value_counts(factor) - cats = ['(-1.194, -0.535]', '(-0.535, 0.121]', '(0.121, 0.777]', - '(0.777, 1.433]'] - expected_index = CategoricalIndex(cats, cats, ordered=True) - expected = Series([1, 1, 1, 1], index=expected_index) + breaks = [-1.192, -0.535, 0.121, 0.777, 1.433] + expected_index = pd.IntervalIndex.from_breaks(breaks) + expected = Series([1, 1, 1, 1], + index=expected_index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) self.assertEqual(result.tolist(), [4]) - self.assertEqual(result.index[0], 0.997) + self.assertEqual(result.index[0], pd.Interval(0.999, 4.0)) result = algos.value_counts(s, bins=2, sort=False) self.assertEqual(result.tolist(), [2, 2]) - self.assertEqual(result.index[0], 0.997) - self.assertEqual(result.index[1], 2.5) + self.assertEqual(result.index.min(), pd.Interval(0.999, 2.5)) + self.assertEqual(result.index.max(), pd.Interval(2.5, 4.0)) def test_value_counts_dtypes(self): result = algos.value_counts([1, 1.]) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 032e3a186b84a..1fe449fa26aef 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -13,7 +13,7 @@ needs_i8_conversion) import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, - Timedelta) + Timedelta, IntervalIndex, Interval) from pandas.compat import StringIO from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import PandasDelegate, NoNewAttributesMixin @@ -31,6 +31,7 @@ def 
test_string_methods_dont_fail(self): unicode(self.container) # noqa def test_tricky_container(self): + import nose if not hasattr(self, 'unicode_container'): pytest.skip('Need unicode_container to test with this') repr(self.unicode_container) @@ -575,10 +576,10 @@ def test_value_counts_bins(self): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({0.998: 4}) + exp1 = Series({Interval(0.999, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({0.998: 1.0}) + exp1n = Series({Interval(0.999, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -590,17 +591,11 @@ def test_value_counts_bins(self): self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) - exp4 = Series({0.998: 2, - 1.5: 1, - 2.0: 0, - 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) + intervals = IntervalIndex.from_breaks([0.999, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1], index=intervals.take([0, 3, 1])) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series( - {0.998: 0.5, - 1.5: 0.25, - 2.0: 0.0, - 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) + exp4n = Series([0.5, 0.25, 0.25], index=intervals.take([0, 3, 1])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index adacbb95f5162..fe37fa000e687 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -21,7 +21,8 @@ Timestamp, CategoricalIndex, isnull, date_range, DatetimeIndex, period_range, PeriodIndex, - timedelta_range, TimedeltaIndex, NaT) + timedelta_range, TimedeltaIndex, NaT, + Interval) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -1597,11 +1598,11 @@ def setUp(self): self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = ["{0} - 
{1}".format(i, i + 499) for i in range(0, 10000, 500)] + labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] + cat_labels = Categorical(labels, labels) df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, - labels=labels) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) self.cat = df def test_dtypes(self): @@ -2008,9 +2009,8 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100), - dtype='int32')}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100),dtype='int32')}) + labels = Categorical(["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]) df = df.sort_values(by=['value'], ascending=True) s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) @@ -3007,7 +3007,7 @@ def f(x): # GH 9603 df = pd.DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4]) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=pd.Categorical(list('abcd'))) result = df.groupby(c).apply(len) exp_index = pd.CategoricalIndex(c.values.categories, @@ -3124,7 +3124,7 @@ def test_slicing(self): df = DataFrame({'value': (np.arange(100) + 1).astype('int64')}) df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) - expected = Series([11, '(0, 25]'], index=['value', 'D'], name=10) + expected = Series([11, Interval(0, 25)], index=['value','D'], name=10) result = df.iloc[10] tm.assert_series_equal(result, expected) @@ -3134,7 +3134,7 @@ def test_slicing(self): result = df.iloc[10:20] tm.assert_frame_equal(result, expected) - expected = Series([9, '(0, 25]'], index=['value', 'D'], name=8) + expected = Series([9, Interval(0, 25)],index=['value', 'D'], name=8) result = df.loc[8] tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/test_interval.py b/pandas/tests/test_interval.py new file mode 100644 index 0000000000000..1b52e2629b38c --- /dev/null +++ b/pandas/tests/test_interval.py @@ -0,0 +1,591 @@ +from __future__ import division +import numpy as np + +from pandas.core.interval import Interval, IntervalIndex +from pandas.core.index import Index +from pandas.lib import IntervalTree + +import pandas.util.testing as tm +import pandas as pd + + +class TestInterval(tm.TestCase): + def setUp(self): + self.interval = Interval(0, 1) + + def test_properties(self): + self.assertEqual(self.interval.closed, 'right') + self.assertEqual(self.interval.left, 0) + self.assertEqual(self.interval.right, 1) + self.assertEqual(self.interval.mid, 0.5) + + def test_repr(self): + self.assertEqual(repr(self.interval), + "Interval(0, 1, closed='right')") + self.assertEqual(str(self.interval), "(0, 1]") + + interval_left = Interval(0, 1, closed='left') + self.assertEqual(repr(interval_left), + "Interval(0, 1, closed='left')") + self.assertEqual(str(interval_left), "[0, 1)") + + def test_contains(self): + self.assertIn(0.5, self.interval) + self.assertIn(1, self.interval) + self.assertNotIn(0, self.interval) + self.assertRaises(TypeError, lambda: self.interval in self.interval) + + interval = Interval(0, 1, closed='both') + self.assertIn(0, interval) + self.assertIn(1, interval) + + interval = Interval(0, 1, closed='neither') + self.assertNotIn(0, interval) + self.assertIn(0.5, interval) + self.assertNotIn(1, interval) + + def test_equal(self): + self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) + self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) + self.assertNotEqual(Interval(0, 1), 0) + + def test_comparison(self): + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + Interval(0, 1) < 2 + + self.assertTrue(Interval(0, 1) < Interval(1, 2)) + self.assertTrue(Interval(0, 1) < Interval(0, 2)) + self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) + 
self.assertTrue(Interval(0, 1) <= Interval(0, 1)) + self.assertTrue(Interval(0, 1) > Interval(-1, 2)) + self.assertTrue(Interval(0, 1) >= Interval(0, 1)) + + def test_hash(self): + # should not raise + hash(self.interval) + + def test_math_add(self): + expected = Interval(1, 2) + actual = self.interval + 1 + self.assertEqual(expected, actual) + + expected = Interval(1, 2) + actual = 1 + self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual += 1 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval + Interval(1, 2) + + def test_math_sub(self): + expected = Interval(-1, 0) + actual = self.interval - 1 + self.assertEqual(expected, actual) + + actual = self.interval + actual -= 1 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval - Interval(1, 2) + + def test_math_mult(self): + expected = Interval(0, 2) + actual = self.interval * 2 + self.assertEqual(expected, actual) + + expected = Interval(0, 2) + actual = 2 * self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual *= 2 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval * Interval(1, 2) + + def test_math_div(self): + expected = Interval(0, 0.5) + actual = self.interval / 2.0 + self.assertEqual(expected, actual) + + actual = self.interval + actual /= 2.0 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval / Interval(1, 2) + + +class TestIntervalTree(tm.TestCase): + def setUp(self): + self.tree = IntervalTree(np.arange(5), np.arange(5) + 2) + + def test_get_loc(self): + self.assert_numpy_array_equal(self.tree.get_loc(1), [0]) + self.assert_numpy_array_equal(np.sort(self.tree.get_loc(2)), [0, 1]) + with self.assertRaises(KeyError): + self.tree.get_loc(-1) + + def test_get_indexer(self): + self.assert_numpy_array_equal( + self.tree.get_indexer(np.array([1.0, 5.5, 6.5])), [0, 4, -1]) + with 
self.assertRaises(KeyError): + self.tree.get_indexer(np.array([3.0])) + + def test_get_indexer_non_unique(self): + indexer, missing = self.tree.get_indexer_non_unique( + np.array([1.0, 2.0, 6.5])) + self.assert_numpy_array_equal(indexer[:1], [0]) + self.assert_numpy_array_equal(np.sort(indexer[1:3]), [0, 1]) + self.assert_numpy_array_equal(np.sort(indexer[3:]), [-1]) + self.assert_numpy_array_equal(missing, [2]) + + def test_duplicates(self): + tree = IntervalTree([0, 0, 0], [1, 1, 1]) + self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), [0, 1, 2]) + + with self.assertRaises(KeyError): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + self.assert_numpy_array_equal(np.sort(indexer), [0, 1, 2]) + self.assert_numpy_array_equal(missing, []) + + def test_get_loc_closed(self): + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree([0], [1], closed=closed) + for p, errors in [(0, tree.open_left), + (1, tree.open_right)]: + if errors: + with self.assertRaises(KeyError): + tree.get_loc(p) + else: + self.assert_numpy_array_equal(tree.get_loc(p), + np.array([0])) + + def test_get_indexer_closed(self): + x = np.arange(1000) + found = x + not_found = -np.ones(1000) + for leaf_size in [1, 10, 100, 10000]: + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree(x, x + 0.5, closed=closed, + leaf_size=leaf_size) + self.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) + + +class TestIntervalIndex(tm.TestCase): + def setUp(self): + self.index = IntervalIndex([0, 1], [1, 2]) + + def test_constructors(self): + expected = self.index + actual = IntervalIndex.from_breaks(np.arange(3), closed='right') + 
self.assertTrue(expected.equals(actual)) + + alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') + self.assertFalse(expected.equals(alternate)) + + actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) + self.assertTrue(expected.equals(actual)) + + self.assertRaises(ValueError, IntervalIndex, [0], [1], closed='invalid') + + # TODO: fix all these commented out tests (here and below) + + intervals = [Interval(0, 1), Interval(1, 2, closed='left')] + with self.assertRaises(ValueError): + IntervalIndex.from_intervals(intervals) + + with self.assertRaises(ValueError): + IntervalIndex([0, 10], [3, 5]) + + actual = Index([Interval(0, 1), Interval(1, 2)]) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + actual = Index(expected) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + # no point in nesting periods in an IntervalIndex + # self.assertRaises(ValueError, IntervalIndex.from_breaks, + # pd.period_range('2000-01-01', periods=3)) + + def test_properties(self): + self.assertEqual(len(self.index), 2) + self.assertEqual(self.index.size, 2) + + self.assert_numpy_array_equal(self.index.left, [0, 1]) + self.assertIsInstance(self.index.left, Index) + + self.assert_numpy_array_equal(self.index.right, [1, 2]) + self.assertIsInstance(self.index.right, Index) + + self.assert_numpy_array_equal(self.index.mid, [0.5, 1.5]) + self.assertIsInstance(self.index.mid, Index) + + self.assertEqual(self.index.closed, 'right') + + expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(self.index), expected) + self.assert_numpy_array_equal(self.index.values, expected) + + def test_copy(self): + actual = self.index.copy() + self.assertTrue(actual.equals(self.index)) + + actual = self.index.copy(deep=True) + self.assertTrue(actual.equals(self.index)) + self.assertIsNot(actual.left, self.index.left) + + def test_delete(self): 
+ expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.delete(0) + self.assertTrue(expected.equals(actual)) + + def test_insert(self): + expected = IntervalIndex.from_breaks(range(4)) + actual = self.index.insert(2, Interval(2, 3)) + self.assertTrue(expected.equals(actual)) + + self.assertRaises(ValueError, self.index.insert, 0, 1) + self.assertRaises(ValueError, self.index.insert, 0, + Interval(2, 3, closed='left')) + + def test_take(self): + actual = self.index.take([0, 1]) + self.assertTrue(self.index.equals(actual)) + + expected = IntervalIndex([0, 0, 1], [1, 1, 2]) + actual = self.index.take([0, 0, 1]) + self.assertTrue(expected.equals(actual)) + + def test_monotonic_and_unique(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_unique) + + idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) + self.assertTrue(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (1, 2)]) + self.assertFalse(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = IntervalIndex.from_tuples([(0, 2), (0, 2)]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.is_monotonic) + + def test_repr(self): + expected = ("IntervalIndex(left=[0, 1],\n right=[1, 2]," + "\n closed='right')") + IntervalIndex((0, 1), (1, 2), closed='right') + self.assertEqual(repr(self.index), expected) + + def test_get_loc_value(self): + self.assertRaises(KeyError, self.index.get_loc, 0) + self.assertEqual(self.index.get_loc(0.5), 0) + self.assertEqual(self.index.get_loc(1), 0) + self.assertEqual(self.index.get_loc(1.5), 1) + self.assertEqual(self.index.get_loc(2), 1) + self.assertRaises(KeyError, self.index.get_loc, -1) + self.assertRaises(KeyError, self.index.get_loc, 3) + + idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + self.assertEqual(idx.get_loc(0.5), 0) + self.assertEqual(idx.get_loc(1), 0) + self.assert_numpy_array_equal(idx.get_loc(1.5), [0, 1]) + 
self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), [0, 1]) + self.assertEqual(idx.get_loc(3), 1) + self.assertRaises(KeyError, idx.get_loc, 3.5) + + idx = IntervalIndex([0, 2], [1, 3]) + self.assertRaises(KeyError, idx.get_loc, 1.5) + + def slice_locs_cases(self, breaks): + # TODO: same tests for more index types + index = IntervalIndex.from_breaks([0, 1, 2], closed='right') + self.assertEqual(index.slice_locs(), (0, 2)) + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(1, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(0, 0.5), (0, 1)) + self.assertEqual(index.slice_locs(start=1), (0, 2)) + self.assertEqual(index.slice_locs(start=1.2), (1, 2)) + self.assertEqual(index.slice_locs(end=1), (0, 1)) + self.assertEqual(index.slice_locs(end=1.1), (0, 2)) + self.assertEqual(index.slice_locs(end=1.0), (0, 1)) + self.assertEqual(*index.slice_locs(-1, -1)) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(1, 1), (1, 1)) + self.assertEqual(index.slice_locs(1, 2), (1, 2)) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='both') + self.assertEqual(index.slice_locs(1, 1), (0, 2)) + self.assertEqual(index.slice_locs(1, 2), (0, 2)) + + def test_slice_locs_int64(self): + self.slice_locs_cases([0, 1, 2]) + + def test_slice_locs_float64(self): + self.slice_locs_cases([0.0, 1.0, 2.0]) + + def slice_locs_decreasing_cases(self, tuples): + index = IntervalIndex.from_tuples(tuples) + self.assertEqual(index.slice_locs(1.5, 0.5), (1, 3)) + self.assertEqual(index.slice_locs(2, 0), (1, 3)) + self.assertEqual(index.slice_locs(2, 1), (1, 3)) + self.assertEqual(index.slice_locs(3, 1.1), (0, 3)) + self.assertEqual(index.slice_locs(3, 3), 
(0, 2)) + self.assertEqual(index.slice_locs(3.5, 3.3), (0, 1)) + self.assertEqual(index.slice_locs(1, -3), (2, 3)) + self.assertEqual(*index.slice_locs(-1, -1)) + + def test_slice_locs_decreasing_int64(self): + self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) + + def test_slice_locs_decreasing_float64(self): + self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) + + def test_slice_locs_fails(self): + index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) + with self.assertRaises(KeyError): + index.slice_locs(1, 2) + + def test_get_loc_interval(self): + self.assertEqual(self.index.get_loc(Interval(0, 1)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) + self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) + self.assertRaises(KeyError, self.index.get_loc, Interval(-1, 0, 'left')) + + def test_get_indexer(self): + actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = [-1, -1, 0, 0, 1, 1, -1] + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(self.index) + expected = [0, 1] + self.assert_numpy_array_equal(actual, expected) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='left') + actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = [-1, 0, 0, 1, 1, -1, -1] + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(index[:1]) + expected = [0] + self.assert_numpy_array_equal(actual, expected) + + self.assertRaises(ValueError, self.index.get_indexer, index) + + def test_get_indexer_subintervals(self): + # return indexers for wholly contained subintervals + target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) + actual = self.index.get_indexer(target) + expected = [0, 0, 1, 1] + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) + self.assertRaises(ValueError, self.index.get_indexer, target) + + actual = 
self.index.get_indexer(target[[0, -1]]) + expected = [0, 1] + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') + actual = self.index.get_indexer(target) + expected = [0, 0, 0] + self.assert_numpy_array_equal(actual, expected) + + def test_contains(self): + self.assertNotIn(0, self.index) + self.assertIn(0.5, self.index) + self.assertIn(2, self.index) + + self.assertIn(Interval(0, 1), self.index) + self.assertIn(Interval(0, 2), self.index) + self.assertIn(Interval(0, 0.5), self.index) + self.assertNotIn(Interval(3, 5), self.index) + self.assertNotIn(Interval(-1, 0, closed='left'), self.index) + + def test_non_contiguous(self): + index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) + target = [0.5, 1.5, 2.5] + actual = index.get_indexer(target) + expected = [0, -1, 1] + self.assert_numpy_array_equal(actual, expected) + + self.assertNotIn(1.5, index) + + def test_union(self): + other = IntervalIndex([2], [3]) + expected = IntervalIndex(range(3), range(1, 4)) + actual = self.index.union(other) + self.assertTrue(expected.equals(actual)) + + actual = other.union(self.index) + self.assertTrue(expected.equals(actual)) + + self.assert_numpy_array_equal(self.index.union(self.index), self.index) + self.assert_numpy_array_equal(self.index.union(self.index[:1]), + self.index) + + def test_intersection(self): + other = IntervalIndex.from_breaks([1, 2, 3]) + expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.intersection(other) + self.assertTrue(expected.equals(actual)) + + self.assert_numpy_array_equal(self.index.intersection(self.index), + self.index) + + def test_difference(self): + self.assert_numpy_array_equal(self.index.difference(self.index[:1]), + self.index[1:]) + + def test_sym_diff(self): + self.assert_numpy_array_equal(self.index[:1].sym_diff(self.index[1:]), + self.index) + + def test_set_operation_errors(self): + self.assertRaises(ValueError, self.index.union, self.index.left) + 
+ other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertRaises(ValueError, self.index.union, other) + + def test_isin(self): + actual = self.index.isin(self.index) + self.assert_numpy_array_equal([True, True], actual) + + actual = self.index.isin(self.index[:1]) + self.assert_numpy_array_equal([True, False], actual) + + def test_comparison(self): + actual = Interval(0, 1) < self.index + expected = [False, True] + self.assert_numpy_array_equal(actual, expected) + + actual = Interval(0.5, 1.5) < self.index + expected = [False, True] + self.assert_numpy_array_equal(actual, expected) + actual = self.index > Interval(0.5, 1.5) + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index + expected = [True, True] + self.assert_numpy_array_equal(actual, expected) + actual = self.index <= self.index + self.assert_numpy_array_equal(actual, expected) + actual = self.index >= self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index < self.index + expected = [False, False] + self.assert_numpy_array_equal(actual, expected) + actual = self.index > self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index.values + self.assert_numpy_array_equal(actual, [True, True]) + actual = self.index.values == self.index + self.assert_numpy_array_equal(actual, [True, True]) + actual = self.index <= self.index.values + self.assert_numpy_array_equal(actual, [True, True]) + actual = self.index != self.index.values + self.assert_numpy_array_equal(actual, [False, False]) + actual = self.index > self.index.values + self.assert_numpy_array_equal(actual, [False, False]) + actual = self.index.values > self.index + self.assert_numpy_array_equal(actual, [False, False]) + + # invalid comparisons + actual = self.index == 0 + self.assert_numpy_array_equal(actual, [False, 
False]) + actual = self.index == self.index.left + self.assert_numpy_array_equal(actual, [False, False]) + + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index > 0 + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index <= 0 + with self.assertRaises(TypeError): + self.index > np.arange(2) + with self.assertRaises(ValueError): + self.index > np.arange(3) + + def test_missing_values(self): + idx = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) + idx2 = pd.IntervalIndex([np.nan, 0, 1], [np.nan, 1, 2]) + assert idx.equals(idx2) + + with tm.assertRaisesRegexp(ValueError, 'both left and right sides'): + pd.IntervalIndex([np.nan, 0, 1], [0, 1, 2]) + + self.assert_numpy_array_equal(pd.isnull(idx), [True, False, False]) + + def test_order(self): + expected = IntervalIndex.from_breaks([1, 2, 3, 4]) + actual = IntervalIndex.from_tuples([(3, 4), (1, 2), (2, 3)]).order() + self.assert_numpy_array_equal(expected, actual) + + def test_datetime(self): + dates = pd.date_range('2000', periods=3) + idx = IntervalIndex.from_breaks(dates) + + self.assert_numpy_array_equal(idx.left, dates[:2]) + self.assert_numpy_array_equal(idx.right, dates[-2:]) + + expected = pd.date_range('2000-01-01T12:00', periods=2) + self.assert_numpy_array_equal(idx.mid, expected) + + self.assertIn('2000-01-01T12', idx) + + target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') + actual = idx.get_indexer(target) + expected = [-1, -1, 0, 0, 1, 1, -1] + self.assert_numpy_array_equal(actual, expected) + + # def test_math(self): + # # add, subtract, multiply, divide with scalars should be OK + # actual = 2 * self.index + 1 + # expected = IntervalIndex.from_breaks((2 * np.arange(3) + 1)) + # self.assertTrue(expected.equals(actual)) + + # actual = self.index / 2.0 - 1 + # expected = IntervalIndex.from_breaks((np.arange(3) / 2.0 - 1)) + # self.assertTrue(expected.equals(actual)) + + # with self.assertRaises(TypeError): + # # doesn't make sense to add 
two IntervalIndex objects + # self.index + self.index + + # def test_datetime_math(self): + + # expected = IntervalIndex(pd.date_range('2000-01-02', periods=3)) + # actual = idx + pd.to_timedelta(1, unit='D') + # self.assertTrue(expected.equals(actual)) + + # TODO: other set operations (left join, right join, intersection), + # set operations with conflicting IntervalIndex objects or other dtypes, + # groupby, cut, reset_index... diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index cc80c1ff5db29..2d657c14b73a6 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,12 +3,14 @@ import numpy as np from pandas.compat import zip -from pandas import Series, Index, Categorical +from pandas import DataFrame, Series, Index, unique, isnull, Categorical import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com from pandas.core.algorithms import quantile +from pandas.core.categorical import Categorical +from pandas.core.interval import Interval, IntervalIndex from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod from pandas import to_datetime, DatetimeIndex, Timestamp @@ -27,34 +29,30 @@ def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) - exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) - tm.assert_numpy_array_equal(result.codes, exp_codes) - exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) - tm.assert_almost_equal(bins, exp) + intervals = IntervalIndex.from_breaks(bins.round(3)) + tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) + tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) - exp_codes = np.array([0, 0, 0, 2, 3, 0, 0], dtype=np.int8) - tm.assert_numpy_array_equal(result.codes, exp_codes) - 
exp = np.array([0.1905, 2.575, 4.95, 7.325, 9.7]) - tm.assert_numpy_array_equal(bins, exp) + intervals = IntervalIndex.from_breaks(bins.round(3)) + tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 0])) + tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) - exp_codes = np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8) - tm.assert_numpy_array_equal(result.codes, exp_codes) - exp = np.array([0.2, 2.575, 4.95, 7.325, 9.7095]) - tm.assert_almost_equal(bins, exp) + intervals = IntervalIndex.from_breaks(bins.round(3), closed='left') + tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 1])) + tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) - exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) - tm.assert_numpy_array_equal(result.codes, exp_codes) - exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) - tm.assert_almost_equal(bins, exp) + intervals = IntervalIndex.from_breaks(bins.round(3)) + tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) + tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] @@ -82,14 +80,13 @@ def test_labels(self): arr = np.tile(np.arange(0, 1.01, 0.1), 4) result, bins = cut(arr, 4, retbins=True) - ex_levels = Index(['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', - '(0.75, 1]']) - self.assert_index_equal(result.categories, ex_levels) + ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1]) + self.assert_numpy_array_equal(unique(result), ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) - ex_levels = Index(['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', - '[0.75, 1.001)']) - self.assert_index_equal(result.categories, 
ex_levels) + ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3], + closed='left') + self.assert_numpy_array_equal(unique(result), ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -101,9 +98,8 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = Index(['(-0.00072, 0.18]', '(0.18, 0.36]', - '(0.36, 0.54]', '(0.54, 0.72]']) - self.assert_index_equal(result.categories, ex_levels) + ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72]) + self.assert_numpy_array_equal(unique(result), ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -125,17 +121,16 @@ def test_inf_handling(self): data = np.arange(6) data_ser = Series(data, dtype='int64') - result = cut(data, [-np.inf, 2, 4, np.inf]) - result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) + bins = [-np.inf, 2, 4, np.inf] + result = cut(data, bins) + result_ser = cut(data_ser, bins) - ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]']) - - tm.assert_index_equal(result.categories, ex_categories) - tm.assert_index_equal(result_ser.cat.categories, ex_categories) - self.assertEqual(result[5], '(4, inf]') - self.assertEqual(result[0], '(-inf, 2]') - self.assertEqual(result_ser[5], '(4, inf]') - self.assertEqual(result_ser[0], '(-inf, 2]') + ex_uniques = IntervalIndex.from_breaks(bins).values + tm.assert_numpy_array_equal(unique(result), ex_uniques) + self.assertEqual(result[5], Interval(4, np.inf)) + self.assertEqual(result[0], Interval(-np.inf, 2)) + self.assertEqual(result_ser[5], Interval(4, np.inf)) + self.assertEqual(result_ser[0], Interval(-np.inf, 2)) def test_qcut(self): arr = np.random.randn(1000) @@ -158,7 +153,7 @@ def test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - tm.assert_categorical_equal(factor, expected) + self.assert_numpy_array_equal(factor, expected) def 
test_qcut_all_bins_same(self): assertRaisesRegexp(ValueError, "edges.*unique", qcut, @@ -169,7 +164,7 @@ def test_cut_out_of_bounds(self): result = cut(arr, [-1, 0, 1]) - mask = result.codes == -1 + mask = isnull(result) ex_mask = (arr < -1) | (arr > 1) self.assert_numpy_array_equal(mask, ex_mask) @@ -179,19 +174,21 @@ def test_cut_pass_labels(self): labels = ['Small', 'Medium', 'Large'] result = cut(arr, bins, labels=labels) + exp = ['Medium'] + 4 * ['Small'] + ['Medium', 'Large'] + self.assert_numpy_array_equal(result, exp) - exp = cut(arr, bins) - exp.categories = labels - - tm.assert_categorical_equal(result, exp) + result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], labels)) + exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels) + self.assertTrue(result.equals(exp)) def test_qcut_include_lowest(self): values = np.arange(10) cats = qcut(values, 4) - ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]'] - self.assertTrue((cats.categories == ex_levels).all()) + ex_levels = [Interval(0, 2.25, closed='both'), Interval(2.25, 4.5), + Interval(4.5, 6.75), Interval(6.75, 9)] + self.assert_numpy_array_equal(unique(cats), ex_levels) def test_qcut_nas(self): arr = np.random.randn(100) @@ -200,9 +197,15 @@ def test_qcut_nas(self): result = qcut(arr, 4) self.assertTrue(com.isnull(result[:20]).all()) - def test_label_formatting(self): - self.assertEqual(tmod._trim_zeros('1.000'), '1') + def test_qcut_index(self): + # the result is closed on a different side for the first interval, but + # we should still be able to make an index + result = qcut([0, 2], 2) + index = Index(result) + expected = Index([Interval(0, 1, closed='both'), Interval(1, 2)]) + self.assert_numpy_array_equal(index, expected) + def test_round_frac(self): # it works result = cut(np.arange(11.), 2) @@ -210,10 +213,15 @@ def test_label_formatting(self): # #1979, negative numbers - result = tmod._format_label(-117.9998, precision=3) - self.assertEqual(result, '-118') - result 
= tmod._format_label(117.9998, precision=3) - self.assertEqual(result, '118') + result = tmod._round_frac(-117.9998, precision=3) + self.assertEqual(result, -118) + result = tmod._round_frac(117.9998, precision=3) + self.assertEqual(result, 118) + + result = tmod._round_frac(117.9998, precision=2) + self.assertEqual(result, 118) + result = tmod._round_frac(0.000123456, precision=2) + self.assertEqual(result, 0.00012) def test_qcut_binning_issues(self): # #1978, 1979 @@ -224,9 +232,9 @@ def test_qcut_binning_issues(self): starts = [] ends = [] - for lev in result.categories: - s, e = lev[1:-1].split(',') - + for lev in np.unique(result): + s = lev.left + e = lev.right self.assertTrue(s != e) starts.append(float(s)) @@ -238,22 +246,20 @@ def test_qcut_binning_issues(self): self.assertTrue(ep < en) self.assertTrue(ep <= sn) - def test_cut_return_categorical(self): - s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) - res = cut(s, 3) - exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], - ["(-0.008, 2.667]", - "(2.667, 5.333]", "(5.333, 8]"], - ordered=True)) + def test_cut_return_intervals(self): + s = Series([0,1,2,3,4,5,6,7,8]) + res = cut(s,3) + exp_bins = np.linspace(0, 8, num=4).round(3) + exp_bins[0] -= 0.008 + exp = Series(IntervalIndex.from_breaks(exp_bins).take([0,0,0,1,1,1,2,2,2])) tm.assert_series_equal(res, exp) - def test_qcut_return_categorical(self): - s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) - res = qcut(s, [0, 0.333, 0.666, 1]) - exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], - ["[0, 2.664]", - "(2.664, 5.328]", "(5.328, 8]"], - ordered=True)) + def test_qcut_return_intervals(self): + s = Series([0,1,2,3,4,5,6,7,8]) + res = qcut(s,[0,0.333,0.666,1]) + exp_levels = np.array([Interval(0, 2.664, closed='both'), + Interval(2.664, 5.328), Interval(5.328, 8)]) + exp = Series(exp_levels.take([0,0,0,1,1,1,2,2,2])) tm.assert_series_equal(res, exp) def test_series_retbins(self): @@ -421,6 +427,14 @@ def test_datetime_bin(self): result = 
cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected) + result, bins = cut(s, 2, retbins=True, labels=[0, 1]) + tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) + tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + + result, bins = qcut(s, 2, retbins=True, labels=[0, 1]) + tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) + tm.assert_almost_equal(bins, [0, 1.5, 3]) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 4a3d452228e01..f1ca7ff4b19ba 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -8,6 +8,8 @@ from pandas.core.api import Series from pandas.core.categorical import Categorical +from pandas.core.index import _ensure_index +from pandas.core.interval import IntervalIndex, Interval import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip @@ -17,6 +19,8 @@ import numpy as np +import warnings + def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): @@ -45,9 +49,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. - precision : int + precision : int, optional The precision at which to store and display the bins labels - include_lowest : bool + include_lowest : bool, optional Whether the first interval should be left-inclusive or not. Returns @@ -93,14 +97,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - sz = x.size + # TODO: IntervalIndex + try: # for array-like + sz = x.size + except AttributeError: + x = np.asarray(x) + sz = x.size if sz == 0: raise ValueError('Cannot cut empty array') - # handle empty arrays. Can't determine range, so use 0-1. 
- # rng = (0, 1) - else: - rng = (nanops.nanmin(x), nanops.nanmax(x)) + + rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if mn == mx: # adjust end points before binning @@ -149,7 +156,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. - precision : int + precision : int, optional The precision at which to store and display the bins labels duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. @@ -225,6 +232,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: + + # TODO: IntervalIndex increases = 0 while True: try: @@ -239,23 +248,34 @@ def _bins_to_cuts(x, bins, right=True, labels=None, else: break + # + #closed = 'right' if right else 'left' + #precision = _infer_precision(precision, bins) + #breaks = [_round_frac(b, precision) for b in bins] + #labels = IntervalIndex.from_breaks(breaks, closed=closed).values + + #if right and include_lowest: + # labels[0] = Interval(labels[0].left, labels[0].right, + # closed='both') + else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') - levels = labels - levels = np.asarray(levels, dtype=object) + if not com.is_categorical(labels): + labels = np.asarray(labels) + np.putmask(ids, na_mask, 0) - fac = Categorical(ids - 1, levels, ordered=True, fastpath=True) + result = com.take_nd(labels, ids - 1) + else: - fac = ids - 1 + result = ids - 1 if has_nas: - fac = fac.astype(np.float64) - np.putmask(fac, na_mask, np.nan) - - return fac, bins + result = result.astype(np.float64) + np.putmask(result, na_mask, np.nan) + return result, bins def _format_levels(bins, prec, right=True, include_lowest=False, dtype=None): @@ -265,15 +285,11 @@ def _format_levels(bins, prec, right=True, for a, b in zip(bins, 
bins[1:]): fa, fb = fmt(a), fmt(b) - if a != b and fa == fb: - raise ValueError('precision too low') - - formatted = '(%s, %s]' % (fa, fb) - - levels.append(formatted) - - if include_lowest: - levels[0] = '[' + levels[0][1:] +def _round_frac(x, precision): + """Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x else: levels = ['[%s, %s)' % (fmt(a), fmt(b)) for a, b in zip(bins, bins[1:])] @@ -291,30 +307,11 @@ def _format_label(x, precision=3, dtype=None): return str(x) elif is_float(x): frac, whole = np.modf(x) - sgn = '-' if x < 0 else '' - whole = abs(whole) - if frac != 0.0: - val = fmt_str % frac - - # rounded up or down - if '.' not in val: - if x < 0: - return '%d' % (-whole - 1) - else: - return '%d' % (whole + 1) - - if 'e' in val: - return _trim_zeros(fmt_str % x) - else: - val = _trim_zeros(val) - if '.' in val: - return sgn + '.'.join(('%d' % whole, val.split('.')[1])) - else: # pragma: no cover - return sgn + '.'.join(('%d' % whole, val)) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision else: - return sgn + '%0.f' % whole - else: - return str(x) + digits = precision + return np.around(x, digits) def _trim_zeros(x): @@ -388,3 +385,12 @@ def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): return fac return fac, bins + +def _infer_precision(base_precision, bins): + """Infer an appropriate precision for _round_frac + """ + for precision in range(base_precision, 20): + levels = [_round_frac(b, precision) for b in bins] + if algos.unique(levels).size == bins.size: + return precision + return base_precision # default diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d5986a7f390e5..28214b1462cb7 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -43,9 +43,11 @@ from pandas.computation import expressions as expr -from pandas import (bdate_range, CategoricalIndex, Categorical, DatetimeIndex, - TimedeltaIndex, PeriodIndex, 
RangeIndex, Index, MultiIndex, +from pandas import (bdate_range, CategoricalIndex, Categorical, IntervalIndex, + DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, + Index, MultiIndex, Series, DataFrame, Panel, Panel4D) + from pandas.util.decorators import deprecate from pandas.util import libtesting from pandas.io.common import urlopen @@ -1687,6 +1689,11 @@ def makeCategoricalIndex(k=10, n=3, name=None): return CategoricalIndex(np.random.choice(x, k), name=name) +def makeIntervalIndex(k=10, name=None): + """ make a length k IntervalIndex """ + x = np.linspace(0, 100, num=(k + 1)) + return IntervalIndex.from_breaks(x, name=name) + def makeBoolIndex(k=10, name=None): if k == 1: return Index([True], name=name) From 340c98bdd4cca28d664a906c0390141e86fd310d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Feb 2017 09:24:09 -0500 Subject: [PATCH 02/12] CLN/COMPAT: IntervalIndex --- asv_bench/benchmarks/indexing.py | 20 + doc/source/api.rst | 21 + doc/source/whatsnew/v0.20.0.txt | 31 + pandas/_libs/hashtable.pyx | 2 + pandas/{src => _libs}/interval.pyx | 104 +- .../intervaltree.pxi.in} | 207 +-- pandas/_libs/lib.pyx | 6 +- pandas/_libs/src/inference.pyx | 28 +- pandas/_libs/tslib.pyx | 12 + pandas/core/algorithms.py | 22 +- pandas/core/api.py | 4 +- pandas/core/groupby.py | 40 +- pandas/core/indexing.py | 8 +- pandas/core/interval.py | 521 ------ pandas/formats/format.py | 15 + pandas/indexes/api.py | 3 +- pandas/indexes/base.py | 73 +- pandas/indexes/category.py | 70 +- pandas/indexes/interval.py | 983 +++++++++++ pandas/indexes/multi.py | 4 +- pandas/src/intervaltree.pyx | 1444 ----------------- pandas/tests/api/test_api.py | 4 +- pandas/tests/frame/test_alter_axes.py | 66 +- pandas/tests/groupby/test_categorical.py | 5 +- pandas/tests/groupby/test_groupby.py | 49 +- pandas/tests/indexes/common.py | 25 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexes/test_category.py | 18 +- pandas/tests/indexes/test_interval.py | 799 +++++++++ 
pandas/tests/indexing/test_interval.py | 141 ++ pandas/tests/scalar/test_interval.py | 129 ++ pandas/tests/series/test_constructors.py | 14 +- pandas/tests/series/test_missing.py | 11 +- pandas/tests/test_algos.py | 25 +- pandas/tests/test_base.py | 21 +- pandas/tests/test_categorical.py | 12 +- pandas/tests/test_interval.py | 591 ------- pandas/tests/tools/test_tile.py | 224 +-- pandas/tests/types/test_dtypes.py | 118 +- pandas/tests/types/test_missing.py | 8 + pandas/tools/tile.py | 183 ++- pandas/tseries/base.py | 10 +- pandas/tseries/interval.py | 35 - pandas/tseries/period.py | 3 + pandas/types/api.py | 4 + pandas/types/common.py | 23 + pandas/types/dtypes.py | 109 ++ pandas/types/generic.py | 4 +- pandas/types/inference.py | 2 + pandas/types/missing.py | 5 +- pandas/util/testing.py | 11 + setup.py | 5 + 52 files changed, 3211 insertions(+), 3065 deletions(-) rename pandas/{src => _libs}/interval.pyx (68%) rename pandas/{src/generate_intervaltree.py => _libs/intervaltree.pxi.in} (68%) delete mode 100644 pandas/core/interval.py create mode 100644 pandas/indexes/interval.py delete mode 100644 pandas/src/intervaltree.pyx create mode 100644 pandas/tests/indexes/test_interval.py create mode 100644 pandas/tests/indexing/test_interval.py create mode 100644 pandas/tests/scalar/test_interval.py delete mode 100644 pandas/tests/test_interval.py delete mode 100644 pandas/tseries/interval.py diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index d938cc6a6dc4d..a32c9f25a0f09 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -226,6 +226,26 @@ def time_is_monotonic(self): self.miint.is_monotonic +class IntervalIndexing(object): + goal_time = 0.2 + + def setup(self): + self.monotonic = Series(np.arange(1000000), + index=IntervalIndex.from_breaks(np.arange(1000001))) + + def time_getitem_scalar(self): + self.monotonic[80000] + + def time_loc_scalar(self): + self.monotonic.loc[80000] + + def 
time_getitem_list(self): + self.monotonic[80000:] + + def time_loc_list(self): + self.monotonic.loc[80000:] + + class PanelIndexing(object): goal_time = 0.2 diff --git a/doc/source/api.rst b/doc/source/api.rst index bf9d521e2a12a..6ba8c2b8ead67 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1405,6 +1405,27 @@ Categorical Components CategoricalIndex.as_ordered CategoricalIndex.as_unordered +.. _api.intervalindex: + +IntervalIndex +------------- + +.. autosummary:: + :toctree: generated/ + + IntervalIndex + +IntervalIndex Components +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + IntervalIndex.from_arrays + IntervalIndex.from_tuples + IntervalIndex.from_breaks + IntervalIndex.from_intervals + .. _api.multiindex: MultiIndex diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a105a6801fb61..6daeb29a6e67e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -13,6 +13,7 @@ Highlights include: - ``Panel`` has been deprecated, see :ref:`here ` - Improved user API when accessing levels in ``.groupby()``, see :ref:`here ` - Improved support for UInt64 dtypes, see :ref:`here ` +- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here ` - A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here ` - Window Binary Corr/Cov operations return a MultiIndexed ``DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here ` - Support for S3 handling now uses ``s3fs``, see :ref:`here ` @@ -314,6 +315,36 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you sdf.to_coo() +.. _whatsnew_0200.enhancements.intervalindex: + +IntervalIndex +^^^^^^^^^^^^^ + +pandas has gain an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. 
These allow first-class support for interval +notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`) + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: pd.cut(range(3), 2) + Out[2]: + [(-0.002, 1], (-0.002, 1], (1, 2]] + Categories (2, object): [(-0.002, 1] < (1, 2]] + + # the returned categories are strings, representing Intervals + In [3]: pd.cut(range(3), 2).categories + Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object') + +**New behavior**: + +.. ipython:: python + + c = pd.cut(range(3), 2) + c + c.categories + pd.api.types.is_interval_dtype(c.categories) + .. _whatsnew_0200.enhancements.other: Other Enhancements diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index a4e5bee9a8746..c8aedcef77502 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -39,6 +39,8 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) +cdef size_t _INIT_VEC_CAP = 128 + include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" diff --git a/pandas/src/interval.pyx b/pandas/_libs/interval.pyx similarity index 68% rename from pandas/src/interval.pyx rename to pandas/_libs/interval.pyx index 495730e0fd6a1..60a34aff16e9d 100644 --- a/pandas/src/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -2,8 +2,11 @@ cimport numpy as np import numpy as np import pandas as pd +cimport util cimport cython import cython +from numpy cimport * +from tslib import Timestamp from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, PyObject_RichCompare) @@ -44,6 +47,20 @@ cdef _interval_like(other): cdef class Interval(IntervalMixin): + """ + Immutable object implementing an Interval, a bounded slice-like interval. + + .. versionadded:: 0.20.0 + + Properties + ---------- + left, right : values + Left and right bounds for each interval. 
+ closed : {'left', 'right', 'both', 'neither'} + Whether the interval is closed on the left-side, right-side, both or + neither. Defaults to 'right'. + """ + cdef readonly object left, right cdef readonly str closed @@ -84,88 +101,115 @@ cdef class Interval(IntervalMixin): return NotImplemented else: op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError('unorderable types: %s() %s %s()' % - (type(self).__name__, op_str, type(other).__name__)) + raise TypeError( + 'unorderable types: %s() %s %s()' % + (type(self).__name__, op_str, type(other).__name__)) def __reduce__(self): args = (self.left, self.right, self.closed) return (type(self), args) + def _repr_base(self): + left = self.left + right = self.right + + # TODO: need more general formatting methodology here + if isinstance(left, Timestamp) and isinstance(right, Timestamp): + left = left._short_repr + right = right._short_repr + + return left, right + def __repr__(self): + + left, right = self._repr_base() return ('%s(%r, %r, closed=%r)' % - (type(self).__name__, self.left, self.right, self.closed)) + (type(self).__name__, left, right, self.closed)) def __str__(self): + + left, right = self._repr_base() start_symbol = '[' if self.closed_left else '(' end_symbol = ']' if self.closed_right else ')' - return '%s%s, %s%s' % (start_symbol, self.left, self.right, end_symbol) + return '%s%s, %s%s' % (start_symbol, left, right, end_symbol) def __add__(self, y): if isinstance(y, numbers.Number): return Interval(self.left + y, self.right + y) elif isinstance(y, Interval) and isinstance(self, numbers.Number): return Interval(y.left + self, y.right + self) - else: - raise NotImplemented + return NotImplemented def __sub__(self, y): if isinstance(y, numbers.Number): return Interval(self.left - y, self.right - y) - else: - raise NotImplemented + return NotImplemented def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y) elif isinstance(y, Interval) 
and isinstance(self, numbers.Number): return Interval(y.left * self, y.right * self) - else: - return NotImplemented + return NotImplemented def __div__(self, y): if isinstance(y, numbers.Number): return Interval(self.left / y, self.right / y) - else: - return NotImplemented + return NotImplemented def __truediv__(self, y): if isinstance(y, numbers.Number): return Interval(self.left / y, self.right / y) - else: - return NotImplemented + return NotImplemented def __floordiv__(self, y): if isinstance(y, numbers.Number): return Interval(self.left // y, self.right // y) - else: - return NotImplemented + return NotImplemented @cython.wraparound(False) @cython.boundscheck(False) -cpdef interval_bounds_to_intervals(np.ndarray left, np.ndarray right, - str closed): - result = np.empty(len(left), dtype=object) - nulls = pd.isnull(left) | pd.isnull(right) - result[nulls] = np.nan - for i in np.flatnonzero(~nulls): - result[i] = Interval(left[i], right[i], closed) - return result +cpdef intervals_to_interval_bounds(ndarray intervals): + """ + Parameters + ---------- + intervals: ndarray object array of Intervals / nulls + Returns + ------- + tuples (left: ndarray object array, + right: ndarray object array, + closed: str) + + """ + + cdef: + object closed = None, interval + int64_t n = len(intervals) + ndarray left, right + + left = np.empty(n, dtype=object) + right = np.empty(n, dtype=object) -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef intervals_to_interval_bounds(np.ndarray intervals): - left = np.empty(len(intervals), dtype=object) - right = np.empty(len(intervals), dtype=object) - cdef str closed = None for i in range(len(intervals)): interval = intervals[i] + if util._checknull(interval): + left[i] = np.nan + right[i] = np.nan + continue + + if not isinstance(interval, Interval): + raise TypeError("type {} with value {} is not an interval".format( + type(interval), interval)) + left[i] = interval.left right[i] = interval.right if closed is None: closed 
= interval.closed elif closed != interval.closed: raise ValueError('intervals must all be closed on the same side') + return left, right, closed +include "intervaltree.pxi" diff --git a/pandas/src/generate_intervaltree.py b/pandas/_libs/intervaltree.pxi.in similarity index 68% rename from pandas/src/generate_intervaltree.py rename to pandas/_libs/intervaltree.pxi.in index 275a0d40e2433..4fa0d6d156fa2 100644 --- a/pandas/src/generate_intervaltree.py +++ b/pandas/_libs/intervaltree.pxi.in @@ -1,22 +1,9 @@ """ -This file generates `intervaltree.pyx` which is then included in `../lib.pyx` -during building. To regenerate `intervaltree.pyx`, just run: +Template for intervaltree - `python generate_intervaltree.py`. +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -from __future__ import print_function -import os -from pandas.compat import StringIO -import numpy as np - -warning_to_new_contributors = """ -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. 
-""" - -header = r''' from numpy cimport int64_t, float64_t from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take import numpy as np @@ -28,22 +15,27 @@ from hashtable cimport Int64Vector, Int64VectorData -ctypedef fused scalar64_t: +ctypedef fused scalar_t: float64_t + float32_t int64_t + int32_t -NODE_CLASSES = {} - +#---------------------------------------------------------------------- +# IntervalTree +#---------------------------------------------------------------------- cdef class IntervalTree(IntervalMixin): """A centered interval tree Based off the algorithm described on Wikipedia: http://en.wikipedia.org/wiki/Interval_tree + + we are emulating the IndexEngine interface """ cdef: - readonly object left, right, root + readonly object left, right, root, dtype readonly str closed object _left_sorter, _right_sorter @@ -67,15 +59,15 @@ def __init__(self, left, right, closed='right', leaf_size=100): left = np.asarray(left) right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, dtype=dtype) - self.right = np.asarray(right, dtype=dtype) + self.dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=self.dtype) + self.right = np.asarray(right, dtype=self.dtype) indices = np.arange(len(left), dtype='int64') self.closed = closed - node_cls = NODE_CLASSES[str(dtype), closed] + node_cls = NODE_CLASSES[str(self.dtype), closed] self.root = node_cls(self.left, self.right, indices, leaf_size) @property @@ -94,7 +86,7 @@ def right_sorter(self): self._right_sorter = np.argsort(self.right) return self._right_sorter - def get_loc(self, scalar64_t key): + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key """ @@ -131,13 +123,15 @@ def get_loc_interval(self, key_left, key_right): uniques = pd.unique(combined) return uniques - def get_indexer(self, scalar64_t[:] target): + def get_indexer(self, scalar_t[:] target): """Return the 
positions corresponding to unique intervals that overlap with the given array of scalar targets. """ + # TODO: write get_indexer_intervals cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result result = Int64Vector() @@ -152,12 +146,13 @@ def get_indexer(self, scalar64_t[:] target): old_len = result.data.n return result.to_array() - def get_indexer_non_unique(self, scalar64_t[:] target): + def get_indexer_non_unique(self, scalar_t[:] target): """Return the positions corresponding to intervals that overlap with the given array of scalar targets. Non-unique positions are repeated. """ cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result, missing result = Int64Vector() @@ -172,8 +167,14 @@ def get_indexer_non_unique(self, scalar64_t[:] target): return result.to_array(), missing.to_array() def __repr__(self): - return ('' - % self.root.n_elements) + return (''.format( + dtype=self.dtype, closed=self.closed, + n_elements=self.root.n_elements)) + + # compat with IndexEngine interface + def clear_mapping(self): + pass cdef take(ndarray source, ndarray indices): @@ -189,37 +190,66 @@ def __repr__(self): sorted_values = take(values, sorter) sorted_indices = take(indices, sorter) return sorted_values, sorted_indices -''' + +#---------------------------------------------------------------------- +# Nodes +#---------------------------------------------------------------------- # we need specialized nodes and leaves to optimize for different dtype and # closed values -# unfortunately, fused dtypes can't parameterize attributes on extension types, -# so we're stuck using template generation. 
-node_template = r''' -cdef class {dtype_title}Closed{closed_title}IntervalNode: +{{py: + +nodes = [] +for dtype in ['float32', 'float64', 'int32', 'int64']: + for closed, cmp_left, cmp_right in [ + ('left', '<=', '<'), + ('right', '<', '<='), + ('both', '<=', '<='), + ('neither', '<', '<')]: + cmp_left_converse = '<' if cmp_left == '<=' else '<=' + cmp_right_converse = '<' if cmp_right == '<=' else '<=' + nodes.append((dtype, dtype.title(), + closed, closed.title(), + cmp_left, + cmp_right, + cmp_left_converse, + cmp_right_converse)) + +}} + +NODE_CLASSES = {} + +{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, + cmp_left_converse, cmp_right_converse in nodes}} + +cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree Categorizes intervals by those that fall to the left, those that fall to the right, and those that overlap with the pivot. """ cdef: - {dtype_title}Closed{closed_title}IntervalNode left_node, right_node - {dtype}_t[:] center_left_values, center_right_values, left, right + {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node + {{dtype}}_t[:] center_left_values, center_right_values, left, right int64_t[:] center_left_indices, center_right_indices, indices - {dtype}_t min_left, max_right - readonly {dtype}_t pivot + {{dtype}}_t min_left, max_right + readonly {{dtype}}_t pivot readonly int64_t n_elements, n_center, leaf_size readonly bint is_leaf_node def __init__(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, int64_t leaf_size): self.n_elements = len(left) self.leaf_size = leaf_size + + # min_left and min_right are used to speed-up query by skipping + # query on sub-nodes. If this node has size 0, query is cheap, + # so these values don't matter. 
if left.size > 0: self.min_left = left.min() self.max_right = right.max() @@ -233,15 +263,18 @@ def __init__(self, self.left = left self.right = right self.indices = indices - self.n_center + self.n_center = 0 else: # calculate a pivot so we can create child nodes self.is_leaf_node = False self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) + left_set, right_set, center_set = self.classify_intervals( + left, right) - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) + self.left_node = self.new_child_node(left, right, + indices, left_set) + self.right_node = self.new_child_node(left, right, + indices, right_set) self.center_left_values, self.center_left_indices = \ sort_values_and_indices(left, indices, center_set) @@ -251,7 +284,7 @@ def __init__(self, @cython.wraparound(False) @cython.boundscheck(False) - cdef classify_intervals(self, {dtype}_t[:] left, {dtype}_t[:] right): + cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right): """Classify the given intervals based upon whether they fall to the left, right, or overlap with this node's pivot. """ @@ -264,9 +297,9 @@ def __init__(self, overlapping_ind = Int64Vector() for i in range(self.n_elements): - if right[i] {cmp_right_converse} self.pivot: + if right[i] {{cmp_right_converse}} self.pivot: left_ind.append(i) - elif self.pivot {cmp_left_converse} left[i]: + elif self.pivot {{cmp_left_converse}} left[i]: right_ind.append(i) else: overlapping_ind.append(i) @@ -276,8 +309,8 @@ def __init__(self, overlapping_ind.to_array()) cdef new_child_node(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, ndarray[int64_t, ndim=1] subset): """Create a new child node. 
@@ -285,19 +318,19 @@ def __init__(self, left = take(left, subset) right = take(right, subset) indices = take(indices, subset) - return {dtype_title}Closed{closed_title}IntervalNode( + return {{dtype_title}}Closed{{closed_title}}IntervalNode( left, right, indices, self.leaf_size) @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): + cpdef query(self, Int64Vector result, scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. """ cdef: int64_t[:] indices - {dtype}_t[:] values + {{dtype}}_t[:] values Py_ssize_t i if self.is_leaf_node: @@ -305,7 +338,7 @@ def __init__(self, # continue the binary tree structure. Instead, we use linear # search. for i in range(self.n_elements): - if self.left[i] {cmp_left} point {cmp_right} self.right[i]: + if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]: result.append(self.indices[i]) else: # There are child nodes. 
Based on comparing our query to the pivot, @@ -314,36 +347,41 @@ def __init__(self, values = self.center_left_values indices = self.center_left_indices for i in range(self.n_center): - if not values[i] {cmp_left} point: + if not values[i] {{cmp_left}} point: break result.append(indices[i]) - if point {cmp_right} self.left_node.max_right: + if point {{cmp_right}} self.left_node.max_right: self.left_node.query(result, point) elif point > self.pivot: values = self.center_right_values indices = self.center_right_indices for i in range(self.n_center - 1, -1, -1): - if not point {cmp_right} values[i]: + if not point {{cmp_right}} values[i]: break result.append(indices[i]) - if self.right_node.min_left {cmp_left} point: + if self.right_node.min_left {{cmp_left}} point: self.right_node.query(result, point) else: result.extend(self.center_left_indices) def __repr__(self): if self.is_leaf_node: - return ('<{dtype_title}Closed{closed_title}IntervalNode: ' + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' '%s elements (terminal)>' % self.n_elements) else: n_left = self.left_node.n_elements n_right = self.right_node.n_elements n_center = self.n_elements - n_left - n_right - return ('<{dtype_title}Closed{closed_title}IntervalNode: pivot %s, ' - '%s elements (%s left, %s right, %s overlapping)>' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' + 'pivot %s, %s elements (%s left, %s right, %s ' + 'overlapping)>' % (self.pivot, self.n_elements, + n_left, n_right, n_center)) def counts(self): + """ + Inspect counts on this node + useful for debugging purposes + """ if self.is_leaf_node: return self.n_elements else: @@ -352,44 +390,7 @@ def counts(self): r = self.right_node.counts() return (m, (l, r)) -NODE_CLASSES['{dtype}', '{closed}'] = {dtype_title}Closed{closed_title}IntervalNode -''' - - -def generate_node_template(): - output = StringIO() - for dtype in ['float64', 'int64']: - for 
closed, cmp_left, cmp_right in [ - ('left', '<=', '<'), - ('right', '<', '<='), - ('both', '<=', '<='), - ('neither', '<', '<')]: - cmp_left_converse = '<' if cmp_left == '<=' else '<=' - cmp_right_converse = '<' if cmp_right == '<=' else '<=' - classes = node_template.format(dtype=dtype, - dtype_title=dtype.title(), - closed=closed, - closed_title=closed.title(), - cmp_left=cmp_left, - cmp_right=cmp_right, - cmp_left_converse=cmp_left_converse, - cmp_right_converse=cmp_right_converse) - output.write(classes) - output.write("\n") - return output.getvalue() - - -def generate_cython_file(): - # Put `intervaltree.pyx` in the same directory as this file - directory = os.path.dirname(os.path.realpath(__file__)) - filename = 'intervaltree.pyx' - path = os.path.join(directory, filename) - - with open(path, 'w') as f: - print(warning_to_new_contributors, file=f) - print(header, file=f) - print(generate_node_template(), file=f) - - -if __name__ == '__main__': - generate_cython_file() +NODE_CLASSES['{{dtype}}', + '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode + +{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f90fd1e5bb44b..31402c38c770d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -61,6 +61,8 @@ from tslib cimport (convert_to_tsobject, convert_to_timedelta64, _check_all_nulls) import tslib from tslib import NaT, Timestamp, Timedelta +import interval +from interval import Interval cdef int64_t NPY_NAT = util.get_nat() @@ -259,7 +261,7 @@ cpdef bint isscalar(object val): or PyDelta_Check(val) or PyTime_Check(val) or util.is_period_object(val) - or is_decimal(val), + or is_decimal(val) or is_interval(val)) @@ -1898,6 +1900,4 @@ cdef class BlockPlacement: include "reduce.pyx" include "properties.pyx" -include "interval.pyx" -include "intervaltree.pyx" include "inference.pyx" diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 0c85f488dd311..f7dbae4ab736e 100644 --- 
a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -33,6 +33,10 @@ cpdef bint is_decimal(object obj): return isinstance(obj, Decimal) +cpdef bint is_interval(object obj): + return isinstance(obj, Interval) + + cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -430,7 +434,7 @@ def infer_dtype(object value): return 'period' elif is_interval(val): - if is_interval_array_fixed_closed(values): + if is_interval_array(values): return 'interval' for i in range(n): @@ -883,22 +887,22 @@ cpdef bint is_period_array(ndarray[object] values): return False return null_count != n -cdef inline bint is_interval(object o): - return isinstance(o, Interval) -def is_interval_array_fixed_closed(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - cdef str closed +cpdef bint is_interval_array(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values), null_count = 0 + object v + if n == 0: return False for i in range(n): - if not is_interval(values[i]): - return False - if i == 0: - closed = values[0].closed - elif closed != values[i].closed: + v = values[i] + if util._checknull(v): + null_count += 1 + continue + if not is_interval(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ed0bb263ed6cf..47679966e3d5c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1296,6 +1296,18 @@ cdef class _Timestamp(datetime): return result + property _short_repr: + def __get__(self): + # format a Timestamp with only _date_repr if possible + # otherwise _repr_base + if (self.hour == 0 and + self.minute == 0 and + self.second == 0 and + self.microsecond == 0 and + self.nanosecond == 0): + return self._date_repr + return self._repr_base + property asm8: def __get__(self): return np.datetime64(self.value, 'ns') diff --git a/pandas/core/algorithms.py 
b/pandas/core/algorithms.py index d72ee71570adb..2a2789843207a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -605,8 +605,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if bins is not None: try: from pandas.tools.tile import cut - values = Series(values).values - cat, bins = cut(values, bins, retbins=True) + values = Series(values) + ii = cut(values, bins, include_lowest=True) except TypeError: raise TypeError("bins argument only works with numeric data.") @@ -623,12 +623,18 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if not isinstance(keys, Index): keys = Index(keys) result = Series(counts, index=keys, name=name) + # count, remove nulls (from the index), and but the bins + result = ii.value_counts(dropna=dropna) + result = result[result.index.notnull()] + result.index = result.index.astype('interval') + result = result.sort_index() - if bins is not None: - # TODO: This next line should be more efficient - result = result.reindex(np.arange(len(cat.categories)), - fill_value=0) - result.index = bins[:-1] + # if we are dropna and we have NO values + if dropna and (result.values == 0).all(): + result = result.iloc[0:0] + + # normalizing is by len of all (regardless of dropna) + counts = np.array([len(ii)]) if sort: result = result.sort_values(ascending=ascending) @@ -1395,6 +1401,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + elif is_interval_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/api.py b/pandas/core/api.py index dbb5e22358c18..ea5be17ef3aaf 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -11,8 +11,8 @@ from pandas.formats.format import set_eng_float_format from 
pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, - MultiIndex) -from pandas.core.interval import Interval, IntervalIndex + MultiIndex, IntervalIndex) +from pandas.indexes.interval import Interval, interval_range from pandas.core.series import Series from pandas.core.frame import DataFrame diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 69c90d8cc9efd..45a9577c8d8b2 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -18,6 +18,7 @@ from pandas.types.common import (is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, + is_interval_dtype, is_datetimelike, is_datetime64_any_dtype, is_bool, is_integer_dtype, @@ -40,10 +41,11 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) +from pandas.core.index import (Index, MultiIndex, + CategoricalIndex, _ensure_index) from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.interval import IntervalIndex from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -2659,7 +2661,7 @@ def _convert_grouper(axis, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): - raise AssertionError('Grouper and axis must be same length') + raise ValueError('Grouper and axis must be same length') return grouper else: return grouper @@ -3144,28 +3146,29 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is None: lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] else: - raise NotImplementedError('this is broken') - lab, bins = cut(val, bins, retbins=True) - # bins[:-1] for backward compat; - # o.w. 
cat.categories could be better - # cat = Categorical(cat) - # lab, lev, dropna = cat.codes, bins[:-1], False - - if (lab.dtype == object - and lib.is_interval_array_fixed_closed(lab[notnull(lab)])): - lab_index = Index(lab) - assert isinstance(lab, IntervalIndex) - sorter = np.lexsort((lab_index.left, lab_index.right, ids)) + + # lab is a Categorical with categories an IntervalIndex + lab = cut(Series(val), bins, include_lowest=True) + lev = lab.cat.categories + lab = lev.take(lab.cat.codes) + llab = lambda lab, inc: lab[inc]._multiindex.labels[-1] + + if is_interval_dtype(lab): + # TODO: should we do this inside II? + sorter = np.lexsort((lab.left, lab.right, ids)) else: sorter = np.lexsort((lab, ids)) + ids, lab = ids[sorter], lab[sorter] # group boundaries are where group ids change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] # new values are where sorted labels change - inc = np.r_[True, lab[1:] != lab[:-1]] + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts @@ -3173,7 +3176,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] @@ -3199,13 +3202,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, acc = rep(d) out /= acc - if sort: # and bins is None: + if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) out, labels[-1] = out[sorter], labels[-1][sorter] - # if bins is None: - if True: + if bins is None: mi = MultiIndex(levels=levels, 
labels=labels, names=names, verify_integrity=False) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9e22bdd5facc4..c9ff26d135f58 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1087,10 +1087,10 @@ def _getitem_iterable(self, key, axis=0): return self.obj.take(inds, axis=axis, convert=False) else: # Have the index compute an indexer or return None - # if it cannot handle + # if it cannot handle; we only act on all found values indexer, keyarr = labels._convert_listlike_indexer( key, kind=self.name) - if indexer is not None: + if indexer is not None and (indexer != -1).all(): return self.obj.take(indexer, axis=axis) # existing labels are unique and indexer are unique @@ -1429,7 +1429,7 @@ def error(): try: key = self._convert_scalar_indexer(key, axis) - if key not in ax: + if not ax._is_contained_in(key): error() except TypeError as e: @@ -1897,7 +1897,7 @@ def convert_to_index_sliceable(obj, key): elif isinstance(key, compat.string_types): # we are an actual column - if key in obj._data.items: + if obj._data.items._is_contained_in(key): return None # We might have a datetimelike string that we can translate to a diff --git a/pandas/core/interval.py b/pandas/core/interval.py deleted file mode 100644 index 68e07f21367a0..0000000000000 --- a/pandas/core/interval.py +++ /dev/null @@ -1,521 +0,0 @@ -import operator - -import numpy as np -import pandas as pd - -from pandas.core.base import PandasObject, IndexOpsMixin -from pandas.core.common import (_values_from_object, _ensure_platform_int, - notnull, is_datetime_or_timedelta_dtype, - is_integer_dtype, is_float_dtype) -from pandas.core.index import (Index, _ensure_index, default_pprint, - InvalidIndexError, MultiIndex) -from pandas.lib import (Interval, IntervalMixin, IntervalTree, - interval_bounds_to_intervals, - intervals_to_interval_bounds) -from pandas.util.decorators import cache_readonly -import pandas.core.common as com - - -_VALID_CLOSED = set(['left', 'right', 'both', 
'neither']) - - -def _get_next_label(label): - dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): - dtype = 'datetime64' - if is_datetime_or_timedelta_dtype(dtype): - return label + np.timedelta64(1, 'ns') - elif is_integer_dtype(dtype): - return label + 1 - elif is_float_dtype(dtype): - return np.nextafter(label, np.infty) - else: - raise TypeError('cannot determine next label for type %r' - % type(label)) - - -def _get_prev_label(label): - dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): - dtype = 'datetime64' - if is_datetime_or_timedelta_dtype(dtype): - return label - np.timedelta64(1, 'ns') - elif is_integer_dtype(dtype): - return label - 1 - elif is_float_dtype(dtype): - return np.nextafter(label, -np.infty) - else: - raise TypeError('cannot determine next label for type %r' - % type(label)) - - -def _get_interval_closed_bounds(interval): - """ - Given an Interval or IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right - - -class IntervalIndex(IntervalMixin, Index): - """ - Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of intervals that are all closed on the same side. - - .. versionadded:: 0.18 - - Properties - ---------- - left, right : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both or - neither. Defaults to 'right'. - name : object, optional - Name to be stored in the index. 
- """ - _typ = 'intervalindex' - _comparables = ['name'] - _attributes = ['name', 'closed'] - _allow_index_ops = True - _engine = None # disable it - - def __new__(cls, left, right, closed='right', name=None, fastpath=False): - # TODO: validation - result = IntervalMixin.__new__(cls) - result._left = _ensure_index(left) - result._right = _ensure_index(right) - result._closed = closed - result.name = name - if not fastpath: - result._validate() - result._reset_identity() - return result - - def _validate(self): - """Verify that the IntervalIndex is valid. - """ - # TODO: exclude periods? - if self.closed not in _VALID_CLOSED: - raise ValueError("invalid options for 'closed': %s" % self.closed) - if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') - left_valid = notnull(self.left) - right_valid = notnull(self.right) - if not (left_valid == right_valid).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') - if not (self.left[left_valid] <= self.right[left_valid]).all(): - raise ValueError('left side of interval must be <= right side') - - def _simple_new(cls, values, name=None, **kwargs): - # ensure we don't end up here (this is a superclass method) - raise NotImplementedError - - def _cleanup(self): - pass - - @property - def _engine(self): - raise NotImplementedError - - @cache_readonly - def _tree(self): - return IntervalTree(self.left, self.right, closed=self.closed) - - @property - def _constructor(self): - return type(self).from_intervals - - @classmethod - def from_breaks(cls, breaks, closed='right', name=None): - """ - Construct an IntervalIndex from an array of splits - - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. 
- name : object, optional - Name to be stored in the index. - - Examples - -------- - - >>> IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex(left=[0, 1, 2], - right=[1, 2, 3], - closed='right') - """ - return cls(breaks[:-1], breaks[1:], closed, name) - - @classmethod - def from_intervals(cls, data, name=None): - """ - Construct an IntervalIndex from a 1d array of Interval objects - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - name : object, optional - Name to be stored in the index. - - Examples - -------- - - >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> Index([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') - """ - data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) - return cls(left, right, closed, name) - - @classmethod - def from_tuples(cls, data, closed='right', name=None): - left = [] - right = [] - for l, r in data: - left.append(l) - right.append(r) - return cls(np.array(left), np.array(right), closed, name) - - def to_tuples(self): - return Index(com._asarray_tuplesafe(zip(self.left, self.right))) - - @cache_readonly - def _multiindex(self): - return MultiIndex.from_arrays([self.left, self.right], - names=['left', 'right']) - - @property - def left(self): - return self._left - - @property - def right(self): - return self._right - - @property - def closed(self): - return self._closed - - def __len__(self): - return len(self.left) - - @cache_readonly - def values(self): - """Returns the IntervalIndex's data as a numpy array of Interval - objects (with dtype='object') - """ - left = np.asarray(self.left) - right = np.asarray(self.right) - return interval_bounds_to_intervals(left, right, 
self.closed) - - def __array__(self, result=None): - """ the array interface, return my values """ - return self.values - - def __array_wrap__(self, result, context=None): - # we don't want the superclass implementation - return result - - def _array_values(self): - return self.values - - def __reduce__(self): - return self.__class__, (self.left, self.right, self.closed, self.name) - - def _shallow_copy(self, values=None, name=None): - name = name if name is not None else self.name - if values is not None: - return type(self).from_intervals(values, name=name) - else: - return self.copy(name=name) - - def copy(self, deep=False, name=None): - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right - name = name if name is not None else self.name - return type(self)(left, right, closed=self.closed, name=name, - fastpath=True) - - @cache_readonly - def dtype(self): - return np.dtype('O') - - @cache_readonly - def mid(self): - """Returns the mid-point of each interval in the index as an array - """ - try: - return Index(0.5 * (self.left.values + self.right.values)) - except TypeError: - # datetime safe version - delta = self.right.values - self.left.values - return Index(self.left.values + 0.5 * delta) - - @cache_readonly - def is_monotonic_increasing(self): - return self._multiindex.is_monotonic_increasing - - @cache_readonly - def is_monotonic_decreasing(self): - return self._multiindex.is_monotonic_decreasing - - @cache_readonly - def is_unique(self): - return self._multiindex.is_unique - - @cache_readonly - def is_non_overlapping_monotonic(self): - # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) - # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) 
- # we already require left <= right - return ((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) - - def _convert_scalar_indexer(self, key, kind=None): - return key - - def _maybe_cast_slice_bound(self, label, side, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - def _convert_list_indexer(self, keyarr, kind=None): - """ - we are passed a list indexer. - Return our indexer or raise if all of the values are not included in the categories - """ - locs = self.get_indexer(keyarr) - # TODO: handle keyarr if it includes intervals - if (locs == -1).any(): - raise KeyError("a list-indexer must only include existing intervals") - - return locs - - def _check_method(self, method): - if method is not None: - raise NotImplementedError( - 'method %r not yet implemented for IntervalIndex' % method) - - def _searchsorted_monotonic(self, label, side, exclude_label=False): - if not self.is_non_overlapping_monotonic: - raise KeyError('can only get slices from an IntervalIndex if ' - 'bounds are non-overlapping and all monotonic ' - 'increasing or decreasing') - - if isinstance(label, IntervalMixin): - raise NotImplementedError - - if ((side == 'left' and self.left.is_monotonic_increasing) or - (side == 'right' and self.left.is_monotonic_decreasing)): - sub_idx = self.right - if self.open_right or exclude_label: - label = _get_next_label(label) - else: - sub_idx = self.left - if self.open_left or exclude_label: - label = _get_prev_label(label) - - return sub_idx._searchsorted_monotonic(label, side) - - def _get_loc_only_exact_matches(self, key): - return self._multiindex._tuple_index.get_loc(key) - - def _find_non_overlapping_monotonic_bounds(self, key): - if isinstance(key, IntervalMixin): - start = self._searchsorted_monotonic( - key.left, 'left', exclude_label=key.open_left) - stop = self._searchsorted_monotonic( - key.right, 'right', exclude_label=key.open_right) - else: - # scalar - start = 
self._searchsorted_monotonic(key, 'left') - stop = self._searchsorted_monotonic(key, 'right') - return start, stop - - def get_loc(self, key, method=None): - self._check_method(method) - - original_key = key - - if self.is_non_overlapping_monotonic: - if isinstance(key, Interval): - left = self._maybe_cast_slice_bound(key.left, 'left', None) - right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed) - else: - key = self._maybe_cast_slice_bound(key, 'left', None) - - start, stop = self._find_non_overlapping_monotonic_bounds(key) - - if start + 1 == stop: - return start - elif start < stop: - return slice(start, stop) - else: - raise KeyError(original_key) - - else: - # use the interval tree - if isinstance(key, Interval): - left, right = _get_interval_closed_bounds(key) - return self._tree.get_loc_interval(left, right) - else: - return self._tree.get_loc(key) - - def get_value(self, series, key): - # this method seems necessary for Series.__getitem__ but I have no idea - # what it should actually do here... - loc = self.get_loc(key) # nb. 
this can't handle slice objects - return series.iloc[loc] - - def get_indexer(self, target, method=None, limit=None, tolerance=None): - self._check_method(method) - target = _ensure_index(target) - - if self.is_non_overlapping_monotonic: - start, stop = self._find_non_overlapping_monotonic_bounds(target) - - start_plus_one = start + 1 - if (start_plus_one < stop).any(): - raise ValueError('indexer corresponds to non-unique elements') - return np.where(start_plus_one == stop, start, -1) - - else: - if isinstance(target, IntervalIndex): - raise NotImplementedError( - 'have not yet implemented get_indexer ' - 'for IntervalIndex indexers') - else: - return self._tree.get_indexer(target) - - def delete(self, loc): - new_left = self.left.delete(loc) - new_right = self.right.delete(loc) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def insert(self, loc, item): - if not isinstance(item, Interval): - raise ValueError('can only insert Interval objects into an ' - 'IntervalIndex') - if not item.closed == self.closed: - raise ValueError('inserted item must be closed on the same side ' - 'as the index') - new_left = self.left.insert(loc, item.left) - new_right = self.right.insert(loc, item.right) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def _as_like_interval_index(self, other, error_msg): - self._assert_can_do_setop(other) - other = _ensure_index(other) - if (not isinstance(other, IntervalIndex) or - self.closed != other.closed): - raise ValueError(error_msg) - return other - - def append(self, other): - msg = ('can only append two IntervalIndex objects that are closed on ' - 'the same side') - other = self._as_like_interval_index(other, msg) - new_left = self.left.append(other.left) - new_right = self.right.append(other.right) - if other.name is not None and other.name != self.name: - name = None - else: - name = self.name - return type(self)(new_left, new_right, self.closed, name, - 
fastpath=True) - - def take(self, indexer, axis=0): - indexer = com._ensure_platform_int(indexer) - new_left = self.left.take(indexer) - new_right = self.right.take(indexer) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def __contains__(self, key): - try: - self.get_loc(key) - return True - except KeyError: - return False - - def __getitem__(self, value): - left = self.left[value] - right = self.right[value] - if not isinstance(left, Index): - return Interval(left, right, self.closed) - else: - return type(self)(left, right, self.closed, self.name) - - # __repr__ associated methods are based on MultiIndex - - def _format_attrs(self): - attrs = [('left', default_pprint(self.left)), - ('right', default_pprint(self.right)), - ('closed', repr(self.closed))] - if self.name is not None: - attrs.append(('name', default_pprint(self.name))) - return attrs - - def _format_space(self): - return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - - def _format_data(self): - return None - - def argsort(self, *args, **kwargs): - return np.lexsort((self.right, self.left)) - - def equals(self, other): - if self.is_(other): - return True - try: - return (self.left.equals(other.left) - and self.right.equals(other.right) - and self.closed == other.closed) - except AttributeError: - return False - - def _setop(op_name): - def func(self, other): - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - other = self._as_like_interval_index(other, msg) - result = getattr(self._multiindex, op_name)(other._multiindex) - result_name = self.name if self.name == other.name else None - return type(self).from_tuples(result.values, closed=self.closed, - name=result_name) - return func - - union = _setop('union') - intersection = _setop('intersection') - difference = _setop('difference') - sym_diff = _setop('sym_diff') - - # TODO: arithmetic operations - - 
-IntervalIndex._add_logical_methods_disabled() diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 66a81aadc4213..907198d98cf5b 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -15,6 +15,7 @@ is_float_dtype, is_period_arraylike, is_integer_dtype, + is_interval_dtype, is_datetimetz, is_integer, is_float, @@ -575,6 +576,7 @@ def to_string(self): pprint_thing(frame.index))) text = info_line else: + strcols = self._to_str_columns() if self.line_width is None: # no need to wrap around just print # the whole frame @@ -2027,6 +2029,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', if is_categorical_dtype(values): fmt_klass = CategoricalArrayFormatter + elif is_interval_dtype(values): + fmt_klass = IntervalArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_period_arraylike(values): @@ -2294,6 +2298,17 @@ def _format_strings(self): return fmt_values.tolist() +class IntervalArrayFormatter(GenericArrayFormatter): + + def __init__(self, values, *args, **kwargs): + GenericArrayFormatter.__init__(self, values, *args, **kwargs) + + def _format_strings(self): + formatter = self.formatter or str + fmt_values = np.array([formatter(x) for x in self.values]) + return fmt_values + + class PeriodArrayFormatter(IntArrayFormatter): def _format_strings(self): diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py index a3cb54ca97071..db076b60ab34e 100644 --- a/pandas/indexes/api.py +++ b/pandas/indexes/api.py @@ -3,6 +3,7 @@ InvalidIndexError) from pandas.indexes.category import CategoricalIndex # noqa from pandas.indexes.multi import MultiIndex # noqa +from pandas.indexes.interval import IntervalIndex # noqa from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa Int64Index, UInt64Index) from pandas.indexes.range import RangeIndex # noqa @@ -13,7 +14,7 @@ # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = 
['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'RangeIndex', 'UInt64Index', + 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index', 'InvalidIndexError', '_new_Index', '_ensure_index', '_get_na_value', '_get_combined_index', diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index ab5c01388e652..c0635f07238b5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -24,6 +24,7 @@ is_dtype_equal, is_object_dtype, is_categorical_dtype, + is_interval_dtype, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, @@ -49,9 +50,9 @@ from pandas.formats.printing import pprint_thing from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin - from pandas.core.config import get_option + # simplify default_pprint = lambda x, max_seq_items=None: \ pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True, @@ -138,6 +139,9 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): _is_numeric_dtype = False _can_hold_na = True + # would we like our indexing holder to defer to us + _defer_to_indexing = False + # prioritize current class for _shallow_copy_with_infer, # used to infer integers as datetime-likes _infer_as_myclass = False @@ -167,6 +171,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from .category import CategoricalIndex return CategoricalIndex(data, copy=copy, name=name, **kwargs) + # interval + if is_interval_dtype(data): + from .interval import IntervalIndex + return IntervalIndex.from_intervals(data, name=name, + copy=copy) + # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -276,6 +286,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) + elif inferred == 'interval': + from .interval import IntervalIndex + return 
IntervalIndex.from_intervals(subarr, name=name, + copy=copy) elif inferred == 'boolean': # don't support boolean explicity ATM pass @@ -1210,6 +1224,9 @@ def is_object(self): def is_categorical(self): return self.inferred_type in ['categorical'] + def is_interval(self): + return self.inferred_type in ['interval'] + def is_mixed(self): return self.inferred_type in ['mixed'] @@ -1413,11 +1430,6 @@ def _convert_index_indexer(self, keyarr): @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): - """ - passed a key that is tuplesafe that is integer based - and we have a mixed index (e.g. number/labels). figure out - the indexer. return None if we can't help - """ if (kind in [None, 'iloc', 'ix'] and is_integer_dtype(keyarr) and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex)): @@ -1553,9 +1565,41 @@ def __nonzero__(self): __bool__ = __nonzero__ + _index_shared_docs['__contains__'] = """ + return a boolean if this key is IN the index + + Parameters + ---------- + key : object + + Returns + ------- + boolean + """ + + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) - # work around some kind of odd cython bug + try: + return key in self._engine + except TypeError: + return False + + _index_shared_docs['_is_contained_in'] = """ + return a boolean if this key is IN the index + + Parameters + ---------- + key : object + + Returns + ------- + boolean + """ + + @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) + def _is_contained_in(self, key): + hash(key) try: return key in self._engine except TypeError: @@ -3341,6 +3385,13 @@ def _searchsorted_monotonic(self, label, side='left'): raise ValueError('index must be monotonic increasing or decreasing') + def _get_loc_only_exact_matches(self, key): + """ + This is overriden on subclasses (namely, IntervalIndex) to control + get_slice_bound. 
+ """ + return self.get_loc(key) + def get_slice_bound(self, label, side, kind): """ Calculate slice bound that corresponds to given label. @@ -3370,7 +3421,7 @@ def get_slice_bound(self, label, side, kind): # we need to look up the label try: - slc = self.get_loc(label) + slc = self._get_loc_only_exact_matches(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) @@ -3606,7 +3657,9 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - if is_object_dtype(self) and self.nlevels == 1: + if (is_object_dtype(self) and + self.nlevels == 1): + # don't pass MultiIndex with np.errstate(all='ignore'): result = _comp_method_OBJECT_ARRAY( @@ -3918,6 +3971,8 @@ def _ensure_index(index_like, copy=False): def _get_na_value(dtype): + if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype): + return libts.NaT return {np.datetime64: libts.NaT, np.timedelta64: libts.NaT}.get(dtype, np.nan) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 7cfc95de5f538..4800375cd5d38 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -7,7 +7,9 @@ from pandas.types.common import (is_categorical_dtype, _ensure_platform_int, is_list_like, + is_interval_dtype, is_scalar) +from pandas.core.common import _asarray_tuplesafe from pandas.types.missing import array_equivalent @@ -17,7 +19,6 @@ import pandas.core.base as base import pandas.core.missing as missing import pandas.indexes.base as ibase -from pandas.core.common import _asarray_tuplesafe _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -261,14 +262,35 @@ def ordered(self): def _reverse_indexer(self): return self._data._reverse_indexer() + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) + + if self.categories._defer_to_indexing: + return key in 
self.categories + + return key in self.values + + @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) + def _is_contained_in(self, key): + hash(key) + + if self.categories._defer_to_indexing: + return self.categories._is_contained_in(key) + return key in self.values def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + from pandas import IntervalIndex + return IntervalIndex.from_intervals(np.array(self)) + return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy) + @cache_readonly def _isnan(self): """ return if each value is nan""" @@ -431,8 +453,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) - if isinstance(target, CategoricalIndex): - target = target.categories + if self.equals(target): + return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " @@ -440,10 +462,17 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') - else: + if (isinstance(target, CategoricalIndex) and + self.values.is_dtype_equal(target)): + # we have the same codes + codes = target.codes + else: + if isinstance(target, CategoricalIndex): + target = target.categories codes = self.categories.get_indexer(target) - indexer, _ = self._engine.get_indexer_non_unique(codes) + + indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer) @@ -457,20 +486,39 @@ def get_indexer_non_unique(self, target): codes = self.categories.get_indexer(target) return self._engine.get_indexer_non_unique(codes) + 
@Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + if self.categories._defer_to_indexing: + return self.categories._convert_scalar_indexer(key, kind=kind) + + return super(CategoricalIndex, self)._convert_scalar_indexer( + key, kind=kind) + @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): # Return our indexer or raise if all of the values are not included in # the categories - codes = self.categories.get_indexer(keyarr) - if (codes == -1).any(): - raise KeyError("a list-indexer must only include values that are " - "in the categories") - return None + if self.categories._defer_to_indexing: + indexer = self.categories._convert_list_indexer(keyarr, kind=kind) + return Index(self.codes).get_indexer_for(indexer) + + indexer = self.categories.get_indexer(keyarr) + if (indexer == -1).any(): + raise KeyError( + "a list-indexer must only " + "include values that are " + "in the categories") + + return self.get_indexer(keyarr) @Appender(_index_shared_docs['_convert_arr_indexer']) def _convert_arr_indexer(self, keyarr): keyarr = _asarray_tuplesafe(keyarr) + + if self.categories._defer_to_indexing: + return keyarr + return self._shallow_copy(keyarr) @Appender(_index_shared_docs['_convert_index_indexer']) @@ -488,6 +536,8 @@ def take(self, indices, axis=0, allow_fill=True, na_value=-1) return self._create_from_codes(taken) + take_nd = take + def map(self, mapper): """Apply mapper function to its categories (not codes). 
diff --git a/pandas/indexes/interval.py b/pandas/indexes/interval.py new file mode 100644 index 0000000000000..127655972e7f2 --- /dev/null +++ b/pandas/indexes/interval.py @@ -0,0 +1,983 @@ +""" define the IntervalIndex """ + +import numpy as np + +from pandas.types.missing import notnull, isnull +from pandas.types.generic import ABCPeriodIndex +from pandas.types.dtypes import IntervalDtype +from pandas.types.common import (_ensure_platform_int, + is_list_like, + is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_object_dtype, + is_categorical_dtype, + is_float_dtype, + is_interval_dtype, + is_scalar, + is_integer) +from pandas.indexes.base import (Index, _ensure_index, + default_pprint, _index_shared_docs) + +from pandas._libs import Timestamp, Timedelta +from pandas._libs.interval import (Interval, IntervalMixin, IntervalTree, + intervals_to_interval_bounds) + +from pandas.indexes.multi import MultiIndex +from pandas.compat.numpy import function as nv +from pandas.core import common as com +from pandas.util.decorators import cache_readonly, Appender +from pandas.core.config import get_option + +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(klass='IntervalIndex', + target_klass='IntervalIndex or list of Intervals')) + + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + + +def _get_next_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = 'datetime64' + if is_datetime_or_timedelta_dtype(dtype): + return label + np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label + 1 + elif is_float_dtype(dtype): + return np.nextafter(label, np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_prev_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = 'datetime64' + if 
is_datetime_or_timedelta_dtype(dtype): + return label - np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label - 1 + elif is_float_dtype(dtype): + return np.nextafter(label, -np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_interval_closed_bounds(interval): + """ + Given an Interval or IntervalIndex, return the corresponding interval with + closed bounds. + """ + left, right = interval.left, interval.right + if interval.open_left: + left = _get_next_label(left) + if interval.open_right: + right = _get_prev_label(right) + return left, right + + +def _new_IntervalIndex(cls, d): + """ This is called upon unpickling, + rather than the default which doesn't + have arguments and breaks __new__ """ + + return cls.from_arrays(**d) + + +class IntervalIndex(IntervalMixin, Index): + """ + Immutable Index implementing an ordered, sliceable set. IntervalIndex + represents an Index of intervals that are all closed on the same side. + + .. versionadded:: 0.20.0 + + Properties + ---------- + left, right : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both or + neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+ copy : boolean, default False + Copy the meta-data + """ + _typ = 'intervalindex' + _comparables = ['name'] + _attributes = ['name', 'closed'] + _allow_index_ops = True + + # we would like our indexing holder to defer to us + _defer_to_indexing = True + + _mask = None + + def __new__(cls, data, closed='right', + name=None, copy=False, dtype=None, + fastpath=False, verify_integrity=True): + + if fastpath: + return cls._simple_new(data.left, data.right, closed, name, + copy=copy, verify_integrity=False) + + if name is None and hasattr(data, 'name'): + name = data.name + + if isinstance(data, IntervalIndex): + left = data.left + right = data.right + + else: + + # don't allow scalars + if is_scalar(data): + cls._scalar_data_error(data) + + data = IntervalIndex.from_intervals(data, name=name) + left, right = data.left, data.right + + return cls._simple_new(left, right, closed, name, + copy=copy, verify_integrity=verify_integrity) + + @classmethod + def _simple_new(cls, left, right, closed=None, name=None, + copy=False, verify_integrity=True): + result = IntervalMixin.__new__(cls) + + if closed is None: + closed = 'right' + left = _ensure_index(left, copy=copy) + right = _ensure_index(right, copy=copy) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + if is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + raise ValueError("must not have differing left [{}] " + "and right [{}] types".format( + type(left), type(right))) + + if isinstance(left, ABCPeriodIndex): + raise ValueError("Period dtypes are not supported, " + "use a PeriodIndex instead") + + result._left = left + result._right = right + result._closed = closed + result.name = name + if verify_integrity: + result._validate() + result._reset_identity() + return result + + @Appender(_index_shared_docs['_shallow_copy']) + def _shallow_copy(self, left=None, right=None, 
**kwargs): + if left is None: + + # no values passed + left, right = self.left, self.right + + elif right is None: + + # only single value passed, could be an IntervalIndex + # or array of Intervals + if not isinstance(left, IntervalIndex): + left = type(self).from_intervals(left) + + left, right = left.left, left.right + else: + + # both left and right are values + pass + + attributes = self._get_attributes_dict() + attributes.update(kwargs) + attributes['verify_integrity'] = False + return self._simple_new(left, right, **attributes) + + def _validate(self): + """ + Verify that the IntervalIndex is valid. + """ + if self.closed not in _VALID_CLOSED: + raise ValueError("invalid options for 'closed': %s" % self.closed) + if len(self.left) != len(self.right): + raise ValueError('left and right must have the same length') + left_mask = notnull(self.left) + right_mask = notnull(self.right) + if not (left_mask == right_mask).all(): + raise ValueError('missing values must be missing in the same ' + 'location both left and right sides') + if not (self.left[left_mask] <= self.right[left_mask]).all(): + raise ValueError('left side of interval must be <= right side') + self._mask = ~left_mask + + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + return self._isnan.any() + + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + if self._mask is None: + self._mask = isnull(self.left) + return self._mask + + @cache_readonly + def _engine(self): + return IntervalTree(self.left, self.right, closed=self.closed) + + @property + def _constructor(self): + return type(self).from_intervals + + def __contains__(self, key): + """ + return a boolean if this key is IN the index + We *only* accept an Interval + + Parameters + ---------- + key : Interval + + Returns + ------- + boolean + """ + if not isinstance(key, Interval): + return False + + try: + self.get_loc(key) + return True + except KeyError: + return 
False + + def _is_contained_in(self, key): + """ + return a boolean if this key is IN the index + + We accept / allow keys to be not *just* actual + objects. + + Parameters + ---------- + key : int, float, Interval + + Returns + ------- + boolean + """ + try: + self.get_loc(key) + return True + except KeyError: + return False + + @classmethod + def from_breaks(cls, breaks, closed='right', name=None, copy=False): + """ + Construct an IntervalIndex from an array of splits + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. + copy : boolean, default False + copy the data + + Examples + -------- + + >>> IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex(left=[0, 1, 2], + right=[1, 2, 3], + closed='right') + """ + breaks = np.asarray(breaks) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, + name=name, copy=copy) + + @classmethod + def from_arrays(cls, left, right, closed='right', name=None, copy=False): + """ + Construct an IntervalIndex from a a left and right array + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+ copy : boolean, default False + copy the data + + Examples + -------- + + >>> IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) + IntervalIndex(left=[0, 1, 2], + right=[1, 2, 3], + closed='right') + """ + left = np.asarray(left) + right = np.asarray(right) + return cls._simple_new(left, right, closed, name=name, + copy=copy, verify_integrity=True) + + @classmethod + def from_intervals(cls, data, name=None, copy=False): + """ + Construct an IntervalIndex from a 1d array of Interval objects + + Parameters + ---------- + data : array-like (1-dimensional) + Array of Interval objects. All intervals must be closed on the same + sides. + name : object, optional + Name to be stored in the index. + copy : boolean, default False + by-default copy the data, this is compat only and ignored + + Examples + -------- + + >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) + IntervalIndex(left=[0, 1], + right=[1, 2], + closed='right') + + The generic Index constructor work identically when it infers an array + of all intervals: + + >>> Index([Interval(0, 1), Interval(1, 2)]) + IntervalIndex(left=[0, 1], + right=[1, 2], + closed='right') + """ + data = np.asarray(data) + left, right, closed = intervals_to_interval_bounds(data) + return cls.from_arrays(left, right, closed, name=name, copy=False) + + @classmethod + def from_tuples(cls, data, closed='right', name=None, copy=False): + """ + Construct an IntervalIndex from a list/array of tuples + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+ copy : boolean, default False + by-default copy the data, this is compat only and ignored + + Examples + -------- + + """ + left = [] + right = [] + for d in data: + + if isnull(d): + left.append(np.nan) + right.append(np.nan) + continue + + l, r = d + left.append(l) + right.append(r) + + # TODO + # if we have nulls and we previous had *only* + # integer data, then we have changed the dtype + + return cls.from_arrays(left, right, closed, name=name, copy=False) + + def to_tuples(self): + return Index(com._asarray_tuplesafe(zip(self.left, self.right))) + + @cache_readonly + def _multiindex(self): + return MultiIndex.from_arrays([self.left, self.right], + names=['left', 'right']) + + @property + def left(self): + return self._left + + @property + def right(self): + return self._right + + @property + def closed(self): + return self._closed + + def __len__(self): + return len(self.left) + + @cache_readonly + def values(self): + """ + Returns the IntervalIndex's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self._isnan + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + def __array__(self, result=None): + """ the array interface, return my values """ + return self.values + + def __array_wrap__(self, result, context=None): + # we don't want the superclass implementation + return result + + def _array_values(self): + return self.values + + def __reduce__(self): + d = dict(left=self.left, + right=self.right) + d.update(self._get_attributes_dict()) + return _new_IntervalIndex, (self.__class__, d), None + + @Appender(_index_shared_docs['copy']) + def copy(self, deep=False, name=None): + left = self.left.copy(deep=True) if deep else self.left + right = self.right.copy(deep=True) if deep else self.right + name = name if name is not None else 
self.name + return type(self).from_arrays(left, right, name=name) + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + if copy: + self = self.copy() + return self + elif is_object_dtype(dtype): + return Index(self.values, dtype=object) + elif is_categorical_dtype(dtype): + from pandas import Categorical + return Categorical(self, ordered=True) + raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype) + + @cache_readonly + def dtype(self): + return IntervalDtype.construct_from_string(str(self.left.dtype)) + + @property + def inferred_type(self): + return 'interval' + + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep=False): + # we don't use an explicit engine + # so return the bytes here + return (self.left.memory_usage(deep=deep) + + self.right.memory_usage(deep=deep)) + + @cache_readonly + def mid(self): + """Returns the mid-point of each interval in the index as an array + """ + try: + return Index(0.5 * (self.left.values + self.right.values)) + except TypeError: + # datetime safe version + delta = self.right.values - self.left.values + return Index(self.left.values + 0.5 * delta) + + @cache_readonly + def is_monotonic(self): + return self._multiindex.is_monotonic + + @cache_readonly + def is_monotonic_increasing(self): + return self._multiindex.is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self): + return self._multiindex.is_monotonic_decreasing + + @cache_readonly + def is_unique(self): + return self._multiindex.is_unique + + @cache_readonly + def is_non_overlapping_monotonic(self): + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) 
+ # we already require left <= right + return ((self.right[:-1] <= self.left[1:]).all() or + (self.left[:-1] >= self.right[1:]).all()) + + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + if kind == 'iloc': + return super(IntervalIndex, self)._convert_scalar_indexer( + key, kind=kind) + return key + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. + """ + locs = self.get_indexer_for(keyarr) + check = locs == -1 + locs = locs[~check] + return locs + + def _maybe_cast_indexed(self, key): + """ + we need to cast the key, which could be a scalar + or an array-like to the type of our subtype + """ + if is_float_dtype(self.dtype.subtype): + if is_integer(key): + key = float(key) + elif isinstance(key, (np.ndarray, Index)): + key = key.astype('float64') + return key + + def _check_method(self, method): + if method is None: + return + + if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: + raise NotImplementedError( + 'method {} not yet implemented for ' + 'IntervalIndex'.format(method)) + + raise ValueError("Invalid fill method") + + def _searchsorted_monotonic(self, label, side, exclude_label=False): + if not self.is_non_overlapping_monotonic: + raise KeyError('can only get slices from an IntervalIndex if ' + 'bounds are non-overlapping and all monotonic ' + 'increasing or decreasing') + + if isinstance(label, IntervalMixin): + raise NotImplementedError + + if ((side == 'left' and self.left.is_monotonic_increasing) or + (side == 'right' and self.left.is_monotonic_decreasing)): + sub_idx = self.right + if self.open_right or exclude_label: + label = _get_next_label(label) + else: + sub_idx = self.left + if self.open_left or 
exclude_label: + label = _get_prev_label(label) + + return sub_idx._searchsorted_monotonic(label, side) + + def _get_loc_only_exact_matches(self, key): + if isinstance(key, Interval): + # TODO: this expands to a tuple index, see if we can + # do better + return Index(self._multiindex.values).get_loc(key) + raise KeyError + + def _find_non_overlapping_monotonic_bounds(self, key): + if isinstance(key, IntervalMixin): + start = self._searchsorted_monotonic( + key.left, 'left', exclude_label=key.open_left) + stop = self._searchsorted_monotonic( + key.right, 'right', exclude_label=key.open_right) + elif isinstance(key, slice): + # slice + start, stop = key.start, key.stop + if (key.step or 1) != 1: + raise NotImplementedError("cannot slice with a slice step") + if start is None: + start = 0 + else: + start = self._searchsorted_monotonic(start, 'left') + if stop is None: + stop = len(self) + else: + stop = self._searchsorted_monotonic(stop, 'right') + else: + # scalar or index-like + + start = self._searchsorted_monotonic(key, 'left') + stop = self._searchsorted_monotonic(key, 'right') + return start, stop + + def get_loc(self, key, method=None): + self._check_method(method) + + original_key = key + key = self._maybe_cast_indexed(key) + + if self.is_non_overlapping_monotonic: + if isinstance(key, Interval): + left = self._maybe_cast_slice_bound(key.left, 'left', None) + right = self._maybe_cast_slice_bound(key.right, 'right', None) + key = Interval(left, right, key.closed) + else: + key = self._maybe_cast_slice_bound(key, 'left', None) + + start, stop = self._find_non_overlapping_monotonic_bounds(key) + + if start is None or stop is None: + return slice(start, stop) + elif start + 1 == stop: + return start + elif start < stop: + return slice(start, stop) + else: + raise KeyError(original_key) + + else: + # use the interval tree + if isinstance(key, Interval): + left, right = _get_interval_closed_bounds(key) + return self._engine.get_loc_interval(left, right) + else: + 
return self._engine.get_loc(key) + + def get_value(self, series, key): + if com.is_bool_indexer(key): + loc = key + elif is_list_like(key): + loc = self.get_indexer(key) + else: + loc = self.get_loc(key) + return series.iloc[loc] + + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + self._check_method(method) + target = _ensure_index(target) + target = self._maybe_cast_indexed(target) + + if self.equals(target): + return np.arange(len(self), dtype='intp') + + if self.is_non_overlapping_monotonic: + start, stop = self._find_non_overlapping_monotonic_bounds(target) + + start_plus_one = start + 1 + if not ((start_plus_one < stop).any()): + return np.where(start_plus_one == stop, start, -1) + + if not self.is_unique: + raise ValueError("get_indexer cannot handle non-unique indices") + + # find the left and right indexers + lindexer = self._engine.get_indexer(target.left.values) + rindexer = self._engine.get_indexer(target.right.values) + + # we want to return an indexer on the intervals + # however, our keys could provide overlapping of multiple + # intervals, so we iterate thru the indexers and construct + # a set of indexers + + indexer = [] + n = len(self) + + for l, r in zip(lindexer, rindexer): + + # not found + if l == -1 and r == -1: + indexer.append(np.array([-1])) + + elif r == -1: + indexer.append(np.arange(l, n)) + + elif l == -1: + if r == 0: + indexer.append(np.array([-1])) + else: + indexer.append(np.arange(0, r + 1)) + + else: + indexer.append(np.arange(l, r)) + + indexer = np.concatenate(indexer) + + return _ensure_platform_int(indexer) + + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = self._maybe_cast_indexed(_ensure_index(target)) + return super(IntervalIndex, self).get_indexer_non_unique(target) + + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): + 
 if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy(values) + + def delete(self, loc): + new_left = self.left.delete(loc) + new_right = self.right.delete(loc) + return self._shallow_copy(new_left, new_right) + + def insert(self, loc, item): + if not isinstance(item, Interval): + raise ValueError('can only insert Interval objects into an ' + 'IntervalIndex') + if not item.closed == self.closed: + raise ValueError('inserted item must be closed on the same side ' + 'as the index') + new_left = self.left.insert(loc, item.left) + new_right = self.right.insert(loc, item.right) + return self._shallow_copy(new_left, new_right) + + def _as_like_interval_index(self, other, error_msg): + self._assert_can_do_setop(other) + other = _ensure_index(other) + if (not isinstance(other, IntervalIndex) or + self.closed != other.closed): + raise ValueError(error_msg) + return other + + def _append_same_dtype(self, to_concat, name): + """ + assert that we all have the same .closed + we allow a 0-len index here as well + """ + if not len(set([i.closed for i in to_concat if len(i)])) == 1: + msg = ('can only append two IntervalIndex objects ' + 'that are closed on the same side') + raise ValueError(msg) + return super(IntervalIndex, self)._append_same_dtype(to_concat, name) + + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = _ensure_platform_int(indices) + left, right = self.left, self.right + + if fill_value is None: + fill_value = self._na_value + mask = indices == -1 + + if not mask.any(): + # we won't change dtype here in this case + # if we don't need + allow_fill = False + + taker = lambda x: x.take(indices, allow_fill=allow_fill, + fill_value=fill_value) + + try: + new_left = taker(left) + new_right = taker(right) + except ValueError: + + # we need to coerce; might have NA's in an + 
# interger dtype + new_left = taker(left.astype(float)) + new_right = taker(right.astype(float)) + + return self._shallow_copy(new_left, new_right) + + def __getitem__(self, value): + mask = self._isnan[value] + if is_scalar(mask) and mask: + return self._na_value + + left = self.left[value] + right = self.right[value] + + # scalar + if not isinstance(left, Index): + return Interval(left, right, self.closed) + + return self._shallow_copy(left, right) + + # __repr__ associated methods are based on MultiIndex + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ actually format my specific types """ + from pandas.formats.format import IntervalArrayFormatter + return IntervalArrayFormatter(values=self, + na_rep=na_rep, + justify='all').get_result() + + def _format_data(self): + + # TODO: integrate with categorical and make generic + n = len(self) + max_seq_items = min((get_option( + 'display.max_seq_items') or n) // 10, 10) + + formatter = str + + if n == 0: + summary = '[]' + elif n == 1: + first = formatter(self[0]) + summary = '[{}]'.format(first) + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = '[{}, {}]'.format(first, last) + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + summary = '[{} ... 
{}]'.format(', '.join(head), + ', '.join(tail)) + else: + head = [] + tail = [formatter(x) for x in self] + summary = '[{}]'.format(', '.join(tail)) + + return summary + self._format_space() + + def _format_attrs(self): + attrs = [('closed', repr(self.closed))] + if self.name is not None: + attrs.append(('name', default_pprint(self.name))) + attrs.append(('dtype', "'%s'" % self.dtype)) + return attrs + + def _format_space(self): + return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + + def argsort(self, *args, **kwargs): + return np.lexsort((self.right, self.left)) + + def equals(self, other): + + if self.is_(other): + return True + + # if we can coerce to an II + # then we can compare + if not isinstance(other, IntervalIndex): + if not is_interval_dtype(other): + return False + other = Index(getattr(other, '.values', other)) + + return (self.left.equals(other.left) and + self.right.equals(other.right) and + self.closed == other.closed) + + def _setop(op_name): + def func(self, other): + msg = ('can only do set operations between two IntervalIndex ' + 'objects that are closed on the same side') + other = self._as_like_interval_index(other, msg) + result = getattr(self._multiindex, op_name)(other._multiindex) + result_name = self.name if self.name == other.name else None + return type(self).from_tuples(result.values, closed=self.closed, + name=result_name) + return func + + union = _setop('union') + intersection = _setop('intersection') + difference = _setop('difference') + symmetric_differnce = _setop('symmetric_difference') + + # TODO: arithmetic operations + + +IntervalIndex._add_logical_methods_disabled() + + +def interval_range(start=None, end=None, freq=None, periods=None, + name=None, closed='right', **kwargs): + """ + Return a fixed frequency IntervalIndex + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating data + end : string or datetime-like, default None + Right bound for generating data + freq : 
interger, string or DateOffset, default 1 + periods : interger, default None + name : str, default None + Name of the resulting index + closed : string, default 'right' + options are: 'left', 'right', 'both', 'neither' + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : IntervalIndex + """ + + if freq is None: + freq = 1 + + if start is None: + if periods is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + start = end - periods * freq + elif end is None: + if periods is None or start is None: + raise ValueError("must specify 2 of start, end, periods") + end = start + periods * freq + elif periods is None: + if start is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + pass + + # must all be same units or None + arr = np.array([start, end, freq]) + if is_object_dtype(arr): + raise ValueError("start, end, freq need to be the same type") + + return IntervalIndex.from_breaks(np.arange(start, end, freq), + name=name, + closed=closed) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 74c45aac8b620..f51ed20379726 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1318,15 +1318,17 @@ def nlevels(self): def levshape(self): return tuple(len(x) for x in self.levels) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) - # work around some kind of odd cython bug try: self.get_loc(key) return True except LookupError: return False + _is_contained_in = __contains__ + def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], diff --git a/pandas/src/intervaltree.pyx b/pandas/src/intervaltree.pyx deleted file mode 100644 index 55782c930d4f8..0000000000000 --- a/pandas/src/intervaltree.pyx +++ /dev/null @@ -1,1444 +0,0 @@ - -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit 
that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. - - -from numpy cimport int64_t, float64_t -from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take -import numpy as np - -cimport cython -cimport numpy as cnp -cnp.import_array() - -from hashtable cimport Int64Vector, Int64VectorData - - -ctypedef fused scalar64_t: - float64_t - int64_t - - -NODE_CLASSES = {} - - -cdef class IntervalTree(IntervalMixin): - """A centered interval tree - - Based off the algorithm described on Wikipedia: - http://en.wikipedia.org/wiki/Interval_tree - """ - cdef: - readonly object left, right, root - readonly str closed - object _left_sorter, _right_sorter - - def __init__(self, left, right, closed='right', leaf_size=100): - """ - Parameters - ---------- - left, right : np.ndarray[ndim=1] - Left and right bounds for each interval. Assumed to contain no - NaNs. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. - leaf_size : int, optional - Parameter that controls when the tree switches from creating nodes - to brute-force search. Tune this parameter to optimize query - performance. 
- """ - if closed not in ['left', 'right', 'both', 'neither']: - raise ValueError("invalid option for 'closed': %s" % closed) - - left = np.asarray(left) - right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, dtype=dtype) - self.right = np.asarray(right, dtype=dtype) - - indices = np.arange(len(left), dtype='int64') - - self.closed = closed - - node_cls = NODE_CLASSES[str(dtype), closed] - self.root = node_cls(self.left, self.right, indices, leaf_size) - - @property - def left_sorter(self): - """How to sort the left labels; this is used for binary search - """ - if self._left_sorter is None: - self._left_sorter = np.argsort(self.left) - return self._left_sorter - - @property - def right_sorter(self): - """How to sort the right labels - """ - if self._right_sorter is None: - self._right_sorter = np.argsort(self.right) - return self._right_sorter - - def get_loc(self, scalar64_t key): - """Return all positions corresponding to intervals that overlap with - the given scalar key - """ - result = Int64Vector() - self.root.query(result, key) - if not result.data.n: - raise KeyError(key) - return result.to_array() - - def _get_partial_overlap(self, key_left, key_right, side): - """Return all positions corresponding to intervals with the given side - falling between the left and right bounds of an interval query - """ - if side == 'left': - values = self.left - sorter = self.left_sorter - else: - values = self.right - sorter = self.right_sorter - key = [key_left, key_right] - i, j = values.searchsorted(key, sorter=sorter) - return sorter[i:j] - - def get_loc_interval(self, key_left, key_right): - """Lookup the intervals enclosed in the given interval bounds - - The given interval is presumed to have closed bounds. 
- """ - import pandas as pd - left_overlap = self._get_partial_overlap(key_left, key_right, 'left') - right_overlap = self._get_partial_overlap(key_left, key_right, 'right') - enclosing = self.get_loc(0.5 * (key_left + key_right)) - combined = np.concatenate([left_overlap, right_overlap, enclosing]) - uniques = pd.unique(combined) - return uniques - - def get_indexer(self, scalar64_t[:] target): - """Return the positions corresponding to unique intervals that overlap - with the given array of scalar targets. - """ - # TODO: write get_indexer_intervals - cdef: - int64_t old_len, i - Int64Vector result - - result = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - elif result.data.n > old_len + 1: - raise KeyError( - 'indexer does not intersect a unique set of intervals') - old_len = result.data.n - return result.to_array() - - def get_indexer_non_unique(self, scalar64_t[:] target): - """Return the positions corresponding to intervals that overlap with - the given array of scalar targets. Non-unique positions are repeated. 
- """ - cdef: - int64_t old_len, i - Int64Vector result, missing - - result = Int64Vector() - missing = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - missing.append(i) - old_len = result.data.n - return result.to_array(), missing.to_array() - - def __repr__(self): - return ('' - % self.root.n_elements) - - -cdef take(ndarray source, ndarray indices): - """Take the given positions from a 1D ndarray - """ - return PyArray_Take(source, indices, 0) - - -cdef sort_values_and_indices(all_values, all_indices, subset): - indices = take(all_indices, subset) - values = take(all_values, subset) - sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) - sorted_values = take(values, sorter) - sorted_indices = take(indices, sorter) - return sorted_values, sorted_indices - - -cdef class Float64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedLeftIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'left'] = Float64ClosedLeftIntervalNode - - -cdef class Float64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedRightIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'right'] = Float64ClosedRightIntervalNode - - -cdef class Float64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedBothIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'both'] = Float64ClosedBothIntervalNode - - -cdef class Float64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedNeitherIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'neither'] = Float64ClosedNeitherIntervalNode - - -cdef class Int64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedLeftIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'left'] = Int64ClosedLeftIntervalNode - - -cdef class Int64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedRightIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'right'] = Int64ClosedRightIntervalNode - - -cdef class Int64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedBothIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'both'] = Int64ClosedBothIntervalNode - - -cdef class Int64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedNeitherIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'neither'] = Int64ClosedNeitherIntervalNode - - diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 7301c87026114..a15d7cf26cbea 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -49,7 +49,7 @@ class TestPDApi(Base, tm.TestCase): 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseSeries', 'TimeGrouper', 'Timedelta', - 'TimedeltaIndex', 'Timestamp'] + 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex'] # these are already deprecated; awaiting removal deprecated_classes = ['WidePanel', 'Panel4D', @@ -63,7 +63,7 @@ class TestPDApi(Base, tm.TestCase): # top-level functions funcs = ['bdate_range', 'concat', 'crosstab', 'cut', - 'date_range', 'eval', + 'date_range', 'interval_range', 'eval', 'factorize', 'get_dummies', 'infer_freq', 'isnull', 'lreshape', 
'melt', 'notnull', 'offsets', diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e52bfdbd4f837..f05b6fdd6bc23 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -8,7 +8,10 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, Index, MultiIndex, - RangeIndex, date_range) + RangeIndex, date_range, IntervalIndex) +from pandas.types.common import (is_object_dtype, + is_categorical_dtype, + is_interval_dtype) import pandas as pd from pandas.util.testing import (assert_series_equal, @@ -295,6 +298,17 @@ def test_set_index_dst(self): exp = pd.DataFrame({'b': [3, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp) + def test_reset_index_with_intervals(self): + idx = pd.IntervalIndex.from_breaks(np.arange(11), name='x') + original = pd.DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']] + + result = original.set_index('x') + expected = pd.DataFrame({'y': np.arange(10)}, index=idx) + assert_frame_equal(result, expected) + + result2 = result.reset_index() + assert_frame_equal(result2, original) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) @@ -730,3 +744,53 @@ def test_set_index_preserve_categorical_dtype(self): result = df.set_index(cols).reset_index() result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) + + +class TestIntervalIndex(tm.TestCase): + + def test_setitem(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + self.assertIsInstance(s.cat.categories, IntervalIndex) + + # B & D end up as Categoricals + # the remainer are converted to in-line objects + # contining an IntervalIndex.values + df['B'] = s + df['C'] = np.array(s) + df['D'] = s.values + df['E'] = np.array(s.values) + + assert is_categorical_dtype(df['B']) + assert is_interval_dtype(df['B'].cat.categories) + assert 
is_categorical_dtype(df['D']) + assert is_interval_dtype(df['D'].cat.categories) + + assert is_object_dtype(df['C']) + assert is_object_dtype(df['E']) + + # they compare equal as Index + # when converted to numpy objects + c = lambda x: Index(np.array(x)) + tm.assert_index_equal(c(df.B), c(df.B), check_names=False) + tm.assert_index_equal(c(df.B), c(df.C), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + + # B & D are the same Series + tm.assert_series_equal(df['B'], df['B'], check_names=False) + tm.assert_series_equal(df['B'], df['D'], check_names=False) + + # C & E are the same Series + tm.assert_series_equal(df['C'], df['C'], check_names=False) + tm.assert_series_equal(df['C'], df['E'], check_names=False) + + def test_set_reset_index(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + df['B'] = s + df = df.set_index('B') + + df = df.reset_index() diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cfcb531bedab8..68bdc0c6d5112 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series) + DataFrame, Categorical, Series, Interval) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm from .common import MixIn @@ -519,7 +519,8 @@ def test_groupby_categorical_two_columns(self): res = groups_double_key.agg('mean') nan = np.nan idx = MultiIndex.from_product( - [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [Categorical([Interval(1, 2), Interval(2, 3), + Interval(3, 6)], ordered=True), [1, 2, 3, 4]], names=["cat", "C2"]) exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 
800e2e8aa1cc1..25f89b29021ce 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -864,11 +864,13 @@ def test_get_group_empty_bins(self): bins = [0, 5, 10, 15] g = d.groupby(pd.cut(d[0], bins)) - result = g.get_group('(0, 5]') + # TODO: should prob allow a str of Interval work as well + # IOW '(0, 5]' + result = g.get_group(pd.Interval(0, 5)) expected = DataFrame([3, 1], index=[0, 1]) assert_frame_equal(result, expected) - self.assertRaises(KeyError, lambda: g.get_group('(10, 15]')) + self.assertRaises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) def test_get_group_grouped_by_tuple(self): # GH 8121 @@ -3866,49 +3868,6 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - def test_groupby_categorical_two_columns(self): - - # https://github.com/pydata/pandas/issues/8138 - d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"], ordered=True), - 'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]}, - index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat","ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], - "cat": ["a","a","b","b","c","c"], - "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 
3, 6], labels=pd.Categorical(['a', 'b', 'c'])) - values.name = "cat" - groups_double_key = test.groupby([values,'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], - "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) - tm.assert_frame_equal(res, exp) - def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 08f8f8d48e705..54d47d02c5e8e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,7 +7,8 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, notnull, isnull) + TimedeltaIndex, PeriodIndex, IntervalIndex, + notnull, isnull) from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp from pandas._libs.tslib import iNaT @@ -255,18 +256,21 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.values, result.values, check_same='copy') - if not isinstance(index, PeriodIndex): - result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, - check_same='same') - else: + if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index._values, result._values, check_same='same') + elif isinstance(index, IntervalIndex): + # checked in test_interval.py + pass + else: + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, 
result.values, + check_same='same') + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') def test_copy_and_deepcopy(self): from copy import copy, deepcopy @@ -377,8 +381,9 @@ def test_memory_usage(self): result2 = index.memory_usage() result3 = index.memory_usage(deep=True) - # RangeIndex doesn't use a hashtable engine - if not isinstance(index, RangeIndex): + # RangeIndex, IntervalIndex + # don't have engines + if not isinstance(index, (RangeIndex, IntervalIndex)): self.assertTrue(result2 > result) if index.inferred_type == 'object': diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a8197b070b032..cc819ff83b1dd 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -14,7 +14,7 @@ from pandas import (period_range, date_range, Series, DataFrame, Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, - PeriodIndex) + PeriodIndex, isnull) from pandas.core.index import _get_combined_index from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -504,7 +504,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] self.assertEqual(self.dateIndex.asof(d), d) - self.assertTrue(np.isnan(self.dateIndex.asof(d - timedelta(1)))) + self.assertTrue(isnull(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 0d75ba5f2bd46..b8c50239efac3 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -8,7 +8,7 @@ import numpy as np -from pandas import Categorical, compat, notnull +from pandas import Categorical, IntervalIndex, compat, notnull from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -343,11 +343,25 @@ def test_astype(self): 
self.assertIsInstance(result, Index) self.assertNotIsInstance(result, CategoricalIndex) + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], + right=[2, 4], + closed='right') + + ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True)) + + result = ci.astype('interval') + expected = ii.take([0, 1, -1]) + tm.assert_index_equal(result, expected) + + result = IntervalIndex.from_intervals(result.values) + tm.assert_index_equal(result, expected) + def test_reindex_base(self): # determined by cat ordering idx = self.create_index() - expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp) + expected = np.arange(len(idx), dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py new file mode 100644 index 0000000000000..6771b875c5137 --- /dev/null +++ b/pandas/tests/indexes/test_interval.py @@ -0,0 +1,799 @@ +from __future__ import division + +import pytest +import numpy as np + +from pandas import (Interval, IntervalIndex, Index, isnull, + interval_range, Timestamp, Timedelta) +from pandas._libs.interval import IntervalTree +from pandas.tests.indexes.common import Base +import pandas.util.testing as tm +import pandas as pd + + +class TestIntervalIndex(Base, tm.TestCase): + _holder = IntervalIndex + + def setUp(self): + self.index = IntervalIndex.from_arrays([0, 1], [1, 2]) + self.index_with_nan = IntervalIndex.from_tuples( + [(0, 1), np.nan, (1, 2)]) + self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) + + def create_index(self): + return IntervalIndex.from_breaks(np.arange(10)) + + def test_constructors(self): + expected = self.index + actual = IntervalIndex.from_breaks(np.arange(3), closed='right') + self.assertTrue(expected.equals(actual)) + + alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') + self.assertFalse(expected.equals(alternate)) + + actual = 
IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) + self.assertTrue(expected.equals(actual)) + + actual = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + self.assertTrue(expected.equals(actual)) + + actual = IntervalIndex.from_arrays(np.arange(2), np.arange(2) + 1, + closed='right') + self.assertTrue(expected.equals(actual)) + + actual = Index([Interval(0, 1), Interval(1, 2)]) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + actual = Index(expected) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + def test_constructors_other(self): + + # all-nan + result = IntervalIndex.from_intervals([np.nan]) + expected = np.array([np.nan], dtype=object) + tm.assert_numpy_array_equal(result.values, expected) + + # empty + result = IntervalIndex.from_intervals([]) + expected = np.array([], dtype=object) + tm.assert_numpy_array_equal(result.values, expected) + + def test_constructors_errors(self): + + # scalar + with pytest.raises(TypeError): + IntervalIndex(5) + + # not an interval + with pytest.raises(TypeError): + IntervalIndex([0, 1]) + + with pytest.raises(TypeError): + IntervalIndex.from_intervals([0, 1]) + + # invalid closed + with pytest.raises(ValueError): + IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') + + # mismatched closed + with pytest.raises(ValueError): + IntervalIndex.from_intervals([Interval(0, 1), + Interval(1, 2, closed='left')]) + + with pytest.raises(ValueError): + IntervalIndex.from_arrays([0, 10], [3, 5]) + + with pytest.raises(ValueError): + Index([Interval(0, 1), Interval(2, 3, closed='left')]) + + # no point in nesting periods in an IntervalIndex + with pytest.raises(ValueError): + IntervalIndex.from_breaks( + pd.period_range('2000-01-01', periods=3)) + + def test_constructors_datetimelike(self): + + # DTI / TDI + for idx in [pd.date_range('20130101', periods=5), + pd.timedelta_range('1 day', periods=5)]: + result = 
IntervalIndex.from_breaks(idx) + expected = IntervalIndex.from_breaks(idx.values) + tm.assert_index_equal(result, expected) + + expected_scalar_type = type(idx[0]) + i = result[0] + self.assertTrue(isinstance(i.left, expected_scalar_type)) + self.assertTrue(isinstance(i.right, expected_scalar_type)) + + def test_constructors_error(self): + + # non-intervals + def f(): + IntervalIndex.from_intervals([0.997, 4.0]) + self.assertRaises(TypeError, f) + + def test_properties(self): + index = self.index + self.assertEqual(len(index), 2) + self.assertEqual(index.size, 2) + self.assertEqual(index.shape, (2, )) + + self.assert_index_equal(index.left, Index([0, 1])) + self.assert_index_equal(index.right, Index([1, 2])) + self.assert_index_equal(index.mid, Index([0.5, 1.5])) + + self.assertEqual(index.closed, 'right') + + expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + # with nans + index = self.index_with_nan + self.assertEqual(len(index), 3) + self.assertEqual(index.size, 3) + self.assertEqual(index.shape, (3, )) + + self.assert_index_equal(index.left, Index([0, np.nan, 1])) + self.assert_index_equal(index.right, Index([1, np.nan, 2])) + self.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) + + self.assertEqual(index.closed, 'right') + + expected = np.array([Interval(0, 1), np.nan, + Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + def test_with_nans(self): + index = self.index + self.assertFalse(index.hasnans) + self.assert_numpy_array_equal(index.isnull(), + np.array([False, False])) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, True])) + + index = self.index_with_nan + self.assertTrue(index.hasnans) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, False, True])) + 
self.assert_numpy_array_equal(index.isnull(), + np.array([False, True, False])) + + def test_copy(self): + actual = self.index.copy() + self.assertTrue(actual.equals(self.index)) + + actual = self.index.copy(deep=True) + self.assertTrue(actual.equals(self.index)) + self.assertIsNot(actual.left, self.index.left) + + def test_ensure_copied_data(self): + # exercise the copy flag in the constructor + + # not copying + index = self.index + result = IntervalIndex(index, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='same') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='same') + + # by-definition make a copy + result = IntervalIndex.from_intervals(index.values, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='copy') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='copy') + + def test_equals(self): + + idx = self.index + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + + self.assertFalse(idx.equals(idx.astype(object))) + self.assertFalse(idx.equals(np.array(idx))) + self.assertFalse(idx.equals(list(idx))) + + self.assertFalse(idx.equals([1, 2])) + self.assertFalse(idx.equals(np.array([1, 2]))) + self.assertFalse(idx.equals( + pd.date_range('20130101', periods=2))) + + def test_astype(self): + + idx = self.index + + for dtype in [np.int64, np.float64, 'datetime64[ns]', + 'datetime64[ns, US/Eastern]', 'timedelta64', + 'period[M]']: + self.assertRaises(ValueError, idx.astype, dtype) + + result = idx.astype(object) + tm.assert_index_equal(result, Index(idx.values, dtype='object')) + self.assertFalse(idx.equals(result)) + self.assertTrue(idx.equals(IntervalIndex.from_intervals(result))) + + result = idx.astype('interval') + tm.assert_index_equal(result, idx) + self.assertTrue(result.equals(idx)) + + result = idx.astype('category') + expected = pd.Categorical(idx, ordered=True) + 
tm.assert_categorical_equal(result, expected) + + def test_where(self): + expected = self.index + result = self.index.where(self.index.notnull()) + tm.assert_index_equal(result, expected) + + idx = IntervalIndex.from_breaks([1, 2]) + result = idx.where([True, False]) + expected = IntervalIndex.from_intervals( + [Interval(1.0, 2.0, closed='right'), np.nan]) + tm.assert_index_equal(result, expected) + + def test_where_array_like(self): + pass + + def test_delete(self): + expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.delete(0) + self.assertTrue(expected.equals(actual)) + + def test_insert(self): + expected = IntervalIndex.from_breaks(range(4)) + actual = self.index.insert(2, Interval(2, 3)) + self.assertTrue(expected.equals(actual)) + + self.assertRaises(ValueError, self.index.insert, 0, 1) + self.assertRaises(ValueError, self.index.insert, 0, + Interval(2, 3, closed='left')) + + def test_take(self): + actual = self.index.take([0, 1]) + self.assertTrue(self.index.equals(actual)) + + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2]) + actual = self.index.take([0, 0, 1]) + self.assertTrue(expected.equals(actual)) + + def test_monotonic_and_unique(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_unique) + + idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) + self.assertTrue(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (1, 2)]) + self.assertFalse(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = IntervalIndex.from_tuples([(0, 2), (0, 2)]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.is_monotonic) + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr(self): + i = IntervalIndex.from_tuples([(0, 1), (1, 2)], closed='right') + expected = ("IntervalIndex(left=[0, 1]," + "\n right=[1, 2]," + "\n closed='right'," + "\n dtype='interval[int64]')") + self.assertEqual(repr(i), expected) + + i = 
IntervalIndex.from_tuples((Timestamp('20130101'), + Timestamp('20130102')), + (Timestamp('20130102'), + Timestamp('20130103')), + closed='right') + expected = ("IntervalIndex(left=['2013-01-01', '2013-01-02']," + "\n right=['2013-01-02', '2013-01-03']," + "\n closed='right'," + "\n dtype='interval[datetime64[ns]]')") + self.assertEqual(repr(i), expected) + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr_max_seq_item_setting(self): + super(TestIntervalIndex, self).test_repr_max_seq_item_setting() + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr_roundtrip(self): + super(TestIntervalIndex, self).test_repr_roundtrip() + + def test_get_item(self): + i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), + closed='right') + assert i[0] == Interval(0.0, 1.0) + assert i[1] == Interval(1.0, 2.0) + assert isnull(i[2]) + + result = i[0:1] + expected = IntervalIndex.from_arrays((0.,), (1.,), closed='right') + tm.assert_index_equal(result, expected) + + result = i[0:2] + expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed='right') + tm.assert_index_equal(result, expected) + + result = i[1:3] + expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), + closed='right') + tm.assert_index_equal(result, expected) + + def test_get_loc_value(self): + self.assertRaises(KeyError, self.index.get_loc, 0) + self.assertEqual(self.index.get_loc(0.5), 0) + self.assertEqual(self.index.get_loc(1), 0) + self.assertEqual(self.index.get_loc(1.5), 1) + self.assertEqual(self.index.get_loc(2), 1) + self.assertRaises(KeyError, self.index.get_loc, -1) + self.assertRaises(KeyError, self.index.get_loc, 3) + + idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + self.assertEqual(idx.get_loc(0.5), 0) + self.assertEqual(idx.get_loc(1), 0) + self.assert_numpy_array_equal(idx.get_loc(1.5), + np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), + np.array([0, 
1], dtype='int64')) + self.assertEqual(idx.get_loc(3), 1) + self.assertRaises(KeyError, idx.get_loc, 3.5) + + idx = IntervalIndex.from_arrays([0, 2], [1, 3]) + self.assertRaises(KeyError, idx.get_loc, 1.5) + + def slice_locs_cases(self, breaks): + # TODO: same tests for more index types + index = IntervalIndex.from_breaks([0, 1, 2], closed='right') + self.assertEqual(index.slice_locs(), (0, 2)) + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(1, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(0, 0.5), (0, 1)) + self.assertEqual(index.slice_locs(start=1), (0, 2)) + self.assertEqual(index.slice_locs(start=1.2), (1, 2)) + self.assertEqual(index.slice_locs(end=1), (0, 1)) + self.assertEqual(index.slice_locs(end=1.1), (0, 2)) + self.assertEqual(index.slice_locs(end=1.0), (0, 1)) + self.assertEqual(*index.slice_locs(-1, -1)) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(1, 1), (1, 1)) + self.assertEqual(index.slice_locs(1, 2), (1, 2)) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='both') + self.assertEqual(index.slice_locs(1, 1), (0, 2)) + self.assertEqual(index.slice_locs(1, 2), (0, 2)) + + def test_slice_locs_int64(self): + self.slice_locs_cases([0, 1, 2]) + + def test_slice_locs_float64(self): + self.slice_locs_cases([0.0, 1.0, 2.0]) + + def slice_locs_decreasing_cases(self, tuples): + index = IntervalIndex.from_tuples(tuples) + self.assertEqual(index.slice_locs(1.5, 0.5), (1, 3)) + self.assertEqual(index.slice_locs(2, 0), (1, 3)) + self.assertEqual(index.slice_locs(2, 1), (1, 3)) + self.assertEqual(index.slice_locs(3, 1.1), (0, 3)) + self.assertEqual(index.slice_locs(3, 3), (0, 2)) + 
self.assertEqual(index.slice_locs(3.5, 3.3), (0, 1)) + self.assertEqual(index.slice_locs(1, -3), (2, 3)) + self.assertEqual(*index.slice_locs(-1, -1)) + + def test_slice_locs_decreasing_int64(self): + self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) + + def test_slice_locs_decreasing_float64(self): + self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) + + def test_slice_locs_fails(self): + index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) + with self.assertRaises(KeyError): + index.slice_locs(1, 2) + + def test_get_loc_interval(self): + self.assertEqual(self.index.get_loc(Interval(0, 1)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) + self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) + self.assertRaises(KeyError, self.index.get_loc, + Interval(-1, 0, 'left')) + + def test_get_indexer(self): + actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(self.index) + expected = np.array([0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='left') + actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(index[:1]) + expected = np.array([0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(index) + expected = np.array([-1, 0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + @pytest.mark.xfail(reason="what to return for overlaps") + def test_get_indexer_subintervals(self): + # TODO + + # return indexers for wholly contained subintervals + target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) + actual = 
self.index.get_indexer(target) + expected = np.array([0, 0, 1, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) + actual = self.index.get_indexer(target) + expected = np.array([-1, 0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(target[[0, -1]]) + expected = np.array([0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') + actual = self.index.get_indexer(target) + expected = np.array([0, 0, 0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + def test_contains(self): + # only endpoints are valid + i = IntervalIndex.from_arrays([0, 1], [1, 2]) + + # invalid + self.assertNotIn(0, i) + self.assertNotIn(1, i) + self.assertNotIn(2, i) + + # valid + self.assertIn(Interval(0, 1), i) + self.assertIn(Interval(0, 2), i) + self.assertIn(Interval(0, 0.5), i) + self.assertNotIn(Interval(3, 5), i) + self.assertNotIn(Interval(-1, 0, closed='left'), i) + + def test_is_contained_in(self): + # can select values that are IN the range of a value + i = IntervalIndex.from_arrays([0, 1], [1, 2]) + + assert i._is_contained_in(0.1) + assert i._is_contained_in(0.5) + assert i._is_contained_in(1) + assert i._is_contained_in(Interval(0, 1)) + assert i._is_contained_in(Interval(0, 2)) + + # these overlaps completely + assert i._is_contained_in(Interval(0, 3)) + assert i._is_contained_in(Interval(1, 3)) + + assert not i._is_contained_in(20) + assert not i._is_contained_in(-20) + + def test_dropna(self): + + expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)]) + + ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan]) + result = ii.dropna() + tm.assert_index_equal(result, expected) + + ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan]) + result = ii.dropna() + tm.assert_index_equal(result, expected) + + def 
test_non_contiguous(self): + index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) + target = [0.5, 1.5, 2.5] + actual = index.get_indexer(target) + expected = np.array([0, -1, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + self.assertNotIn(1.5, index) + + def test_union(self): + other = IntervalIndex.from_arrays([2], [3]) + expected = IntervalIndex.from_arrays(range(3), range(1, 4)) + actual = self.index.union(other) + self.assertTrue(expected.equals(actual)) + + actual = other.union(self.index) + self.assertTrue(expected.equals(actual)) + + tm.assert_index_equal(self.index.union(self.index), self.index) + tm.assert_index_equal(self.index.union(self.index[:1]), + self.index) + + def test_intersection(self): + other = IntervalIndex.from_breaks([1, 2, 3]) + expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.intersection(other) + self.assertTrue(expected.equals(actual)) + + tm.assert_index_equal(self.index.intersection(self.index), + self.index) + + def test_difference(self): + tm.assert_index_equal(self.index.difference(self.index[:1]), + self.index[1:]) + + def test_symmetric_difference(self): + result = self.index[:1].symmetric_difference(self.index[1:]) + expected = self.index + tm.assert_index_equal(result, expected) + + def test_set_operation_errors(self): + self.assertRaises(ValueError, self.index.union, self.index.left) + + other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertRaises(ValueError, self.index.union, other) + + def test_isin(self): + actual = self.index.isin(self.index) + self.assert_numpy_array_equal(np.array([True, True]), actual) + + actual = self.index.isin(self.index[:1]) + self.assert_numpy_array_equal(np.array([True, False]), actual) + + def test_comparison(self): + actual = Interval(0, 1) < self.index + expected = np.array([False, True]) + self.assert_numpy_array_equal(actual, expected) + + actual = Interval(0.5, 1.5) < self.index + expected = np.array([False, True]) + 
self.assert_numpy_array_equal(actual, expected) + actual = self.index > Interval(0.5, 1.5) + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index + expected = np.array([True, True]) + self.assert_numpy_array_equal(actual, expected) + actual = self.index <= self.index + self.assert_numpy_array_equal(actual, expected) + actual = self.index >= self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index < self.index + expected = np.array([False, False]) + self.assert_numpy_array_equal(actual, expected) + actual = self.index > self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index.values + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index.values == self.index + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index <= self.index.values + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index != self.index.values + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index > self.index.values + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index.values > self.index + self.assert_numpy_array_equal(actual, np.array([False, False])) + + # invalid comparisons + actual = self.index == 0 + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index == self.index.left + self.assert_numpy_array_equal(actual, np.array([False, False])) + + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index > 0 + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index <= 0 + with self.assertRaises(TypeError): + self.index > np.arange(2) + with self.assertRaises(ValueError): + self.index > np.arange(3) + + def test_missing_values(self): + idx = pd.Index([np.nan, 
pd.Interval(0, 1), pd.Interval(1, 2)]) + idx2 = pd.IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2]) + assert idx.equals(idx2) + + with pytest.raises(ValueError): + IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2])) + + self.assert_numpy_array_equal(isnull(idx), + np.array([True, False, False])) + + def test_sort_values(self): + expected = IntervalIndex.from_breaks([1, 2, 3, 4]) + actual = IntervalIndex.from_tuples([(3, 4), (1, 2), + (2, 3)]).sort_values() + tm.assert_index_equal(expected, actual) + + # nan + idx = self.index_with_nan + mask = idx.isnull() + self.assert_numpy_array_equal(mask, np.array([False, True, False])) + + result = idx.sort_values() + mask = result.isnull() + self.assert_numpy_array_equal(mask, np.array([False, False, True])) + + result = idx.sort_values(ascending=False) + mask = result.isnull() + self.assert_numpy_array_equal(mask, np.array([True, False, False])) + + def test_datetime(self): + dates = pd.date_range('2000', periods=3) + idx = IntervalIndex.from_breaks(dates) + + tm.assert_index_equal(idx.left, dates[:2]) + tm.assert_index_equal(idx.right, dates[-2:]) + + expected = pd.date_range('2000-01-01T12:00', periods=2) + tm.assert_index_equal(idx.mid, expected) + + self.assertNotIn(pd.Timestamp('2000-01-01T12'), idx) + self.assertNotIn(pd.Timestamp('2000-01-01T12'), idx) + + target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') + actual = idx.get_indexer(target) + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + def test_append(self): + + index1 = IntervalIndex.from_arrays([0, 1], [1, 2]) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3]) + + result = index1.append(index2) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3]) + tm.assert_index_equal(result, expected) + + result = index1.append([index1, index2]) + expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], + [1, 2, 1, 2, 2, 3]) + tm.assert_index_equal(result, 
expected) + + def f(): + index1.append(IntervalIndex.from_arrays([0, 1], [1, 2], + closed='both')) + + self.assertRaises(ValueError, f) + + +class TestIntervalRange(tm.TestCase): + + def test_construction(self): + result = interval_range(0, 5, name='foo', closed='both') + expected = IntervalIndex.from_breaks( + np.arange(0, 5), name='foo', closed='both') + tm.assert_index_equal(result, expected) + + def test_errors(self): + + # not enough params + def f(): + interval_range(0) + + self.assertRaises(ValueError, f) + + def f(): + interval_range(periods=2) + + self.assertRaises(ValueError, f) + + def f(): + interval_range() + + self.assertRaises(ValueError, f) + + # mixed units + def f(): + interval_range(0, Timestamp('20130101'), freq=2) + + self.assertRaises(ValueError, f) + + def f(): + interval_range(0, 10, freq=Timedelta('1day')) + + self.assertRaises(ValueError, f) + + +class TestIntervalTree(tm.TestCase): + def setUp(self): + gentree = lambda dtype: IntervalTree(np.arange(5, dtype=dtype), + np.arange(5, dtype=dtype) + 2) + self.tree = gentree('int64') + self.trees = {dtype: gentree(dtype) + for dtype in ['int32', 'int64', 'float32', 'float64']} + + def test_get_loc(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal(tree.get_loc(1), + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(tree.get_loc(2)), + np.array([0, 1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_loc(-1) + + def test_get_indexer(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal( + tree.get_indexer(np.array([1.0, 5.5, 6.5])), + np.array([0, 4, -1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_indexer(np.array([3.0])) + + def test_get_indexer_non_unique(self): + indexer, missing = self.tree.get_indexer_non_unique( + np.array([1.0, 2.0, 6.5])) + self.assert_numpy_array_equal(indexer[:1], + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[1:3]), + 
np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[3:]), + np.array([-1], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([2], dtype='int64')) + + def test_duplicates(self): + tree = IntervalTree([0, 0, 0], [1, 1, 1]) + self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), + np.array([0, 1, 2], dtype='int64')) + + with self.assertRaises(KeyError): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + self.assert_numpy_array_equal(np.sort(indexer), + np.array([0, 1, 2], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([], dtype='int64')) + + def test_get_loc_closed(self): + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree([0], [1], closed=closed) + for p, errors in [(0, tree.open_left), + (1, tree.open_right)]: + if errors: + with self.assertRaises(KeyError): + tree.get_loc(p) + else: + self.assert_numpy_array_equal(tree.get_loc(p), + np.array([0], dtype='int64')) + + def test_get_indexer_closed(self): + x = np.arange(1000, dtype='int64') + found = x + not_found = (-1 * np.ones(1000)).astype('int64') + for leaf_size in [1, 10, 100, 10000]: + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree(x, x + 0.5, closed=closed, + leaf_size=leaf_size) + self.assert_numpy_array_equal(found, + tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.5)) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py new file mode 100644 index 0000000000000..a5432be0da466 --- /dev/null +++ b/pandas/tests/indexing/test_interval.py @@ -0,0 +1,141 @@ +import pytest +import numpy as np +import pandas as pd + +from pandas import Series, DataFrame, IntervalIndex, Interval 
+import pandas.util.testing as tm + + +class TestIntervalIndex(tm.TestCase): + + def setUp(self): + self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_loc_getitem_series(self): + + s = self.s + expected = 0 + self.assertEqual(expected, s.loc[0.5]) + self.assertEqual(expected, s.loc[1]) + self.assertEqual(expected, s.loc[Interval(0, 1)]) + self.assertRaises(KeyError, s.loc.__getitem__, 0) + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s.loc[:3]) + tm.assert_series_equal(expected, s.loc[:2.5]) + tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) + tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s.loc[s >= 2]) + + expected = s.iloc[2:5] + result = s.loc[[pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + expected = s.iloc[2:4] + result = s.loc[[pd.Interval(3, 5)]] + tm.assert_series_equal(expected, result) + + expected = s.iloc[[2, 3, 4, 2, 3, 4]] + result = s.loc[[pd.Interval(3, 6), pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + # slice of interval + with pytest.raises(NotImplementedError): + result = s.loc[pd.Interval(3, 6):] + + def test_loc_non_matching(self): + s = self.s + + # TODO: We are getting at least 1 matching + # interval so this meets our current semantics + expected = s.iloc[[2, 3, 4]] + result = s.loc[[-1, 3, 4, 5]] + tm.assert_series_equal(expected, result) + + def test_getitem_series(self): + + s = self.s + expected = 0 + self.assertEqual(expected, s[0.5]) + self.assertEqual(expected, s[1]) + self.assertEqual(expected, s[Interval(0, 1)]) + self.assertRaises(KeyError, s.__getitem__, 0) + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + 
tm.assert_series_equal(expected, s[0.1:2.5]) + tm.assert_series_equal(expected, s[-1:3]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s[[2, 3, 4]]) + tm.assert_series_equal(expected, s[[1.5, 3, 4]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s[s >= 2]) + + expected = s.iloc[2:5] + result = s[[pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + # slice of interval + with pytest.raises(NotImplementedError): + result = s[pd.Interval(3, 6):] + + # slice of scalar + with pytest.raises(NotImplementedError): + s[0:4:2] + + def test_large_series(self): + s = Series(np.arange(1000000), + index=IntervalIndex.from_breaks(np.arange(1000001))) + + result1 = s.loc[:80000] + result2 = s.loc[0:80000] + result3 = s.loc[0:80000:1] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + def test_loc_getitem_frame(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + df['B'] = s + df = df.set_index('B') + + result = df.loc[4] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[10] + + self.assertRaises(KeyError, f) + + # single list-like + result = df.loc[[4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + # non-unique + result = df.loc[[4, 5]] + expected = df.take([4, 5, 4, 5]) + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[[10]] + + self.assertRaises(KeyError, f) + + # partial missing + result = df.loc[[10, 4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/test_interval.py b/pandas/tests/scalar/test_interval.py new file mode 100644 index 0000000000000..63e57fb472861 --- /dev/null +++ b/pandas/tests/scalar/test_interval.py @@ -0,0 +1,129 @@ +from __future__ import division + +import pytest +from pandas import Interval +import pandas.util.testing as tm + + +class TestInterval(tm.TestCase): + def 
setUp(self): + self.interval = Interval(0, 1) + + def test_properties(self): + self.assertEqual(self.interval.closed, 'right') + self.assertEqual(self.interval.left, 0) + self.assertEqual(self.interval.right, 1) + self.assertEqual(self.interval.mid, 0.5) + + def test_repr(self): + self.assertEqual(repr(self.interval), + "Interval(0, 1, closed='right')") + self.assertEqual(str(self.interval), "(0, 1]") + + interval_left = Interval(0, 1, closed='left') + self.assertEqual(repr(interval_left), + "Interval(0, 1, closed='left')") + self.assertEqual(str(interval_left), "[0, 1)") + + def test_contains(self): + self.assertIn(0.5, self.interval) + self.assertIn(1, self.interval) + self.assertNotIn(0, self.interval) + self.assertRaises(TypeError, lambda: self.interval in self.interval) + + interval = Interval(0, 1, closed='both') + self.assertIn(0, interval) + self.assertIn(1, interval) + + interval = Interval(0, 1, closed='neither') + self.assertNotIn(0, interval) + self.assertIn(0.5, interval) + self.assertNotIn(1, interval) + + def test_equal(self): + self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) + self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) + self.assertNotEqual(Interval(0, 1), 0) + + def test_comparison(self): + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + Interval(0, 1) < 2 + + self.assertTrue(Interval(0, 1) < Interval(1, 2)) + self.assertTrue(Interval(0, 1) < Interval(0, 2)) + self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) + self.assertTrue(Interval(0, 1) <= Interval(0, 1)) + self.assertTrue(Interval(0, 1) > Interval(-1, 2)) + self.assertTrue(Interval(0, 1) >= Interval(0, 1)) + + def test_hash(self): + # should not raise + hash(self.interval) + + def test_math_add(self): + expected = Interval(1, 2) + actual = self.interval + 1 + self.assertEqual(expected, actual) + + expected = Interval(1, 2) + actual = 1 + self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual += 1 + 
self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval + Interval(1, 2) + + with pytest.raises(TypeError): + self.interval + 'foo' + + def test_math_sub(self): + expected = Interval(-1, 0) + actual = self.interval - 1 + self.assertEqual(expected, actual) + + actual = self.interval + actual -= 1 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval - Interval(1, 2) + + with pytest.raises(TypeError): + self.interval - 'foo' + + def test_math_mult(self): + expected = Interval(0, 2) + actual = self.interval * 2 + self.assertEqual(expected, actual) + + expected = Interval(0, 2) + actual = 2 * self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual *= 2 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval * Interval(1, 2) + + with pytest.raises(TypeError): + self.interval * 'foo' + + def test_math_div(self): + expected = Interval(0, 0.5) + actual = self.interval / 2.0 + self.assertEqual(expected, actual) + + actual = self.interval + actual /= 2.0 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval / Interval(1, 2) + + with pytest.raises(TypeError): + self.interval / 'foo' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index dbe2db67359f3..5822489c06a38 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -10,8 +10,7 @@ from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import (Index, Series, isnull, date_range, - period_range, NaT) -from pandas.core.index import MultiIndex + NaT, period_range, MultiIndex, IntervalIndex) from pandas.tseries.index import Timestamp, DatetimeIndex from pandas._libs import lib @@ -543,6 +542,17 @@ def test_constructor_with_datetime_tz(self): expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) + 
def test_construction_interval(self): + # construction from interval & array of intervals + index = IntervalIndex.from_breaks(np.arange(3), closed='right') + result = Series(index) + repr(result) + str(result) + tm.assert_index_equal(Index(result.values), index) + + result = Series(index.values) + tm.assert_index_equal(Index(result.values), index) + def test_construction_consistency(self): # make sure that we are not re-localizing upon construction diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index ea49abeee21c5..4a3332c2de6d8 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import (Series, DataFrame, isnull, date_range, - MultiIndex, Index, Timestamp, NaT) + MultiIndex, Index, Timestamp, NaT, IntervalIndex) from pandas.compat import range from pandas._libs.tslib import iNaT from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -556,6 +556,15 @@ def test_dropna_no_nan(self): s2.dropna(inplace=True) self.assert_series_equal(s2, s) + def test_dropna_intervals(self): + s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( + [np.nan, 0, 1, 2], + [np.nan, 1, 2, 3])) + + result = s.dropna() + expected = s.iloc[1:] + assert_series_equal(result, expected) + def test_valid(self): ts = self.ts.copy() ts[::2] = np.NaN diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 135521f287f7c..19c9b69ff1988 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -7,7 +7,9 @@ from datetime import datetime from itertools import permutations from pandas import (Series, Categorical, CategoricalIndex, Index, - Timestamp, DatetimeIndex) +from pandas import (Series, Categorical, CategoricalIndex, + Timestamp, DatetimeIndex, + Index, IntervalIndex) import pandas as pd from pandas import compat @@ -590,8 +592,9 @@ def test_value_counts(self): # tm.assertIsInstance(factor, n) result = 
algos.value_counts(factor) - breaks = [-1.192, -0.535, 0.121, 0.777, 1.433] - expected_index = pd.IntervalIndex.from_breaks(breaks) + breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] + expected_index = pd.IntervalIndex.from_breaks( + breaks).astype('category') expected = Series([1, 1, 1, 1], index=expected_index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -599,13 +602,15 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - self.assertEqual(result.tolist(), [4]) - self.assertEqual(result.index[0], pd.Interval(0.999, 4.0)) + expected = Series([4], + index=IntervalIndex.from_tuples([(0.996, 4.0)])) + tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - self.assertEqual(result.tolist(), [2, 2]) - self.assertEqual(result.index.min(), pd.Interval(0.999, 2.5)) - self.assertEqual(result.index.max(), pd.Interval(2.5, 4.0)) + expected = Series([2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), + (2.5, 4.0)])) + tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): result = algos.value_counts([1, 1.]) @@ -657,6 +662,7 @@ def test_categorical(self): result = s.value_counts() expected = Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c'])) + tm.assert_series_equal(result, expected, check_index_type=True) # preserve order? 
@@ -670,12 +676,13 @@ def test_categorical_nans(self): s.iloc[1] = np.nan result = s.value_counts() expected = Series([4, 3, 2], index=pd.CategoricalIndex( + ['a', 'b', 'c'], categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) expected = Series([ 4, 3, 2, 1 - ], index=pd.CategoricalIndex(['a', 'b', 'c', np.nan])) + ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1fe449fa26aef..4a1cf6314aaed 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -31,7 +31,6 @@ def test_string_methods_dont_fail(self): unicode(self.container) # noqa def test_tricky_container(self): - import nose if not hasattr(self, 'unicode_container'): pytest.skip('Need unicode_container to test with this') repr(self.unicode_container) @@ -576,10 +575,10 @@ def test_value_counts_bins(self): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.999, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.999, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -590,12 +589,20 @@ def test_value_counts_bins(self): self.assertEqual(s1.nunique(), 3) - res4 = s1.value_counts(bins=4) - intervals = IntervalIndex.from_breaks([0.999, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1], index=intervals.take([0, 3, 1])) + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 
2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25], index=intervals.take([0, 3, 1])) + exp4n = Series([0.5, 0.25, 0.25, 0], + index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index fe37fa000e687..8600b2d726e49 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -22,7 +22,7 @@ date_range, DatetimeIndex, period_range, PeriodIndex, timedelta_range, TimedeltaIndex, NaT, - Interval) + Interval, IntervalIndex) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -122,6 +122,16 @@ def test_constructor_unsortable(self): self.assertRaises( TypeError, lambda: Categorical(arr, ordered=True)) + def test_constructor_interval(self): + result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], + ordered=True) + ii = IntervalIndex.from_intervals([Interval(1, 2), + Interval(2, 3), + Interval(3, 6)]) + exp = Categorical(ii, ordered=True) + self.assert_categorical_equal(result, exp) + tm.assert_index_equal(result.categories, ii) + def test_is_equal_dtype(self): # test dtype comparisons between cats diff --git a/pandas/tests/test_interval.py b/pandas/tests/test_interval.py deleted file mode 100644 index 1b52e2629b38c..0000000000000 --- a/pandas/tests/test_interval.py +++ /dev/null @@ -1,591 +0,0 @@ -from __future__ import division -import numpy as np - -from pandas.core.interval import Interval, IntervalIndex -from pandas.core.index import Index -from pandas.lib import IntervalTree - -import pandas.util.testing as tm -import pandas as pd - - -class TestInterval(tm.TestCase): - def setUp(self): - self.interval = Interval(0, 1) - - def test_properties(self): - self.assertEqual(self.interval.closed, 'right') - 
self.assertEqual(self.interval.left, 0) - self.assertEqual(self.interval.right, 1) - self.assertEqual(self.interval.mid, 0.5) - - def test_repr(self): - self.assertEqual(repr(self.interval), - "Interval(0, 1, closed='right')") - self.assertEqual(str(self.interval), "(0, 1]") - - interval_left = Interval(0, 1, closed='left') - self.assertEqual(repr(interval_left), - "Interval(0, 1, closed='left')") - self.assertEqual(str(interval_left), "[0, 1)") - - def test_contains(self): - self.assertIn(0.5, self.interval) - self.assertIn(1, self.interval) - self.assertNotIn(0, self.interval) - self.assertRaises(TypeError, lambda: self.interval in self.interval) - - interval = Interval(0, 1, closed='both') - self.assertIn(0, interval) - self.assertIn(1, interval) - - interval = Interval(0, 1, closed='neither') - self.assertNotIn(0, interval) - self.assertIn(0.5, interval) - self.assertNotIn(1, interval) - - def test_equal(self): - self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) - self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) - self.assertNotEqual(Interval(0, 1), 0) - - def test_comparison(self): - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - Interval(0, 1) < 2 - - self.assertTrue(Interval(0, 1) < Interval(1, 2)) - self.assertTrue(Interval(0, 1) < Interval(0, 2)) - self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) - self.assertTrue(Interval(0, 1) <= Interval(0, 1)) - self.assertTrue(Interval(0, 1) > Interval(-1, 2)) - self.assertTrue(Interval(0, 1) >= Interval(0, 1)) - - def test_hash(self): - # should not raise - hash(self.interval) - - def test_math_add(self): - expected = Interval(1, 2) - actual = self.interval + 1 - self.assertEqual(expected, actual) - - expected = Interval(1, 2) - actual = 1 + self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual += 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval + Interval(1, 2) - - def 
test_math_sub(self): - expected = Interval(-1, 0) - actual = self.interval - 1 - self.assertEqual(expected, actual) - - actual = self.interval - actual -= 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval - Interval(1, 2) - - def test_math_mult(self): - expected = Interval(0, 2) - actual = self.interval * 2 - self.assertEqual(expected, actual) - - expected = Interval(0, 2) - actual = 2 * self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual *= 2 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval * Interval(1, 2) - - def test_math_div(self): - expected = Interval(0, 0.5) - actual = self.interval / 2.0 - self.assertEqual(expected, actual) - - actual = self.interval - actual /= 2.0 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval / Interval(1, 2) - - -class TestIntervalTree(tm.TestCase): - def setUp(self): - self.tree = IntervalTree(np.arange(5), np.arange(5) + 2) - - def test_get_loc(self): - self.assert_numpy_array_equal(self.tree.get_loc(1), [0]) - self.assert_numpy_array_equal(np.sort(self.tree.get_loc(2)), [0, 1]) - with self.assertRaises(KeyError): - self.tree.get_loc(-1) - - def test_get_indexer(self): - self.assert_numpy_array_equal( - self.tree.get_indexer(np.array([1.0, 5.5, 6.5])), [0, 4, -1]) - with self.assertRaises(KeyError): - self.tree.get_indexer(np.array([3.0])) - - def test_get_indexer_non_unique(self): - indexer, missing = self.tree.get_indexer_non_unique( - np.array([1.0, 2.0, 6.5])) - self.assert_numpy_array_equal(indexer[:1], [0]) - self.assert_numpy_array_equal(np.sort(indexer[1:3]), [0, 1]) - self.assert_numpy_array_equal(np.sort(indexer[3:]), [-1]) - self.assert_numpy_array_equal(missing, [2]) - - def test_duplicates(self): - tree = IntervalTree([0, 0, 0], [1, 1, 1]) - self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), [0, 1, 2]) - - with self.assertRaises(KeyError): - 
tree.get_indexer(np.array([0.5])) - - indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) - self.assert_numpy_array_equal(np.sort(indexer), [0, 1, 2]) - self.assert_numpy_array_equal(missing, []) - - def test_get_loc_closed(self): - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), - (1, tree.open_right)]: - if errors: - with self.assertRaises(KeyError): - tree.get_loc(p) - else: - self.assert_numpy_array_equal(tree.get_loc(p), - np.array([0])) - - def test_get_indexer_closed(self): - x = np.arange(1000) - found = x - not_found = -np.ones(1000) - for leaf_size in [1, 10, 100, 10000]: - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree(x, x + 0.5, closed=closed, - leaf_size=leaf_size) - self.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) - - expected = found if tree.closed_left else not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0)) - - expected = found if tree.closed_right else not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) - - -class TestIntervalIndex(tm.TestCase): - def setUp(self): - self.index = IntervalIndex([0, 1], [1, 2]) - - def test_constructors(self): - expected = self.index - actual = IntervalIndex.from_breaks(np.arange(3), closed='right') - self.assertTrue(expected.equals(actual)) - - alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') - self.assertFalse(expected.equals(alternate)) - - actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - self.assertTrue(expected.equals(actual)) - - self.assertRaises(ValueError, IntervalIndex, [0], [1], closed='invalid') - - # TODO: fix all these commented out tests (here and below) - - intervals = [Interval(0, 1), Interval(1, 2, closed='left')] - with self.assertRaises(ValueError): - IntervalIndex.from_intervals(intervals) - - with self.assertRaises(ValueError): - IntervalIndex([0, 10], 
[3, 5]) - - actual = Index([Interval(0, 1), Interval(1, 2)]) - self.assertIsInstance(actual, IntervalIndex) - self.assertTrue(expected.equals(actual)) - - actual = Index(expected) - self.assertIsInstance(actual, IntervalIndex) - self.assertTrue(expected.equals(actual)) - - # no point in nesting periods in an IntervalIndex - # self.assertRaises(ValueError, IntervalIndex.from_breaks, - # pd.period_range('2000-01-01', periods=3)) - - def test_properties(self): - self.assertEqual(len(self.index), 2) - self.assertEqual(self.index.size, 2) - - self.assert_numpy_array_equal(self.index.left, [0, 1]) - self.assertIsInstance(self.index.left, Index) - - self.assert_numpy_array_equal(self.index.right, [1, 2]) - self.assertIsInstance(self.index.right, Index) - - self.assert_numpy_array_equal(self.index.mid, [0.5, 1.5]) - self.assertIsInstance(self.index.mid, Index) - - self.assertEqual(self.index.closed, 'right') - - expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) - self.assert_numpy_array_equal(np.asarray(self.index), expected) - self.assert_numpy_array_equal(self.index.values, expected) - - def test_copy(self): - actual = self.index.copy() - self.assertTrue(actual.equals(self.index)) - - actual = self.index.copy(deep=True) - self.assertTrue(actual.equals(self.index)) - self.assertIsNot(actual.left, self.index.left) - - def test_delete(self): - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.delete(0) - self.assertTrue(expected.equals(actual)) - - def test_insert(self): - expected = IntervalIndex.from_breaks(range(4)) - actual = self.index.insert(2, Interval(2, 3)) - self.assertTrue(expected.equals(actual)) - - self.assertRaises(ValueError, self.index.insert, 0, 1) - self.assertRaises(ValueError, self.index.insert, 0, - Interval(2, 3, closed='left')) - - def test_take(self): - actual = self.index.take([0, 1]) - self.assertTrue(self.index.equals(actual)) - - expected = IntervalIndex([0, 0, 1], [1, 1, 2]) - actual = self.index.take([0, 0, 
1]) - self.assertTrue(expected.equals(actual)) - - def test_monotonic_and_unique(self): - self.assertTrue(self.index.is_monotonic) - self.assertTrue(self.index.is_unique) - - idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) - self.assertTrue(idx.is_monotonic) - self.assertTrue(idx.is_unique) - - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (1, 2)]) - self.assertFalse(idx.is_monotonic) - self.assertTrue(idx.is_unique) - - idx = IntervalIndex.from_tuples([(0, 2), (0, 2)]) - self.assertFalse(idx.is_unique) - self.assertTrue(idx.is_monotonic) - - def test_repr(self): - expected = ("IntervalIndex(left=[0, 1],\n right=[1, 2]," - "\n closed='right')") - IntervalIndex((0, 1), (1, 2), closed='right') - self.assertEqual(repr(self.index), expected) - - def test_get_loc_value(self): - self.assertRaises(KeyError, self.index.get_loc, 0) - self.assertEqual(self.index.get_loc(0.5), 0) - self.assertEqual(self.index.get_loc(1), 0) - self.assertEqual(self.index.get_loc(1.5), 1) - self.assertEqual(self.index.get_loc(2), 1) - self.assertRaises(KeyError, self.index.get_loc, -1) - self.assertRaises(KeyError, self.index.get_loc, 3) - - idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) - self.assertEqual(idx.get_loc(0.5), 0) - self.assertEqual(idx.get_loc(1), 0) - self.assert_numpy_array_equal(idx.get_loc(1.5), [0, 1]) - self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), [0, 1]) - self.assertEqual(idx.get_loc(3), 1) - self.assertRaises(KeyError, idx.get_loc, 3.5) - - idx = IntervalIndex([0, 2], [1, 3]) - self.assertRaises(KeyError, idx.get_loc, 1.5) - - def slice_locs_cases(self, breaks): - # TODO: same tests for more index types - index = IntervalIndex.from_breaks([0, 1, 2], closed='right') - self.assertEqual(index.slice_locs(), (0, 2)) - self.assertEqual(index.slice_locs(0, 1), (0, 1)) - self.assertEqual(index.slice_locs(1, 1), (0, 1)) - self.assertEqual(index.slice_locs(0, 2), (0, 2)) - self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) - 
self.assertEqual(index.slice_locs(0, 0.5), (0, 1)) - self.assertEqual(index.slice_locs(start=1), (0, 2)) - self.assertEqual(index.slice_locs(start=1.2), (1, 2)) - self.assertEqual(index.slice_locs(end=1), (0, 1)) - self.assertEqual(index.slice_locs(end=1.1), (0, 2)) - self.assertEqual(index.slice_locs(end=1.0), (0, 1)) - self.assertEqual(*index.slice_locs(-1, -1)) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - self.assertEqual(index.slice_locs(0, 1), (0, 1)) - self.assertEqual(index.slice_locs(0, 2), (0, 2)) - self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) - self.assertEqual(index.slice_locs(1, 1), (1, 1)) - self.assertEqual(index.slice_locs(1, 2), (1, 2)) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='both') - self.assertEqual(index.slice_locs(1, 1), (0, 2)) - self.assertEqual(index.slice_locs(1, 2), (0, 2)) - - def test_slice_locs_int64(self): - self.slice_locs_cases([0, 1, 2]) - - def test_slice_locs_float64(self): - self.slice_locs_cases([0.0, 1.0, 2.0]) - - def slice_locs_decreasing_cases(self, tuples): - index = IntervalIndex.from_tuples(tuples) - self.assertEqual(index.slice_locs(1.5, 0.5), (1, 3)) - self.assertEqual(index.slice_locs(2, 0), (1, 3)) - self.assertEqual(index.slice_locs(2, 1), (1, 3)) - self.assertEqual(index.slice_locs(3, 1.1), (0, 3)) - self.assertEqual(index.slice_locs(3, 3), (0, 2)) - self.assertEqual(index.slice_locs(3.5, 3.3), (0, 1)) - self.assertEqual(index.slice_locs(1, -3), (2, 3)) - self.assertEqual(*index.slice_locs(-1, -1)) - - def test_slice_locs_decreasing_int64(self): - self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) - - def test_slice_locs_decreasing_float64(self): - self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) - - def test_slice_locs_fails(self): - index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) - with self.assertRaises(KeyError): - index.slice_locs(1, 2) - - def test_get_loc_interval(self): - self.assertEqual(self.index.get_loc(Interval(0, 1)), 0) - 
self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) - self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) - self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) - self.assertRaises(KeyError, self.index.get_loc, Interval(-1, 0, 'left')) - - def test_get_indexer(self): - actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = [-1, -1, 0, 0, 1, 1, -1] - self.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(self.index) - expected = [0, 1] - self.assert_numpy_array_equal(actual, expected) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='left') - actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = [-1, 0, 0, 1, 1, -1, -1] - self.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index[:1]) - expected = [0] - self.assert_numpy_array_equal(actual, expected) - - self.assertRaises(ValueError, self.index.get_indexer, index) - - def test_get_indexer_subintervals(self): - # return indexers for wholly contained subintervals - target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) - actual = self.index.get_indexer(target) - expected = [0, 0, 1, 1] - self.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) - self.assertRaises(ValueError, self.index.get_indexer, target) - - actual = self.index.get_indexer(target[[0, -1]]) - expected = [0, 1] - self.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') - actual = self.index.get_indexer(target) - expected = [0, 0, 0] - self.assert_numpy_array_equal(actual, expected) - - def test_contains(self): - self.assertNotIn(0, self.index) - self.assertIn(0.5, self.index) - self.assertIn(2, self.index) - - self.assertIn(Interval(0, 1), self.index) - self.assertIn(Interval(0, 2), self.index) - self.assertIn(Interval(0, 0.5), self.index) - self.assertNotIn(Interval(3, 5), self.index) - 
self.assertNotIn(Interval(-1, 0, closed='left'), self.index) - - def test_non_contiguous(self): - index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) - target = [0.5, 1.5, 2.5] - actual = index.get_indexer(target) - expected = [0, -1, 1] - self.assert_numpy_array_equal(actual, expected) - - self.assertNotIn(1.5, index) - - def test_union(self): - other = IntervalIndex([2], [3]) - expected = IntervalIndex(range(3), range(1, 4)) - actual = self.index.union(other) - self.assertTrue(expected.equals(actual)) - - actual = other.union(self.index) - self.assertTrue(expected.equals(actual)) - - self.assert_numpy_array_equal(self.index.union(self.index), self.index) - self.assert_numpy_array_equal(self.index.union(self.index[:1]), - self.index) - - def test_intersection(self): - other = IntervalIndex.from_breaks([1, 2, 3]) - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.intersection(other) - self.assertTrue(expected.equals(actual)) - - self.assert_numpy_array_equal(self.index.intersection(self.index), - self.index) - - def test_difference(self): - self.assert_numpy_array_equal(self.index.difference(self.index[:1]), - self.index[1:]) - - def test_sym_diff(self): - self.assert_numpy_array_equal(self.index[:1].sym_diff(self.index[1:]), - self.index) - - def test_set_operation_errors(self): - self.assertRaises(ValueError, self.index.union, self.index.left) - - other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - self.assertRaises(ValueError, self.index.union, other) - - def test_isin(self): - actual = self.index.isin(self.index) - self.assert_numpy_array_equal([True, True], actual) - - actual = self.index.isin(self.index[:1]) - self.assert_numpy_array_equal([True, False], actual) - - def test_comparison(self): - actual = Interval(0, 1) < self.index - expected = [False, True] - self.assert_numpy_array_equal(actual, expected) - - actual = Interval(0.5, 1.5) < self.index - expected = [False, True] - self.assert_numpy_array_equal(actual, expected) - 
actual = self.index > Interval(0.5, 1.5) - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == self.index - expected = [True, True] - self.assert_numpy_array_equal(actual, expected) - actual = self.index <= self.index - self.assert_numpy_array_equal(actual, expected) - actual = self.index >= self.index - self.assert_numpy_array_equal(actual, expected) - - actual = self.index < self.index - expected = [False, False] - self.assert_numpy_array_equal(actual, expected) - actual = self.index > self.index - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == self.index.values - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index.values == self.index - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index <= self.index.values - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index != self.index.values - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index > self.index.values - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index.values > self.index - self.assert_numpy_array_equal(actual, [False, False]) - - # invalid comparisons - actual = self.index == 0 - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index == self.index.left - self.assert_numpy_array_equal(actual, [False, False]) - - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - self.index > 0 - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - self.index <= 0 - with self.assertRaises(TypeError): - self.index > np.arange(2) - with self.assertRaises(ValueError): - self.index > np.arange(3) - - def test_missing_values(self): - idx = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) - idx2 = pd.IntervalIndex([np.nan, 0, 1], [np.nan, 1, 2]) - assert idx.equals(idx2) - - with 
tm.assertRaisesRegexp(ValueError, 'both left and right sides'): - pd.IntervalIndex([np.nan, 0, 1], [0, 1, 2]) - - self.assert_numpy_array_equal(pd.isnull(idx), [True, False, False]) - - def test_order(self): - expected = IntervalIndex.from_breaks([1, 2, 3, 4]) - actual = IntervalIndex.from_tuples([(3, 4), (1, 2), (2, 3)]).order() - self.assert_numpy_array_equal(expected, actual) - - def test_datetime(self): - dates = pd.date_range('2000', periods=3) - idx = IntervalIndex.from_breaks(dates) - - self.assert_numpy_array_equal(idx.left, dates[:2]) - self.assert_numpy_array_equal(idx.right, dates[-2:]) - - expected = pd.date_range('2000-01-01T12:00', periods=2) - self.assert_numpy_array_equal(idx.mid, expected) - - self.assertIn('2000-01-01T12', idx) - - target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') - actual = idx.get_indexer(target) - expected = [-1, -1, 0, 0, 1, 1, -1] - self.assert_numpy_array_equal(actual, expected) - - # def test_math(self): - # # add, subtract, multiply, divide with scalars should be OK - # actual = 2 * self.index + 1 - # expected = IntervalIndex.from_breaks((2 * np.arange(3) + 1)) - # self.assertTrue(expected.equals(actual)) - - # actual = self.index / 2.0 - 1 - # expected = IntervalIndex.from_breaks((np.arange(3) / 2.0 - 1)) - # self.assertTrue(expected.equals(actual)) - - # with self.assertRaises(TypeError): - # # doesn't make sense to add two IntervalIndex objects - # self.index + self.index - - # def test_datetime_math(self): - - # expected = IntervalIndex(pd.date_range('2000-01-02', periods=3)) - # actual = idx + pd.to_timedelta(1, unit='D') - # self.assertTrue(expected.equals(actual)) - - # TODO: other set operations (left join, right join, intersection), - # set operations with conflicting IntervalIndex objects or other dtypes, - # groupby, cut, reset_index... 
diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 2d657c14b73a6..e0a625bbf29c2 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,23 +3,20 @@ import numpy as np from pandas.compat import zip -from pandas import DataFrame, Series, Index, unique, isnull, Categorical +from pandas import (Series, Index, isnull, + to_datetime, DatetimeIndex, Timestamp, + Interval, IntervalIndex, Categorical, + cut, qcut, date_range) import pandas.util.testing as tm -from pandas.util.testing import assertRaisesRegexp -import pandas.core.common as com from pandas.core.algorithms import quantile -from pandas.core.categorical import Categorical -from pandas.core.interval import Interval, IntervalIndex -from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod -from pandas import to_datetime, DatetimeIndex, Timestamp class TestCut(tm.TestCase): def test_simple(self): - data = np.ones(5) + data = np.ones(5, dtype='int64') result = cut(data, 4, labels=False) expected = np.array([1, 1, 1, 1, 1]) tm.assert_numpy_array_equal(result, expected, @@ -30,29 +27,37 @@ def test_bins(self): result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 0])) - tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + expected = intervals.astype('category').take([0, 0, 0, 2, 3, 0, 0]) + 
tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, + 7.325, 9.7])) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed='left') - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 1])) - tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + expected = intervals.take([0, 0, 0, 2, 3, 0, 1]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, + 7.325, 9.7095])) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] @@ -81,12 +86,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1]) - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3], closed='left') - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -98,8 +103,9 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = 
IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72]) - self.assert_numpy_array_equal(unique(result), ex_levels) + ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, + 0.54, 0.72]) + tm.assert_index_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -109,12 +115,12 @@ def test_na_handling(self): result_arr = np.asarray(result) - ex_arr = np.where(com.isnull(arr), np.nan, result_arr) + ex_arr = np.where(isnull(arr), np.nan, result_arr) tm.assert_almost_equal(result_arr, ex_arr) result = cut(arr, 4, labels=False) - ex_result = np.where(com.isnull(arr), np.nan, result) + ex_result = np.where(isnull(arr), np.nan, result) tm.assert_almost_equal(result, ex_result) def test_inf_handling(self): @@ -125,8 +131,8 @@ def test_inf_handling(self): result = cut(data, bins) result_ser = cut(data_ser, bins) - ex_uniques = IntervalIndex.from_breaks(bins).values - tm.assert_numpy_array_equal(unique(result), ex_uniques) + ex_uniques = IntervalIndex.from_breaks(bins) + tm.assert_index_equal(result.categories, ex_uniques) self.assertEqual(result[5], Interval(4, np.inf)) self.assertEqual(result[0], Interval(-np.inf, 2)) self.assertEqual(result_ser[5], Interval(4, np.inf)) @@ -135,12 +141,17 @@ def test_inf_handling(self): def test_qcut(self): arr = np.random.randn(1000) + # we store the bins as Index that have been rounded + # to comparisions are a bit tricky labels, bins = qcut(arr, 4, retbins=True) ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) - tm.assert_almost_equal(bins, ex_bins) + result = labels.categories.left.values + self.assertTrue(np.allclose(result, ex_bins[:-1], atol=1e-2)) + result = labels.categories.right.values + self.assertTrue(np.allclose(result, ex_bins[1:], atol=1e-2)) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_categorical_equal(labels, ex_levels) + tm.assert_categorical_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -153,11 +164,11 @@ def 
test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - self.assert_numpy_array_equal(factor, expected) + tm.assert_categorical_equal(factor, expected) def test_qcut_all_bins_same(self): - assertRaisesRegexp(ValueError, "edges.*unique", qcut, - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) + tm.assertRaisesRegexp(ValueError, "edges.*unique", qcut, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) def test_cut_out_of_bounds(self): arr = np.random.randn(100) @@ -174,36 +185,39 @@ def test_cut_pass_labels(self): labels = ['Small', 'Medium', 'Large'] result = cut(arr, bins, labels=labels) - exp = ['Medium'] + 4 * ['Small'] + ['Medium', 'Large'] - self.assert_numpy_array_equal(result, exp) + exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'], + ordered=True) + self.assert_categorical_equal(result, exp) - result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], labels)) + result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], + labels)) exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels) - self.assertTrue(result.equals(exp)) + self.assert_categorical_equal(result, exp) def test_qcut_include_lowest(self): values = np.arange(10) - cats = qcut(values, 4) + ii = qcut(values, 4) - ex_levels = [Interval(0, 2.25, closed='both'), Interval(2.25, 4.5), - Interval(4.5, 6.75), Interval(6.75, 9)] - self.assert_numpy_array_equal(unique(cats), ex_levels) + ex_levels = IntervalIndex.from_intervals( + [Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9)]) + tm.assert_index_equal(ii.categories, ex_levels) def test_qcut_nas(self): arr = np.random.randn(100) arr[:20] = np.nan result = qcut(arr, 4) - self.assertTrue(com.isnull(result[:20]).all()) + self.assertTrue(isnull(result[:20]).all()) def test_qcut_index(self): - # the result is closed on a different side for the first interval, but - # we should still be able to make an index result = qcut([0, 2], 2) - index = Index(result) - 
expected = Index([Interval(0, 1, closed='both'), Interval(1, 2)]) - self.assert_numpy_array_equal(index, expected) + expected = Index([Interval(-0.001, 1), Interval(1, 2)]).astype( + 'category') + self.assert_categorical_equal(result, expected) def test_round_frac(self): # it works @@ -247,41 +261,46 @@ def test_qcut_binning_issues(self): self.assertTrue(ep <= sn) def test_cut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = cut(s,3) + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = cut(s, 3) exp_bins = np.linspace(0, 8, num=4).round(3) exp_bins[0] -= 0.008 - exp = Series(IntervalIndex.from_breaks(exp_bins).take([0,0,0,1,1,1,2,2,2])) + exp = Series(IntervalIndex.from_breaks(exp_bins, closed='right').take( + [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype('category', ordered=True) tm.assert_series_equal(res, exp) def test_qcut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = qcut(s,[0,0.333,0.666,1]) - exp_levels = np.array([Interval(0, 2.664, closed='both'), + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(s, [0, 0.333, 0.666, 1]) + exp_levels = np.array([Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]) - exp = Series(exp_levels.take([0,0,0,1,1,1,2,2,2])) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + 'category', ordered=True) tm.assert_series_equal(res, exp) def test_series_retbins(self): # GH 8589 s = Series(np.arange(4)) result, bins = cut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([-0.003, 1.5, 3])) + expected = Series(IntervalIndex.from_breaks( + [-0.003, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) result, bins = qcut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + expected = 
Series(IntervalIndex.from_breaks( + [-0.001, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) def test_qcut_duplicates_bin(self): # GH 7751 values = [0, 0, 0, 0, 1, 2, 3] - result_levels = ['[0, 1]', '(1, 3]'] + expected = IntervalIndex.from_intervals([Interval(-0.001, 1), + Interval(1, 3)]) - cats = qcut(values, 3, duplicates='drop') - self.assertTrue((cats.categories == result_levels).all()) + result = qcut(values, 3, duplicates='drop') + tm.assert_index_equal(result.categories, expected) self.assertRaises(ValueError, qcut, values, 3) self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') @@ -297,51 +316,57 @@ def test_single_quantile(self): result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[9, 9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(8.999, 9.0), + Interval(8.999, 9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([-9., -9.]) + expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[-9, -9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-9.001, -9.0), + Interval(-9.001, -9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([0., 0.]) + expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[0, 0]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) - - expected = Series([0]) + intervals = IntervalIndex([Interval(-0.001, 0.0), + Interval(-0.001, 0.0)], 
closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([9]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[9, 9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(8.999, 9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([-9]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[-9, -9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-9.001, -9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([0]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[0, 0]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-0.001, 0.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) def test_single_bin(self): # issue 14652 @@ -382,11 +407,18 @@ def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + Series(IntervalIndex.from_intervals([ + Interval(Timestamp('2012-12-31 23:57:07.200000'), + 
Timestamp('2013-01-01 16:00:00')), + Interval(Timestamp('2013-01-01 16:00:00'), + Timestamp('2013-01-02 08:00:00')), + Interval(Timestamp('2013-01-02 08:00:00'), + Timestamp('2013-01-03 00:00:00'))])) + .astype('category', ordered=True)) + tm.assert_series_equal(result, expected) # testing for time data to be present as list @@ -410,9 +442,11 @@ def test_datetime_cut(self): def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] - expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]', - '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + Series(IntervalIndex.from_intervals([ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])) + .astype('category', ordered=True)) for conv in [Timestamp, Timestamp, np.datetime64]: bins = [conv(v) for v in bin_data] @@ -427,13 +461,19 @@ def test_datetime_bin(self): result = cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected) - result, bins = cut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + def test_datetime_nan(self): + + def f(): + cut(date_range('20130101', periods=3), bins=[0, 2, 4]) + self.assertRaises(ValueError, f) - result, bins = qcut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [0, 1.5, 3]) + result = cut(date_range('20130102', periods=5), + bins=date_range('20130101', periods=2)) + mask = result.categories.isnull() + self.assert_numpy_array_equal(mask, np.array([False])) + mask = result.isnull() + self.assert_numpy_array_equal( + mask, np.array([False, True, True, True, True])) def curpath(): diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index e7b2edeb57714..79d9fd84396e7 100644 
--- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -3,14 +3,15 @@ import numpy as np import pandas as pd -from pandas import Series, Categorical, date_range +from pandas import Series, Categorical, IntervalIndex, date_range -from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.dtypes import (DatetimeTZDtype, PeriodDtype, + IntervalDtype, CategoricalDtype) from pandas.types.common import (is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype, + is_datetime64_dtype, is_interval_dtype, is_datetime64_any_dtype, is_string_dtype, _coerce_to_dtype) import pandas.util.testing as tm @@ -351,3 +352,114 @@ def test_empty(self): def test_not_string(self): # though PeriodDtype has object kind, it cannot be string self.assertFalse(is_string_dtype(PeriodDtype('D'))) + + +class TestIntervalDtype(Base, tm.TestCase): + + # TODO: placeholder + def setUp(self): + self.dtype = IntervalDtype('int64') + + def test_construction(self): + with tm.assertRaises(ValueError): + IntervalDtype('xx') + + for s in ['interval[int64]', 'Interval[int64]', 'int64']: + i = IntervalDtype(s) + self.assertEqual(i.subtype, np.dtype('int64')) + self.assertTrue(is_interval_dtype(i)) + + def test_construction_generic(self): + # generic + i = IntervalDtype('interval') + self.assertIs(i.subtype, None) + self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + i = IntervalDtype() + self.assertIs(i.subtype, None) + self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + def test_subclass(self): + a = IntervalDtype('interval[int64]') + b = IntervalDtype('interval[int64]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_is_dtype(self): + self.assertTrue(IntervalDtype.is_dtype(self.dtype)) + 
self.assertTrue(IntervalDtype.is_dtype('interval')) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('float64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('int64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype(np.int64))) + + self.assertFalse(IntervalDtype.is_dtype('D')) + self.assertFalse(IntervalDtype.is_dtype('3D')) + self.assertFalse(IntervalDtype.is_dtype('U')) + self.assertFalse(IntervalDtype.is_dtype('S')) + self.assertFalse(IntervalDtype.is_dtype('foo')) + self.assertFalse(IntervalDtype.is_dtype(np.object_)) + self.assertFalse(IntervalDtype.is_dtype(np.int64)) + self.assertFalse(IntervalDtype.is_dtype(np.float64)) + + def test_identity(self): + self.assertEqual(IntervalDtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_coerce_to_dtype(self): + self.assertEqual(_coerce_to_dtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_construction_from_string(self): + result = IntervalDtype('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = IntervalDtype.construct_from_string('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('interval[foo]') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo[int64]') + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'interval[int64]')) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('int64'))) + + self.assertFalse(is_dtype_equal(self.dtype, 'int64')) + self.assertFalse(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('float64'))) + + def test_basic(self): + self.assertTrue(is_interval_dtype(self.dtype)) + + ii = 
IntervalIndex.from_breaks(range(3)) + + self.assertTrue(is_interval_dtype(ii.dtype)) + self.assertTrue(is_interval_dtype(ii)) + + s = Series(ii, name='A') + + # dtypes + # series results in object dtype currently, + self.assertFalse(is_interval_dtype(s.dtype)) + self.assertFalse(is_interval_dtype(s)) + + def test_basic_dtype(self): + self.assertTrue(is_interval_dtype('interval[int64]')) + self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))) + self.assertTrue(is_interval_dtype + (IntervalIndex.from_breaks(np.arange(4)))) + self.assertTrue(is_interval_dtype( + IntervalIndex.from_breaks(date_range('20130101', periods=3)))) + self.assertFalse(is_interval_dtype('U')) + self.assertFalse(is_interval_dtype('S')) + self.assertFalse(is_interval_dtype('foo')) + self.assertFalse(is_interval_dtype(np.object_)) + self.assertFalse(is_interval_dtype(np.int64)) + self.assertFalse(is_interval_dtype(np.float64)) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index efd6dda02ab4b..31bf2817c8bab 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -55,6 +55,14 @@ def test_0d_array(self): self.assertFalse(isnull(np.array(0.0, dtype=object))) self.assertFalse(isnull(np.array(0, dtype=object))) + def test_empty_object(self): + + for shape in [(4, 0), (4,)]: + arr = np.empty(shape=shape, dtype=object) + result = isnull(arr) + expected = np.ones(shape=shape, dtype=bool) + tm.assert_numpy_array_equal(result, expected) + def test_isnull(self): self.assertFalse(isnull(1.)) self.assertTrue(isnull(None)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index f1ca7ff4b19ba..c6b1ee417c64d 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -3,24 +3,22 @@ """ from pandas.types.missing import isnull -from pandas.types.common import (is_float, is_integer, - is_scalar, _ensure_int64) +from pandas.types.common import (is_integer, + is_scalar, + is_categorical_dtype, + is_datetime64_dtype, 
+ is_timedelta64_dtype, + _ensure_int64) -from pandas.core.api import Series -from pandas.core.categorical import Categorical -from pandas.core.index import _ensure_index -from pandas.core.interval import IntervalIndex, Interval import pandas.core.algorithms as algos import pandas.core.nanops as nanops -from pandas.compat import zip -from pandas import to_timedelta, to_datetime -from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype from pandas._libs.lib import infer_dtype +from pandas import (to_timedelta, to_datetime, + Categorical, Timestamp, Timedelta, + Series, Interval, IntervalIndex) import numpy as np -import warnings - def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): @@ -97,7 +95,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - # TODO: IntervalIndex try: # for array-like sz = x.size except AttributeError: @@ -124,13 +121,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins) + bins = _convert_bin_to_numeric_type(bins, dtype) if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, precision=precision, - include_lowest=include_lowest, dtype=dtype) + include_lowest=include_lowest, + dtype=dtype) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -154,8 +152,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): the resulting bins. If False, return only integer indicators of the bins. retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given - as a scalar. + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. 
precision : int, optional The precision at which to store and display the bins labels duplicates : {default 'raise', 'drop'}, optional @@ -232,42 +230,18 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: - - # TODO: IntervalIndex - increases = 0 - while True: - try: - levels = _format_levels(bins, precision, right=right, - include_lowest=include_lowest, - dtype=dtype) - except ValueError: - increases += 1 - precision += 1 - if increases >= 20: - raise - else: - break - - # - #closed = 'right' if right else 'left' - #precision = _infer_precision(precision, bins) - #breaks = [_round_frac(b, precision) for b in bins] - #labels = IntervalIndex.from_breaks(breaks, closed=closed).values - - #if right and include_lowest: - # labels[0] = Interval(labels[0].left, labels[0].right, - # closed='both') - + labels = _format_labels(bins, precision, right=right, + include_lowest=include_lowest, + dtype=dtype) else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') - - if not com.is_categorical(labels): - labels = np.asarray(labels) + if not is_categorical_dtype(labels): + labels = Categorical(labels, ordered=True) np.putmask(ids, na_mask, 0) - result = com.take_nd(labels, ids - 1) + result = algos.take_nd(labels, ids - 1) else: result = ids - 1 @@ -277,42 +251,6 @@ def _bins_to_cuts(x, bins, right=True, labels=None, return result, bins -def _format_levels(bins, prec, right=True, - include_lowest=False, dtype=None): - fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) - if right: - levels = [] - for a, b in zip(bins, bins[1:]): - fa, fb = fmt(a), fmt(b) - -def _round_frac(x, precision): - """Round the fractional part of the given number - """ - if not np.isfinite(x) or x == 0: - return x - else: - levels = ['[%s, %s)' % (fmt(a), fmt(b)) - for a, b in zip(bins, bins[1:])] - return levels - - -def _format_label(x, precision=3, dtype=None): - fmt_str = '%%.%dg' % 
precision - - if is_datetime64_dtype(dtype): - return to_datetime(x, unit='ns') - if is_timedelta64_dtype(dtype): - return to_timedelta(x, unit='ns') - if np.isinf(x): - return str(x) - elif is_float(x): - frac, whole = np.modf(x) - if whole == 0: - digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision - else: - digits = precision - return np.around(x, digits) - def _trim_zeros(x): while len(x) > 1 and x[-1] == '0': @@ -340,17 +278,65 @@ def _coerce_to_type(x): return x, dtype -def _convert_bin_to_numeric_type(x): +def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer + + Parameters + ---------- + bins : list-liek of bins + dtype : dtype of data + + Raises + ------ + ValueError if bins are not of a compat dtype to dtype """ - dtype = infer_dtype(x) - if dtype == 'timedelta' or dtype == 'timedelta64': - x = to_timedelta(x).view(np.int64) - elif dtype == 'datetime' or dtype == 'datetime64': - x = to_datetime(x).view(np.int64) - return x + bins_dtype = infer_dtype(bins) + if is_timedelta64_dtype(dtype): + if bins_dtype in ['timedelta', 'timedelta64']: + bins = to_timedelta(bins).view(np.int64) + else: + raise ValueError("bins must be of timedelta64 dtype") + elif is_datetime64_dtype(dtype): + if bins_dtype in ['datetime', 'datetime64']: + bins = to_datetime(bins).view(np.int64) + else: + raise ValueError("bins must be of datetime64 dtype") + + return bins + + +def _format_labels(bins, precision, right=True, + include_lowest=False, dtype=None): + """ based on the dtype, return our labels """ + + closed = 'right' if right else 'left' + + if is_datetime64_dtype(dtype): + formatter = Timestamp + adjust = lambda x: x - Timedelta('1ns') + elif is_timedelta64_dtype(dtype): + formatter = Timedelta + adjust = lambda x: x - Timedelta('1ns') + else: + precision = _infer_precision(precision, bins) + formatter = lambda x: _round_frac(x, precision) + adjust = lambda x: x - 10 ** (-precision) + + 
breaks = [formatter(b) for b in bins] + labels = IntervalIndex.from_breaks(breaks, closed=closed) + + if right and include_lowest: + # we will adjust the left hand side by precision to + # account that we are all right closed + v = adjust(labels[0].left) + + i = IntervalIndex.from_intervals( + [Interval(v, labels[0].right, closed='right')]) + labels = i.append(labels[1:]) + + return labels def _preprocess_for_cut(x): @@ -372,7 +358,8 @@ def _preprocess_for_cut(x): return x_is_series, series_index, name, x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): +def _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -386,6 +373,22 @@ def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): return fac, bins + +def _round_frac(x, precision): + """ + Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x + else: + frac, whole = np.modf(x) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision + else: + digits = precision + return np.around(x, digits) + + def _infer_precision(base_precision, bins): """Infer an appropriate precision for _round_frac """ diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ae40c2f66a590..fe7005418b362 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -31,6 +31,9 @@ import pandas.types.concat as _concat import pandas.tseries.frequencies as frequencies +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + class DatelikeOps(object): """ common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex """ @@ -242,6 +245,7 @@ def _box_values(self, values): def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) 
def __contains__(self, key): try: res = self.get_loc(key) @@ -249,6 +253,8 @@ def __contains__(self, key): except (KeyError, TypeError, ValueError): return False + _is_contained_in = __contains__ + def __getitem__(self, key): """ This getitem defers to the underlying array, which by-definition can @@ -381,7 +387,7 @@ def sort_values(self, return_indexer=False, ascending=True): return self._simple_new(sorted_values, **attribs) - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -798,7 +804,7 @@ def repeat(self, repeats, *args, **kwargs): return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): other = _ensure_datetimelike_to_i8(other) values = _ensure_datetimelike_to_i8(self) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py deleted file mode 100644 index 22801318a1853..0000000000000 --- a/pandas/tseries/interval.py +++ /dev/null @@ -1,35 +0,0 @@ - -from pandas.core.index import Index - - -class Interval(object): - """ - Represents an interval of time defined by two timestamps - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class PeriodInterval(object): - """ - Represents an interval of time defined by two Period objects (time - ordinals) - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class IntervalIndex(Index): - """ - - """ - - def __new__(self, starts, ends): - pass - - def dtype(self): - return self.values.dtype diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 1e1496bbe9c27..30ebc4da459ff 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -347,6 +347,7 @@ def _coerce_scalar_to_index(self, item): """ return PeriodIndex([item], 
**self._get_attributes_dict()) + @Appender(_index_shared_docs['__contains__']) def __contains__(self, key): if isinstance(key, Period): if key.freq != self.freq: @@ -361,6 +362,8 @@ def __contains__(self, key): return False return False + _is_contained_in = __contains__ + @property def asi8(self): return self._values.view('i8') diff --git a/pandas/types/api.py b/pandas/types/api.py index e78514ce77822..6dbd3dc6b640c 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -10,6 +10,10 @@ is_categorical, is_categorical_dtype, + # interval + is_interval, + is_interval_dtype, + # datetimelike is_datetimetz, is_datetime64_dtype, diff --git a/pandas/types/common.py b/pandas/types/common.py index 7ab2e068ac69f..0b14e484d40a7 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -7,6 +7,7 @@ from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, + IntervalDtype, IntervalDtypeType, ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, @@ -139,6 +140,10 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) +def is_interval_dtype(arr_or_dtype): + return IntervalDtype.is_dtype(arr_or_dtype) + + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) @@ -501,6 +506,8 @@ def _coerce_to_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): dtype = PeriodDtype(dtype) + elif is_interval_dtype(dtype): + dtype = IntervalDtype(dtype) else: dtype = np.dtype(dtype) return dtype @@ -538,6 +545,8 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, PeriodDtype): return arr_or_dtype + elif isinstance(arr_or_dtype, IntervalDtype): + return arr_or_dtype elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) @@ -545,6 +554,8 @@ def _get_dtype(arr_or_dtype): return 
DatetimeTZDtype.construct_from_string(arr_or_dtype) elif is_period_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) + elif is_interval_dtype(arr_or_dtype): + return IntervalDtype.construct_from_string(arr_or_dtype) if hasattr(arr_or_dtype, 'dtype'): arr_or_dtype = arr_or_dtype.dtype @@ -575,6 +586,8 @@ def _get_dtype_type(arr_or_dtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, IntervalDtype): + return IntervalDtypeType elif isinstance(arr_or_dtype, PeriodDtype): return PeriodDtypeType elif isinstance(arr_or_dtype, string_types): @@ -584,6 +597,8 @@ def _get_dtype_type(arr_or_dtype): return DatetimeTZDtypeType elif is_period_dtype(arr_or_dtype): return PeriodDtypeType + elif is_interval_dtype(arr_or_dtype): + return IntervalDtypeType return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type @@ -695,6 +710,8 @@ def pandas_dtype(dtype): return dtype elif isinstance(dtype, CategoricalDtype): return dtype + elif isinstance(dtype, IntervalDtype): + return dtype elif isinstance(dtype, string_types): try: return DatetimeTZDtype.construct_from_string(dtype) @@ -708,6 +725,12 @@ def pandas_dtype(dtype): except TypeError: pass + elif dtype.startswith('interval[') or dtype.startswith('Interval['): + try: + return IntervalDtype.construct_from_string(dtype) + except TypeError: + pass + try: return CategoricalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index c3494df93476b..7913950a597c9 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -367,3 +367,112 @@ def is_dtype(cls, dtype): else: return False return super(PeriodDtype, cls).is_dtype(dtype) + + +class IntervalDtypeType(type): + """ + the type of IntervalDtype, this metaclass determines subclass ability + """ + pass + + +class IntervalDtype(ExtensionDtype): + __metaclass__ = IntervalDtypeType + 
""" + A Interval duck-typed class, suitable for holding an interval + + THIS IS NOT A REAL NUMPY DTYPE + """ + type = IntervalDtypeType + kind = None + str = '|O08' + base = np.dtype('O') + num = 103 + _metadata = ['subtype'] + _match = re.compile("(I|i)nterval\[(?P.+)\]") + _cache = {} + + def __new__(cls, subtype=None): + """ + Parameters + ---------- + subtype : the dtype of the Interval + """ + + if isinstance(subtype, IntervalDtype): + return subtype + elif subtype is None or (isinstance(subtype, compat.string_types) and + subtype == 'interval'): + subtype = None + else: + if isinstance(subtype, compat.string_types): + m = cls._match.search(subtype) + if m is not None: + subtype = m.group('subtype') + + from pandas.types.common import pandas_dtype + try: + subtype = pandas_dtype(subtype) + except TypeError: + raise ValueError("could not construct IntervalDtype") + + try: + return cls._cache[str(subtype)] + except KeyError: + u = object.__new__(cls) + u.subtype = subtype + cls._cache[str(subtype)] = u + return u + + @classmethod + def construct_from_string(cls, string): + """ + attempt to construct this type from a string, raise a TypeError + if its not possible + """ + if isinstance(string, compat.string_types): + try: + return cls(string) + except ValueError: + pass + raise TypeError("could not construct IntervalDtype") + + def __unicode__(self): + if self.subtype is None: + return "interval" + return "interval[{subtype}]".format(subtype=self.subtype) + + @property + def name(self): + return str(self) + + def __hash__(self): + # make myself hashable + return hash(str(self)) + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == self.name or other == self.name.title() + + return (isinstance(other, IntervalDtype) and + self.subtype == other.subtype) + + @classmethod + def is_dtype(cls, dtype): + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + + if 
isinstance(dtype, compat.string_types): + if dtype.lower().startswith('interval'): + try: + if cls.construct_from_string(dtype) is not None: + return True + else: + return False + except ValueError: + return False + else: + return False + return super(IntervalDtype, cls).is_dtype(dtype) diff --git a/pandas/types/generic.py b/pandas/types/generic.py index e7b54ccc6f25e..90608c18ae503 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -32,12 +32,14 @@ def _check(cls, inst): ("periodindex", )) ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex", )) +ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ", + ("intervalindex", )) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", "rangeindex", "float64index", "uint64index", "multiindex", "datetimeindex", "timedeltaindex", "periodindex", - "categoricalindex")) + "categoricalindex", "intervalindex")) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) diff --git a/pandas/types/inference.py b/pandas/types/inference.py index 91418677c6b19..b0a93d24228af 100644 --- a/pandas/types/inference.py +++ b/pandas/types/inference.py @@ -20,6 +20,8 @@ is_decimal = lib.is_decimal +is_interval = lib.is_interval + def is_number(obj): """ diff --git a/pandas/types/missing.py b/pandas/types/missing.py index ea49af9884f5a..af3a873bc2866 100644 --- a/pandas/types/missing.py +++ b/pandas/types/missing.py @@ -9,7 +9,7 @@ from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, + is_timedelta64_dtype, is_interval_dtype, is_complex_dtype, is_categorical_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, @@ -127,6 +127,9 @@ def _isnull_ndarraylike(obj): if not isinstance(values, Categorical): values = 
values.values result = values.isnull() + elif is_interval_dtype(values): + from pandas import IntervalIndex + result = IntervalIndex(obj).isnull() else: # Working around NumPy ticket 1542 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 28214b1462cb7..c73cca56f975a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -29,6 +29,7 @@ is_number, is_bool, needs_i8_conversion, is_categorical_dtype, + is_interval_dtype, is_sequence, is_list_like) from pandas.formats.printing import pprint_thing @@ -945,6 +946,9 @@ def _get_ilevel_values(index, level): assert_attr_equal('names', left, right, obj=obj) if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): assert_attr_equal('freq', left, right, obj=obj) + if (isinstance(left, pd.IntervalIndex) or + isinstance(right, pd.IntervalIndex)): + assert_attr_equal('closed', left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): @@ -1309,6 +1313,12 @@ def assert_series_equal(left, right, check_dtype=True, else: assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) + elif is_interval_dtype(left) or is_interval_dtype(right): + # TODO: big hack here + l = pd.IntervalIndex(left) + r = pd.IntervalIndex(right) + assert_index_equal(l, r, obj='{0}.index'.format(obj)) + else: libtesting.assert_almost_equal(left.get_values(), right.get_values(), check_less_precise=check_less_precise, @@ -1694,6 +1704,7 @@ def makeIntervalIndex(k=10, name=None): x = np.linspace(0, 100, num=(k + 1)) return IntervalIndex.from_breaks(x, name=name) + def makeBoolIndex(k=10, name=None): if k == 1: return Index([True], name=name) diff --git a/setup.py b/setup.py index 96b25f7427370..6707af7eb0908 100755 --- a/setup.py +++ b/setup.py @@ -119,6 +119,7 @@ def is_platform_mac(): '_libs/hashtable_func_helper.pxi.in'], 'index': ['_libs/index_class_helper.pxi.in'], 'sparse': ['sparse/sparse_op_helper.pxi.in'], + 'interval': 
['_libs/intervaltree.pxi.in'] } _pxifiles = [] @@ -335,6 +336,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/index.pyx', 'pandas/_libs/algos.pyx', 'pandas/_libs/join.pyx', + 'pandas/_libs/interval.pyx', 'pandas/core/window.pyx', 'pandas/sparse/sparse.pyx', 'pandas/util/testing.pyx', @@ -508,6 +510,9 @@ def pxd(name): 'depends': _pxi_dep['join']}, '_libs.reshape': {'pyxfile': '_libs/reshape', 'depends': _pxi_dep['reshape']}, + '_libs.interval': {'pyxfile': '_libs/interval', + 'pxdfiles': ['_libs/hashtable'], + 'depends': _pxi_dep['interval']}, 'core.libwindow': {'pyxfile': 'core/window', 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], 'depends': ['pandas/_libs/src/skiplist.pyx', From 4a5ebea912bdea5eb9eb886ed98a128542d4a532 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 20 Mar 2017 20:05:20 -0400 Subject: [PATCH 03/12] more tests & fixes for non-unique / overlaps rename _is_contained_in -> contains add sorting test --- pandas/core/indexing.py | 4 +- pandas/indexes/base.py | 6 +- pandas/indexes/category.py | 6 +- pandas/indexes/interval.py | 107 ++++++++++++-- pandas/indexes/multi.py | 2 +- pandas/tests/indexes/test_interval.py | 27 ++-- pandas/tests/indexing/test_interval.py | 196 +++++++++++++++++++------ pandas/tests/series/test_sorting.py | 19 ++- pandas/tseries/base.py | 2 +- pandas/tseries/period.py | 2 +- 10 files changed, 284 insertions(+), 87 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c9ff26d135f58..dd8fa2d3ddc81 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1429,7 +1429,7 @@ def error(): try: key = self._convert_scalar_indexer(key, axis) - if not ax._is_contained_in(key): + if not ax.contains(key): error() except TypeError as e: @@ -1897,7 +1897,7 @@ def convert_to_index_sliceable(obj, key): elif isinstance(key, compat.string_types): # we are an actual column - if obj._data.items._is_contained_in(key): + if obj._data.items.contains(key): return None # We might have a 
datetimelike string that we can translate to a diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c0635f07238b5..00ad4ca71cb9d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1585,7 +1585,7 @@ def __contains__(self, key): except TypeError: return False - _index_shared_docs['_is_contained_in'] = """ + _index_shared_docs['contains'] = """ return a boolean if this key is IN the index Parameters @@ -1597,8 +1597,8 @@ def __contains__(self, key): boolean """ - @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) - def _is_contained_in(self, key): + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): hash(key) try: return key in self._engine diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 4800375cd5d38..6c57b2ed83705 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -271,12 +271,12 @@ def __contains__(self, key): return key in self.values - @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) - def _is_contained_in(self, key): + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): hash(key) if self.categories._defer_to_indexing: - return self.categories._is_contained_in(key) + return self.categories.contains(key) return key in self.values diff --git a/pandas/indexes/interval.py b/pandas/indexes/interval.py index 127655972e7f2..63315ef861d12 100644 --- a/pandas/indexes/interval.py +++ b/pandas/indexes/interval.py @@ -263,7 +263,7 @@ def __contains__(self, key): except KeyError: return False - def _is_contained_in(self, key): + def contains(self, key): """ return a boolean if this key is IN the index @@ -566,8 +566,11 @@ def _convert_list_indexer(self, keyarr, kind=None): indexer for matching intervals. 
""" locs = self.get_indexer_for(keyarr) - check = locs == -1 - locs = locs[~check] + + # we have missing values + if (locs == -1).any(): + raise KeyError + return locs def _maybe_cast_indexed(self, key): @@ -575,11 +578,19 @@ def _maybe_cast_indexed(self, key): we need to cast the key, which could be a scalar or an array-like to the type of our subtype """ - if is_float_dtype(self.dtype.subtype): + if isinstance(key, IntervalIndex): + return key + + subtype = self.dtype.subtype + if is_float_dtype(subtype): if is_integer(key): key = float(key) elif isinstance(key, (np.ndarray, Index)): key = key.astype('float64') + elif is_integer_dtype(subtype): + if is_integer(key): + key = int(key) + return key def _check_method(self, method): @@ -616,6 +627,11 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): def _get_loc_only_exact_matches(self, key): if isinstance(key, Interval): + + if not self.is_unique: + raise ValueError("cannot index with a slice Interval" + " and a non-unique index") + # TODO: this expands to a tuple index, see if we can # do better return Index(self._multiindex.values).get_loc(key) @@ -685,12 +701,28 @@ def get_value(self, series, key): loc = key elif is_list_like(key): loc = self.get_indexer(key) + elif isinstance(key, slice): + + if not (key.step is None or key.step == 1): + raise ValueError("cannot support not-default " + "step in a slice") + + try: + loc = self.get_loc(key) + except TypeError: + + # we didn't find exact intervals + # or are non-unique + raise ValueError("unable to slice with " + "this key: {}".format(key)) + else: loc = self.get_loc(key) return series.iloc[loc] @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): + self._check_method(method) target = _ensure_index(target) target = self._maybe_cast_indexed(target) @@ -706,7 +738,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return 
np.where(start_plus_one == stop, start, -1) if not self.is_unique: - raise ValueError("get_indexer cannot handle non-unique indices") + raise ValueError("cannot handle non-unique indices") + + # IntervalIndex + if isinstance(target, IntervalIndex): + indexer = self._get_reindexer(target) + + # non IntervalIndex + else: + indexer = np.concatenate([self.get_loc(i) for i in target]) + + return _ensure_platform_int(indexer) + + def _get_reindexer(self, target): + """ + Return an indexer for a target IntervalIndex with self + """ # find the left and right indexers lindexer = self._engine.get_indexer(target.left.values) @@ -720,27 +767,59 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer = [] n = len(self) - for l, r in zip(lindexer, rindexer): + for i, (l, r) in enumerate(zip(lindexer, rindexer)): + + target_value = target[i] + + # matching on the lhs bound + if (l != -1 and + self.closed == 'right' and + target_value.left == self[l].right): + l += 1 + + # matching on the lhs bound + if (r != -1 and + self.closed == 'left' and + target_value.right == self[r].left): + r -= 1 # not found if l == -1 and r == -1: indexer.append(np.array([-1])) elif r == -1: + indexer.append(np.arange(l, n)) elif l == -1: - if r == 0: - indexer.append(np.array([-1])) - else: - indexer.append(np.arange(0, r + 1)) - else: - indexer.append(np.arange(l, r)) + # care about left/right closed here + value = self[i] - indexer = np.concatenate(indexer) + # target.closed same as self.closed + if self.closed == target.closed: + if target_value.left < value.left: + indexer.append(np.array([-1])) + continue - return _ensure_platform_int(indexer) + # target.closed == 'left' + elif self.closed == 'right': + if target_value.left <= value.left: + indexer.append(np.array([-1])) + continue + + # target.closed == 'right' + elif self.closed == 'left': + if target_value.left <= value.left: + indexer.append(np.array([-1])) + continue + + indexer.append(np.arange(0, r + 1)) + + 
else: + indexer.append(np.arange(l, r + 1)) + + return np.concatenate(indexer) @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f51ed20379726..d1c8e0ba1cc4e 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1327,7 +1327,7 @@ def __contains__(self, key): except LookupError: return False - _is_contained_in = __contains__ + contains = __contains__ def __reduce__(self): """Necessary for making this object picklable""" diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 6771b875c5137..25ca961895ca3 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -430,13 +430,12 @@ def test_get_indexer(self): self.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(index) - expected = np.array([-1, 0], dtype='int64') + expected = np.array([-1, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) - @pytest.mark.xfail(reason="what to return for overlaps") def test_get_indexer_subintervals(self): - # TODO + # TODO: is this right? 
# return indexers for wholly contained subintervals target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) actual = self.index.get_indexer(target) @@ -445,7 +444,7 @@ def test_get_indexer_subintervals(self): target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) actual = self.index.get_indexer(target) - expected = np.array([-1, 0, 1], dtype='int64') + expected = np.array([0, 0, 1, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(target[[0, -1]]) @@ -473,22 +472,22 @@ def test_contains(self): self.assertNotIn(Interval(3, 5), i) self.assertNotIn(Interval(-1, 0, closed='left'), i) - def test_is_contained_in(self): + def testcontains(self): # can select values that are IN the range of a value i = IntervalIndex.from_arrays([0, 1], [1, 2]) - assert i._is_contained_in(0.1) - assert i._is_contained_in(0.5) - assert i._is_contained_in(1) - assert i._is_contained_in(Interval(0, 1)) - assert i._is_contained_in(Interval(0, 2)) + assert i.contains(0.1) + assert i.contains(0.5) + assert i.contains(1) + assert i.contains(Interval(0, 1)) + assert i.contains(Interval(0, 2)) # these overlaps completely - assert i._is_contained_in(Interval(0, 3)) - assert i._is_contained_in(Interval(1, 3)) + assert i.contains(Interval(0, 3)) + assert i.contains(Interval(1, 3)) - assert not i._is_contained_in(20) - assert not i._is_contained_in(-20) + assert not i.contains(20) + assert not i.contains(-20) def test_dropna(self): diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py index a5432be0da466..bccc21ed6c086 100644 --- a/pandas/tests/indexing/test_interval.py +++ b/pandas/tests/indexing/test_interval.py @@ -11,14 +11,19 @@ class TestIntervalIndex(tm.TestCase): def setUp(self): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) - def test_loc_getitem_series(self): + def test_loc_with_scalar(self): s = self.s expected = 0 - self.assertEqual(expected, s.loc[0.5]) - 
self.assertEqual(expected, s.loc[1]) - self.assertEqual(expected, s.loc[Interval(0, 1)]) - self.assertRaises(KeyError, s.loc.__getitem__, 0) + + result = s.loc[0.5] + assert result == expected + + result = s.loc[1] + assert result == expected + + with pytest.raises(KeyError): + s.loc[0] expected = s.iloc[:3] tm.assert_series_equal(expected, s.loc[:3]) @@ -34,39 +39,19 @@ def test_loc_getitem_series(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s.loc[s >= 2]) - expected = s.iloc[2:5] - result = s.loc[[pd.Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - expected = s.iloc[2:4] - result = s.loc[[pd.Interval(3, 5)]] - tm.assert_series_equal(expected, result) + def test_getitem_with_scalar(self): - expected = s.iloc[[2, 3, 4, 2, 3, 4]] - result = s.loc[[pd.Interval(3, 6), pd.Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - # slice of interval - with pytest.raises(NotImplementedError): - result = s.loc[pd.Interval(3, 6):] - - def test_loc_non_matching(self): s = self.s + expected = 0 - # TODO: We are getting at least 1 matching - # interval so this meets our current semantics - expected = s.iloc[[2, 3, 4]] - result = s.loc[[-1, 3, 4, 5]] - tm.assert_series_equal(expected, result) + result = s[0.5] + assert result == expected - def test_getitem_series(self): + result = s[1] + assert result == expected - s = self.s - expected = 0 - self.assertEqual(expected, s[0.5]) - self.assertEqual(expected, s[1]) - self.assertEqual(expected, s[Interval(0, 1)]) - self.assertRaises(KeyError, s.__getitem__, 0) + with pytest.raises(KeyError): + s[0] expected = s.iloc[:3] tm.assert_series_equal(expected, s[:3]) @@ -82,18 +67,142 @@ def test_getitem_series(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s >= 2]) - expected = s.iloc[2:5] - result = s[[pd.Interval(3, 6)]] + def test_with_interval(self): + + s = self.s + expected = 0 + + result = s.loc[Interval(0, 1)] + assert result == expected + + result = s[Interval(0, 1)] + 
assert result == expected + + expected = s.iloc[3:5] + result = s.loc[Interval(3, 6)] + tm.assert_series_equal(expected, result) + + expected = s.iloc[3:5] + result = s.loc[[Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + expected = s.iloc[3:5] + result = s.loc[[Interval(3, 5)]] tm.assert_series_equal(expected, result) + # missing + with pytest.raises(KeyError): + s.loc[Interval(-2, 0)] + + with pytest.raises(KeyError): + s[Interval(-2, 0)] + + with pytest.raises(KeyError): + s.loc[Interval(5, 6)] + + with pytest.raises(KeyError): + s[Interval(5, 6)] + + def test_with_slices(self): + + s = self.s + # slice of interval with pytest.raises(NotImplementedError): - result = s[pd.Interval(3, 6):] + result = s.loc[Interval(3, 6):] - # slice of scalar with pytest.raises(NotImplementedError): + result = s[Interval(3, 6):] + + expected = s.iloc[3:5] + result = s[[Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + # slice of scalar with step != 1 + with pytest.raises(ValueError): s[0:4:2] + def test_with_overlaps(self): + + s = self.s + expected = s.iloc[[3, 4, 3, 4]] + result = s.loc[[Interval(3, 6), Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + idx = IntervalIndex.from_tuples([(1, 5), (3, 7)]) + s = Series(range(len(idx)), index=idx) + + result = s[4] + expected = s + tm.assert_series_equal(expected, result) + + result = s[[4]] + expected = s + tm.assert_series_equal(expected, result) + + result = s.loc[[4]] + expected = s + tm.assert_series_equal(expected, result) + + result = s[Interval(3, 5)] + expected = s + tm.assert_series_equal(expected, result) + + result = s.loc[Interval(3, 5)] + expected = s + tm.assert_series_equal(expected, result) + + # doesn't intersect unique set of intervals + with pytest.raises(KeyError): + s[[Interval(3, 5)]] + + with pytest.raises(KeyError): + s.loc[[Interval(3, 5)]] + + def test_non_unique(self): + + idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) + + s = pd.Series(range(len(idx)), 
index=idx) + + result = s.loc[Interval(1, 3)] + assert result == 0 + + result = s.loc[[Interval(1, 3)]] + expected = s.iloc[0:1] + tm.assert_series_equal(expected, result) + + def test_non_unique_moar(self): + + idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)]) + s = Series(range(len(idx)), index=idx) + + result = s.loc[Interval(1, 3)] + expected = s.iloc[[0, 1]] + tm.assert_series_equal(expected, result) + + # non-unique index and slices not allowed + with pytest.raises(ValueError): + s.loc[Interval(1, 3):] + + with pytest.raises(ValueError): + s[Interval(1, 3):] + + # non-unique + with pytest.raises(ValueError): + s[[Interval(1, 3)]] + + def test_non_matching(self): + s = self.s + + # this is a departure from our current + # indexin scheme, but simpler + with pytest.raises(KeyError): + s.loc[[-1, 3, 4, 5]] + + with pytest.raises(KeyError): + s.loc[[-1, 3]] + def test_large_series(self): s = Series(np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001))) @@ -115,11 +224,9 @@ def test_loc_getitem_frame(self): expected = df.iloc[4:6] tm.assert_frame_equal(result, expected) - def f(): + with pytest.raises(KeyError): df.loc[10] - self.assertRaises(KeyError, f) - # single list-like result = df.loc[[4]] expected = df.iloc[4:6] @@ -130,12 +237,9 @@ def f(): expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - def f(): + with pytest.raises(KeyError): df.loc[[10]] - self.assertRaises(KeyError, f) - # partial missing - result = df.loc[[10, 4]] - expected = df.iloc[4:6] - tm.assert_frame_equal(result, expected) + with pytest.raises(KeyError): + df.loc[[10, 4]] diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 66ecba960ae0b..26c51ec976f74 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -3,9 +3,9 @@ import numpy as np import random -from pandas import (DataFrame, Series, MultiIndex) +from pandas import DataFrame, Series, MultiIndex, 
IntervalIndex -from pandas.util.testing import (assert_series_equal, assert_almost_equal) +from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm from .common import TestData @@ -177,3 +177,18 @@ def test_sort_index_na_position(self): expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan]) index_sorted_series = series.sort_index(na_position='last') assert_series_equal(expected_series_last, index_sorted_series) + + def test_sort_index_intervals(self): + s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( + [0, 1, 2, 3], + [1, 2, 3, 4])) + + result = s.sort_index() + expected = s + assert_series_equal(result, expected) + + result = s.sort_index(ascending=False) + expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays( + [3, 2, 1, 0], + [4, 3, 2, 1])) + assert_series_equal(result, expected) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index fe7005418b362..48d236177b474 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -253,7 +253,7 @@ def __contains__(self, key): except (KeyError, TypeError, ValueError): return False - _is_contained_in = __contains__ + contains = __contains__ def __getitem__(self, key): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 30ebc4da459ff..7f7b3286fd4f8 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -362,7 +362,7 @@ def __contains__(self, key): return False return False - _is_contained_in = __contains__ + contains = __contains__ @property def asi8(self): From e5f808224836bf92d57986ff70484844326a0f22 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 31 Mar 2017 12:57:49 -0400 Subject: [PATCH 04/12] allow pd.cut to take an IntervalIndex for bins --- pandas/tests/tools/test_tile.py | 12 ++++++++++++ pandas/tools/tile.py | 15 ++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 
e0a625bbf29c2..cfe4251891cf5 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -59,6 +59,18 @@ def test_arraylike(self): tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) + def test_bins_from_intervalindex(self): + c = cut(range(5), 3) + expected = c + result = cut(range(5), bins=expected.categories) + tm.assert_categorical_equal(result, expected) + + expected = Categorical.from_codes(np.append(c.codes, -1), + categories=c.categories, + ordered=True) + result = cut(range(6), bins=expected.categories) + tm.assert_categorical_equal(result, expected) + def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] self.assertRaises(ValueError, cut, data, [0.1, 1.5, 1, 10]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index c6b1ee417c64d..33c7b0481da4d 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -29,7 +29,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, ---------- x : array-like Input array to be binned. It has to be 1-dimensional. - bins : int or sequence of scalars + bins : int, sequence of scalars, or IntervalIndex If `bins` is an int, it defines the number of equal-width bins in the range of `x`. However, in this case, the range of `x` is extended by .1% on each side to include the min or max values of `x`. 
If @@ -78,10 +78,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, (6.533, 9.7], (0.191, 3.367]] Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"]) [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] + >>> pd.cut(np.ones(5), 4, labels=False) array([1, 1, 1, 1, 1], dtype=int64) """ @@ -119,6 +121,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins[-1] += adj + elif isinstance(bins, IntervalIndex): + pass else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) @@ -179,9 +183,11 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): >>> pd.qcut(range(5), 4) [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) [good, good, medium, bad, bad] Categories (3, object): [good < medium < bad] + >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ @@ -210,6 +216,13 @@ def _bins_to_cuts(x, bins, right=True, labels=None, raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") + if isinstance(bins, IntervalIndex): + # we have a fast-path here + ids = bins.get_indexer(x) + result = algos.take_nd(bins, ids) + result = Categorical(result, ordered=True) + return result, bins + unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == 'raise': From b2d26eb97e2c57e8a60ba1f710bf5b702797f002 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 2 Apr 2017 19:10:15 -0400 Subject: [PATCH 05/12] more docs --- doc/source/advanced.rst | 33 +++++++++++++++++++++++++++++++++ doc/source/whatsnew/v0.20.0.txt | 3 ++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git 
a/doc/source/advanced.rst b/doc/source/advanced.rst index 43373fc86c4d1..ea00588ba156f 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -850,6 +850,39 @@ Of course if you need integer based selection, then use ``iloc`` dfir.iloc[0:5] +.. _indexing.intervallindex: + +IntervalIndex +~~~~~~~~~~~~~ + +.. versionadded:: 0.20.0 + +.. warning:: + + These indexing behaviors are provisional and may change in a future version of pandas. + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3, 4]}, + index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) + df + +Label based indexing via ``.loc`` along the edges of an interval works as you would expect, +selecting that particular interval. + +.. ipython:: python + + df.loc[2] + df.loc[[2, 3]] + +If you select a label *contained* within an interval, this will also select the interval. + +.. ipython:: python + + df.loc[2.5] + df.loc[[2.5, 3.5]] + + Miscellaneous indexing FAQ -------------------------- diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6daeb29a6e67e..62fa95479591f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -321,7 +321,8 @@ IntervalIndex ^^^^^^^^^^^^^ pandas has gain an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval -notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`) +notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. The ``IntervalIndex`` allows some unique indexing, see the +:ref:`docs `. 
(:issue:`7640`, :issue:`8625`) **Previous behavior**: From f0e3ad2f6457c79bf5b9c9b3ec83fb8558f00337 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 09:26:57 -0400 Subject: [PATCH 06/12] pep --- pandas/tests/indexes/test_category.py | 3 ++- pandas/tests/test_categorical.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index b8c50239efac3..f2e409deb2ce4 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -348,7 +348,8 @@ def test_astype(self): right=[2, 4], closed='right') - ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True)) + ci = CategoricalIndex(Categorical.from_codes( + [0, 1, -1], categories=ii, ordered=True)) result = ci.astype('interval') expected = ii.take([0, 1, -1]) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8600b2d726e49..dd370f0a20c2e 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1608,11 +1608,12 @@ def setUp(self): self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) self.cat = df def test_dtypes(self): @@ -2019,8 +2020,10 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100),dtype='int32')}) - labels = Categorical(["{0} - {1}".format(i, i + 499) for 
i in range(0, 10000, 500)]) + df = DataFrame({'value': np.array( + np.random.randint(0, 10000, 100), dtype='int32')}) + labels = Categorical(["{0} - {1}".format(i, i + 499) + for i in range(0, 10000, 500)]) df = df.sort_values(by=['value'], ascending=True) s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) @@ -3134,7 +3137,7 @@ def test_slicing(self): df = DataFrame({'value': (np.arange(100) + 1).astype('int64')}) df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) - expected = Series([11, Interval(0, 25)], index=['value','D'], name=10) + expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10) result = df.iloc[10] tm.assert_series_equal(result, expected) @@ -3144,7 +3147,7 @@ def test_slicing(self): result = df.iloc[10:20] tm.assert_frame_equal(result, expected) - expected = Series([9, Interval(0, 25)],index=['value', 'D'], name=8) + expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8) result = df.loc[8] tm.assert_series_equal(result, expected) From 43339378873ba4ee25907691e2531e4f4aaec7a1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 09:28:14 -0400 Subject: [PATCH 07/12] api-types test fixing --- pandas/tests/api/test_types.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index f3fd6332417a1..1d05eda88e265 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -23,7 +23,8 @@ class TestTypes(Base, tm.TestCase): 'is_string_dtype', 'is_signed_integer_dtype', 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', 'is_unsigned_integer_dtype', 'is_period', - 'is_period_dtype', 'is_re', 'is_re_compilable', + 'is_period_dtype', 'is_interval', 'is_interval_dtype', + 'is_re', 'is_re_compilable', 'is_dict_like', 'is_iterator', 'is_file_like', 'is_list_like', 'is_hashable', 'is_named_tuple', 'is_sequence', From 3a3e02e960d7afdf853e44bbd96a536b14dca8cf Mon Sep 17 00:00:00 2001 From: Jeff Reback 
Date: Thu, 6 Apr 2017 09:16:31 -0400 Subject: [PATCH 08/12] sorting example --- pandas/tests/frame/test_sorting.py | 258 ++++++++++++++++------------- 1 file changed, 139 insertions(+), 119 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 5108fc6080866..97171123c4a36 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import print_function - +import random import numpy as np +import pandas as pd from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range, NaT) + date_range, NaT, IntervalIndex) from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -19,45 +20,6 @@ class TestDataFrameSorting(tm.TestCase, TestData): - def test_sort_index(self): - # GH13496 - - frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) - - # axis=0 : sort rows by index labels - unordered = frame.loc[[3, 2, 4, 1]] - result = unordered.sort_index(axis=0) - expected = frame - assert_frame_equal(result, expected) - - result = unordered.sort_index(ascending=False) - expected = frame[::-1] - assert_frame_equal(result, expected) - - # axis=1 : sort columns by column names - unordered = frame.iloc[:, [2, 1, 3, 0]] - result = unordered.sort_index(axis=1) - assert_frame_equal(result, frame) - - result = unordered.sort_index(axis=1, ascending=False) - expected = frame.iloc[:, ::-1] - assert_frame_equal(result, expected) - - def test_sort_index_multiindex(self): - # GH13496 - - # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) - - # MI sort, but no level: sort_level has no effect - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(sort_remaining=False) 
- expected = df.sort_index() - assert_frame_equal(result, expected) - def test_sort(self): frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) @@ -151,21 +113,6 @@ def test_sort_values_inplace(self): expected = frame.sort_values(by=['A', 'B'], ascending=False) assert_frame_equal(sorted_df, expected) - def test_sort_index_categorical_index(self): - - df = (DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')) - .astype('category', categories=list('cab'))}) - .set_index('B')) - - result = df.sort_index() - expected = df.iloc[[4, 0, 1, 5, 2, 3]] - assert_frame_equal(result, expected) - - result = df.sort_index(ascending=False) - expected = df.iloc[[3, 2, 5, 1, 0, 4]] - assert_frame_equal(result, expected) - def test_sort_nan(self): # GH3917 nan = np.nan @@ -291,8 +238,86 @@ def test_stable_descending_multicolumn_sort(self): kind='mergesort') assert_frame_equal(sorted_df, expected) + def test_sort_datetimes(self): + + # GH 3461, argsort / lexsort differences for a datetime column + df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], + columns=['A'], + index=date_range('20130101', periods=9)) + dts = [Timestamp(x) + for x in ['2004-02-11', '2004-01-21', '2004-01-26', + '2005-09-20', '2010-10-04', '2009-05-12', + '2008-11-12', '2010-09-28', '2010-09-28']] + df['B'] = dts[::2] + dts[1::2] + df['C'] = 2. + df['A1'] = 3. + + df1 = df.sort_values(by='A') + df2 = df.sort_values(by=['A']) + assert_frame_equal(df1, df2) + + df1 = df.sort_values(by='B') + df2 = df.sort_values(by=['B']) + assert_frame_equal(df1, df2) + + def test_frame_column_inplace_sort_exception(self): + s = self.frame['A'] + with assertRaisesRegexp(ValueError, "This Series is a view"): + s.sort_values(inplace=True) + + cp = s.copy() + cp.sort_values() # it works! 
+ + def test_sort_nat_values_in_int_column(self): + + # GH 14922: "sorting with large float and multiple columns incorrect" + + # cause was that the int64 value NaT was considered as "na". Which is + # only correct for datetime64 columns. + + int_values = (2, int(NaT)) + float_values = (2.0, -1.797693e308) + + df = DataFrame(dict(int=int_values, float=float_values), + columns=["int", "float"]) + + df_reversed = DataFrame(dict(int=int_values[::-1], + float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0]) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], + float=float_values), columns=["datetime", "float"]) + + df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], + float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0]) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed) + + +class TestDataFrameSortIndexKinds(tm.TestCase, TestData): + def test_sort_index_multicolumn(self): - import random A = np.arange(5).repeat(20) B = np.tile(np.arange(5), 20) random.shuffle(A) @@ -448,78 +473,73 @@ def test_sort_index_level(self): res = df.sort_index(level=['A', 'B'], sort_remaining=False) assert_frame_equal(df, res) - def test_sort_datetimes(self): - - # GH 3461, argsort / lexsort differences for a datetime 
column - df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], - columns=['A'], - index=date_range('20130101', periods=9)) - dts = [Timestamp(x) - for x in ['2004-02-11', '2004-01-21', '2004-01-26', - '2005-09-20', '2010-10-04', '2009-05-12', - '2008-11-12', '2010-09-28', '2010-09-28']] - df['B'] = dts[::2] + dts[1::2] - df['C'] = 2. - df['A1'] = 3. - - df1 = df.sort_values(by='A') - df2 = df.sort_values(by=['A']) - assert_frame_equal(df1, df2) - - df1 = df.sort_values(by='B') - df2 = df.sort_values(by=['B']) - assert_frame_equal(df1, df2) - - def test_frame_column_inplace_sort_exception(self): - s = self.frame['A'] - with assertRaisesRegexp(ValueError, "This Series is a view"): - s.sort_values(inplace=True) - - cp = s.copy() - cp.sort_values() # it works! + def test_sort_index_categorical_index(self): - def test_sort_nat_values_in_int_column(self): + df = (DataFrame({'A': np.arange(6, dtype='int64'), + 'B': Series(list('aabbca')) + .astype('category', categories=list('cab'))}) + .set_index('B')) - # GH 14922: "sorting with large float and multiple columns incorrect" + result = df.sort_index() + expected = df.iloc[[4, 0, 1, 5, 2, 3]] + assert_frame_equal(result, expected) - # cause was that the int64 value NaT was considered as "na". Which is - # only correct for datetime64 columns. 
+ result = df.sort_index(ascending=False) + expected = df.iloc[[3, 2, 5, 1, 0, 4]] + assert_frame_equal(result, expected) - int_values = (2, int(NaT)) - float_values = (2.0, -1.797693e308) + def test_sort_index(self): + # GH13496 - df = DataFrame(dict(int=int_values, float=float_values), - columns=["int", "float"]) + frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) - df_reversed = DataFrame(dict(int=int_values[::-1], - float=float_values[::-1]), - columns=["int", "float"], - index=[1, 0]) + # axis=0 : sort rows by index labels + unordered = frame.loc[[3, 2, 4, 1]] + result = unordered.sort_index(axis=0) + expected = frame + assert_frame_equal(result, expected) - # NaT is not a "na" for int64 columns, so na_position must not - # influence the result: - df_sorted = df.sort_values(["int", "float"], na_position="last") - assert_frame_equal(df_sorted, df_reversed) + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + assert_frame_equal(result, expected) - df_sorted = df.sort_values(["int", "float"], na_position="first") - assert_frame_equal(df_sorted, df_reversed) + # axis=1 : sort columns by column names + unordered = frame.iloc[:, [2, 1, 3, 0]] + result = unordered.sort_index(axis=1) + assert_frame_equal(result, frame) - # reverse sorting order - df_sorted = df.sort_values(["int", "float"], ascending=False) - assert_frame_equal(df_sorted, df) + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.iloc[:, ::-1] + assert_frame_equal(result, expected) - # and now check if NaT is still considered as "na" for datetime64 - # columns: - df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], - float=float_values), columns=["datetime", "float"]) + def test_sort_index_multiindex(self): + # GH13496 - df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], - float=float_values[::-1]), - columns=["datetime", "float"], - index=[1, 0]) + # sort rows by specified level of 
multi-index + mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4]], mi) - df_sorted = df.sort_values(["datetime", "float"], na_position="first") - assert_frame_equal(df_sorted, df_reversed) + # MI sort, but no level: sort_level has no effect + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4]], mi) + result = df.sort_index(sort_remaining=False) + expected = df.sort_index() + assert_frame_equal(result, expected) - df_sorted = df.sort_values(["datetime", "float"], na_position="last") - assert_frame_equal(df_sorted, df_reversed) + def test_sort_index_intervalindex(self): + # this is a de-facto sort via unstack + # confirming that we sort in the order of the bins + y = Series(np.random.randn(100)) + x1 = Series(np.sign(np.random.randn(100))) + x2 = pd.cut(Series(np.random.randn(100)), + bins=[-3, -0.5, 0, 0.5, 3]) + model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) + + result = model.groupby(['X1', 'X2']).mean().unstack() + expected = IntervalIndex.from_tuples( + [(-3.0, -0.5), (-0.5, 0.0), + (0.0, 0.5), (0.5, 3.0)], + closed='right') + result = result.columns.levels[1].categories + tm.assert_index_equal(result, expected) From 7577335d10c24cecf4d8244385dc58fdaa804fa5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 6 Apr 2017 20:29:12 -0400 Subject: [PATCH 09/12] fixup on merge of changes in algorithms.py --- pandas/core/algorithms.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2a2789843207a..5d2db864dd48e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ is_bool_dtype, needs_i8_conversion, is_categorical, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, + is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, _ensure_platform_int, _ensure_object, 
_ensure_float64, _ensure_uint64, @@ -610,19 +610,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False, except TypeError: raise TypeError("bins argument only works with numeric data.") - if is_categorical_dtype(values) or is_sparse(values): - - # handle Categorical and sparse, - result = Series(values).values.value_counts(dropna=dropna) - result.name = name - counts = result.values - - else: - keys, counts = _value_counts_arraylike(values, dropna) - - if not isinstance(keys, Index): - keys = Index(keys) - result = Series(counts, index=keys, name=name) # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) result = result[result.index.notnull()] @@ -636,6 +623,22 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # normalizing is by len of all (regardless of dropna) counts = np.array([len(ii)]) + else: + + if is_categorical_dtype(values) or is_sparse(values): + + # handle Categorical and sparse, + result = Series(values).values.value_counts(dropna=dropna) + result.name = name + counts = result.values + + else: + keys, counts = _value_counts_arraylike(values, dropna) + + if not isinstance(keys, Index): + keys = Index(keys) + result = Series(counts, index=keys, name=name) + if sort: result = result.sort_values(ascending=ascending) From fbc1cf8423aaab475a9ee1bea310be218cfefdad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 16:47:53 -0400 Subject: [PATCH 10/12] doc example and bug --- doc/source/reshaping.rst | 10 +++++++++- pandas/tests/tools/test_tile.py | 12 ++++++++++++ pandas/tools/tile.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 2c5aae133d4d9..b93749922c8ea 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -517,7 +517,15 @@ Alternatively we can specify custom bin-edges: .. 
ipython:: python - pd.cut(ages, bins=[0, 18, 35, 70]) + c = pd.cut(ages, bins=[0, 18, 35, 70]) + c + +.. versionadded:: 0.20.0 + +If the ``bins`` keyword is an ``IntervalIndex``, then these will be +used to bin the passed data. + + pd.cut([25, 20, 50], bins=c.categories) .. _reshaping.dummies: diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index cfe4251891cf5..742568870c3c3 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -71,6 +71,18 @@ def test_bins_from_intervalindex(self): result = cut(range(6), bins=expected.categories) tm.assert_categorical_equal(result, expected) + # doc example + # make sure we preserve the bins + ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) + c = cut(ages, bins=[0, 18, 35, 70]) + expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)]) + tm.assert_index_equal(c.categories, expected) + + result = cut([25, 20, 50], bins=c.categories) + tm.assert_index_equal(result.categories, expected) + tm.assert_numpy_array_equal(result.codes, + np.array([1, 1, 2], dtype='int8')) + def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] self.assertRaises(ValueError, cut, data, [0.1, 1.5, 1, 10]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 33c7b0481da4d..2a258d4a7b7e5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -220,7 +220,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, # we have a fast-path here ids = bins.get_indexer(x) result = algos.take_nd(bins, ids) - result = Categorical(result, ordered=True) + result = Categorical(result, categories=bins, ordered=True) return result, bins unique_bins = algos.unique(bins) From 834df76794a682ff163747882109776d31ff75d6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 16:59:04 -0400 Subject: [PATCH 11/12] more docs --- doc/source/whatsnew/v0.20.0.txt | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git 
a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 62fa95479591f..18048b11bc48b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -320,11 +320,11 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you IntervalIndex ^^^^^^^^^^^^^ -pandas has gain an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval -notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. The ``IntervalIndex`` allows some unique indexing, see the +pandas has gained an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval +notation, specifically as a return type for the categories in ``pd.cut`` and ``pd.qcut``. The ``IntervalIndex`` allows some unique indexing, see the :ref:`docs `. (:issue:`7640`, :issue:`8625`) -**Previous behavior**: +Previous behavior: .. code-block:: ipython @@ -337,14 +337,40 @@ notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. The ``Inte In [3]: pd.cut(range(3), 2).categories Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object') -**New behavior**: +New behavior: .. ipython:: python - c = pd.cut(range(3), 2) + c = pd.cut(range(4), bins=2) c c.categories - pd.api.types.is_interval_dtype(c.categories) + +Furthermore, this allows one to bin *other* data with these same bins. ``NaN`` represents a missing +value similar to other dtypes. + +.. ipython:: python + + pd.cut([0, 3, 1, 1], bins=c.categories) + +These can also be used in ``Series`` and ``DataFrame``, and indexed. + +.. ipython:: python + + df = pd.DataFrame({'A': range(4), + 'B': pd.cut([0, 3, 1, 1], bins=c.categories)} + ).set_index('B') + +Selecting a specific interval + +.. ipython:: python + + df.loc[pd.Interval(1.5, 3.0)] + +Selecting via a scalar value that is contained in the intervals. + +.. ipython:: python + + df.loc[0] .. 
_whatsnew_0200.enhancements.other: From 11ab1e15fd6e1ec9714bec86d8a36633c98802b8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 11 Apr 2017 08:16:13 -0400 Subject: [PATCH 12/12] merge conflicts --- pandas/tests/test_algos.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 19c9b69ff1988..cd1ec915d3aeb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1,12 +1,10 @@ # -*- coding: utf-8 -*- -from pandas.compat import range import numpy as np from numpy.random import RandomState from numpy import nan from datetime import datetime from itertools import permutations -from pandas import (Series, Categorical, CategoricalIndex, Index, from pandas import (Series, Categorical, CategoricalIndex, Timestamp, DatetimeIndex, Index, IntervalIndex) @@ -16,7 +14,7 @@ from pandas._libs import (groupby as libgroupby, algos as libalgos, hashtable) from pandas._libs.hashtable import unique_label_indices -from pandas.compat import lrange +from pandas.compat import lrange, range import pandas.core.algorithms as algos import pandas.util.testing as tm from pandas.compat.numpy import np_array_datetime64_compat