From 18d89a3f2fcd69cdfbf728730c774e405c3dd027 Mon Sep 17 00:00:00 2001 From: Benjamin Root Date: Fri, 20 Jul 2018 17:56:17 -0400 Subject: [PATCH 1/5] Add 'tolerance' attribute to much of Index internals --- pandas/core/dtypes/missing.py | 17 ++- pandas/core/indexes/accessors.py | 1 + pandas/core/indexes/api.py | 26 ++-- pandas/core/indexes/base.py | 206 ++++++++++++++++++++-------- pandas/core/indexes/category.py | 27 ++-- pandas/core/indexes/datetimelike.py | 9 +- pandas/core/indexes/datetimes.py | 50 ++++--- pandas/core/indexes/interval.py | 46 ++++--- pandas/core/indexes/multi.py | 61 ++++++-- pandas/core/indexes/numeric.py | 17 ++- pandas/core/indexes/period.py | 33 +++-- pandas/core/indexes/range.py | 14 +- pandas/core/indexes/timedeltas.py | 60 +++++--- pandas/tests/indexes/test_base.py | 12 +- 14 files changed, 396 insertions(+), 183 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 66998aa6866f6..cf0f210aac21c 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -369,7 +369,7 @@ def _isna_compat(arr, fill_value=np.nan): return True -def array_equivalent(left, right, strict_nan=False): +def array_equivalent(left, right, strict_nan=False, tolerance=None): """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and @@ -409,7 +409,7 @@ def array_equivalent(left, right, strict_nan=False): # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left) or is_string_dtype(right): - + # FIXME: intolerant if not strict_nan: # isna considers NaN and None to be equivalent. return lib.array_equivalent_object( @@ -434,9 +434,14 @@ def array_equivalent(left, right, strict_nan=False): # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True - return ((left == right) | (isna(left) & isna(right))).all() + if tolerance is None: + return ((left == right) | (isna(left) & isna(right))).all() + else: + return ((np.abs(left - right) <= tolerance) | + (isna(left) & isna(right))).all() # numpy will will not allow this type of datetimelike vs integer comparison + # FIXME: intolerant elif is_datetimelike_v_numeric(left, right): return False @@ -454,7 +459,11 @@ def array_equivalent(left, right, strict_nan=False): if left.dtype != right.dtype: return False - return np.array_equal(left, right) + if tolerance is None: + return np.array_equal(left, right) + else: + # The remaining dtypes left won't have NaNs to worry about. + return np.allclose(left, right, atol=tolerance, equal_nan=False) def _infer_fill_value(val): diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index d7b4ea63cd48c..2c108ea2a089a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -31,6 +31,7 @@ def __init__(self, data, orig): self.orig = orig self.name = getattr(data, 'name', None) self.index = getattr(data, 'index', None) + self.tolerance = getattr(data, 'tolerance', None) self._freeze() def _get_values(self): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 3f3448d104165..ffb7b4ffddb2d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -53,19 +53,21 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False, sort=False): +def _get_combined_index(indexes, intersect=False, sort=False, + tolerance=None): # TODO: handle index names! indexes = com.get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index = Index([], tolerance=tolerance) elif len(indexes) == 1: index = indexes[0] elif intersect: index = indexes[0] + tolerance = index._choose_tolerance(indexes[1:], tolerance) for other in indexes[1:]: - index = index.intersection(other) + index = index.intersection(other, tolerance=tolerance) else: - index = _union_indexes(indexes, sort=sort) + index = _union_indexes(indexes, sort=sort, tolerance=tolerance) index = ensure_index(index) if sort: @@ -76,17 +78,20 @@ def _get_combined_index(indexes, intersect=False, sort=False): return index -def _union_indexes(indexes, sort=True): +def _union_indexes(indexes, sort=True, tolerance=None): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: result = indexes[0] + if tolerance is None: + tolerance = getattr(result, 'tolerance', None) if isinstance(result, list): - result = Index(sorted(result)) + result = Index(sorted(result), tolerance=tolerance) return result indexes, kind = _sanitize_and_check(indexes) + # FIXME: intolerant def _unique_indices(inds): def conv(i): if isinstance(i, Index): @@ -96,19 +101,20 @@ def conv(i): return Index( lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) + tolerance = indexes[0]._choose_tolerance(indexes[1:], tolerance) if kind == 'special': result = indexes[0] if hasattr(result, 'union_many'): - return result.union_many(indexes[1:]) + return result.union_many(indexes[1:], tolerance=tolerance) else: for other in indexes[1:]: - result = result.union(other) + result = result.union(other, tolerance=tolerance) return result elif kind == 'array': index = indexes[0] for other in indexes[1:]: - if not index.equals(other): + if not index.equals(other, tolerance=tolerance): if sort is None: # TODO: remove once pd.concat sort default changes @@ -119,7 +125,7 @@ def conv(i): name = _get_consensus_names(indexes)[0] if name != index.name: - index = index._shallow_copy(name=name) + index = index._shallow_copy(name=name, tolerance=tolerance) return index else: # kind='list' return _unique_indices(indexes) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 20926ea5163af..48c312cba9ecc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -188,6 +188,11 @@ class Index(IndexOpsMixin, PandasObject): Name to be stored in the index tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible + tolerance : scalar of compatible dtype as data (default: None) + When not None, this parameter is used for inexact indexing. + Most useful for indexing a Float64Index. + This feature is not fully supported in all Indexer types, + such as categorical and range indexers. Notes ----- @@ -223,9 +228,10 @@ class Index(IndexOpsMixin, PandasObject): _data = None _id = None name = None + tolerance = None asi8 = None _comparables = ['name'] - _attributes = ['name'] + _attributes = ['name', 'tolerance'] _is_numeric_dtype = False _can_hold_na = True @@ -243,22 +249,23 @@ class Index(IndexOpsMixin, PandasObject): str = CachedAccessor("str", StringMethods) def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, tupleize_cols=True, **kwargs): + fastpath=False, tupleize_cols=True, tolerance=None, **kwargs): if name is None and hasattr(data, 'name'): name = data.name if fastpath: - return cls._simple_new(data, name) + return cls._simple_new(data, name, tolerance=tolerance) from .range import RangeIndex # range if isinstance(data, RangeIndex): - return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) + return RangeIndex(start=data, copy=copy, dtype=dtype, name=name, + tolerance=tolerance) elif isinstance(data, range): return RangeIndex.from_range(data, copy=copy, dtype=dtype, - name=name) + name=name, tolerance=tolerance) # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): @@ -272,7 +279,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from .interval import IntervalIndex closed = kwargs.get('closed', None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, - closed=closed) + closed=closed, tolerance=tolerance) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -296,7 +303,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, 'tz' in kwargs): from pandas import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + dtype=dtype, tolerance=tolerance, + **kwargs) if dtype is not None and is_dtype_equal(_o_dtype, dtype): return Index(result.to_pydatetime(), dtype=_o_dtype) else: @@ -305,7 +313,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif (is_timedelta64_dtype(data) or (dtype is not None and is_timedelta64_dtype(dtype))): from pandas import TimedeltaIndex - result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) + result = TimedeltaIndex(data, copy=copy, name=name, + tolerance=tolerance, **kwargs) if dtype is not None and _o_dtype == dtype: return Index(result.to_pytimedelta(), dtype=_o_dtype) else: @@ -336,14 +345,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. try: return cls._try_convert_to_int_index( - data, copy, name, dtype) + data, copy, name, dtype, + tolerance=tolerance) except ValueError: pass # Return an actual float index. from .numeric import Float64Index return Float64Index(data, copy=copy, dtype=dtype, - name=name) + name=name, tolerance=tolerance) elif inferred == 'string': pass @@ -368,16 +378,20 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from pandas.core.indexes.period import ( PeriodIndex, IncompatibleFrequency) if isinstance(data, PeriodIndex): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + return PeriodIndex(data, copy=copy, name=name, + tolerance=tolerance, **kwargs) if is_signed_integer_dtype(data.dtype): from .numeric import Int64Index - return Int64Index(data, copy=copy, dtype=dtype, name=name) + return Int64Index(data, copy=copy, dtype=dtype, name=name, + tolerance=tolerance) elif is_unsigned_integer_dtype(data.dtype): from .numeric import UInt64Index - return UInt64Index(data, copy=copy, dtype=dtype, name=name) + return UInt64Index(data, copy=copy, dtype=dtype, name=name, + tolerance=tolerance) elif is_float_dtype(data.dtype): from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) + return Float64Index(data, copy=copy, dtype=dtype, name=name, + tolerance=tolerance) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') else: @@ -393,18 +407,21 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if inferred == 'integer': try: return cls._try_convert_to_int_index( - subarr, copy, name, dtype) + subarr, copy, name, dtype, + tolerance=tolerance) except ValueError: pass return Index(subarr, copy=copy, - dtype=object, name=name) + dtype=object, name=name, tolerance=tolerance) elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index - return Float64Index(subarr, copy=copy, name=name) + return Float64Index(subarr, copy=copy, name=name, + tolerance=tolerance) elif inferred == 'interval': from .interval import IntervalIndex - return IntervalIndex(subarr, name=name, copy=copy) + return IntervalIndex(subarr, name=name, copy=copy, + tolerance=tolerance) elif inferred == 'boolean': # don't support boolean explicitly ATM pass @@ -416,24 +433,28 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from pandas import DatetimeIndex try: return DatetimeIndex(subarr, copy=copy, - name=name, **kwargs) + name=name, + tolerance=tolerance, + **kwargs) except tslibs.OutOfBoundsDatetime: pass elif inferred.startswith('timedelta'): from pandas import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, + tolerance=tolerance, **kwargs) elif inferred == 'period': try: - return PeriodIndex(subarr, name=name, **kwargs) + return PeriodIndex(subarr, name=name, + tolerance=tolerance, **kwargs) except IncompatibleFrequency: pass return cls._simple_new(subarr, name) elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, - **kwargs) + tolerance=tolerance, **kwargs) elif data is None or is_scalar(data): cls._scalar_data_error(data) else: @@ -450,7 +471,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, data, names=name or kwargs.get('names')) # other iterable of some kind subarr = com.asarray_tuplesafe(data, dtype=object) - return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) + return Index(subarr, dtype=dtype, copy=copy, name=name, + tolerance=tolerance, **kwargs) """ NOTE for new Index creation: @@ -474,7 +496,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ - we require the we have a dtype compat for the values + we require that we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor Must be careful not to recurse. @@ -853,7 +875,8 @@ def ravel(self, order='C'): # construction helpers @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): + def _try_convert_to_int_index(cls, data, copy, name, dtype, + tolerance=None): """ Attempt to convert an array of data into an integer index. @@ -862,6 +885,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): data : The data to convert. copy : Whether to copy the data or not. name : The name of the index returned. + tolerance : Tolerance, if any, for index matching Returns ------- @@ -880,7 +904,8 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): try: res = data.astype('i8', copy=False) if (res == data).all(): - return Int64Index(res, copy=copy, name=name) + return Int64Index(res, copy=copy, name=name, + tolerance=tolerance) except (OverflowError, TypeError, ValueError): pass @@ -889,7 +914,8 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): try: res = data.astype('u8', copy=False) if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) + return UInt64Index(res, copy=copy, name=name, + tolerance=tolerance) except (OverflowError, TypeError, ValueError): pass @@ -1949,7 +1975,10 @@ def __contains__(self, key): return False _index_shared_docs['contains'] = """ - return a boolean if this key is IN the index + return a boolean if this key is IN the index. + + This method does *not* utilize the index's tolerance attribute + if it is defined. Parameters ---------- @@ -2352,7 +2381,29 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): values[mask] = na_rep return values - def equals(self, other): + def _choose_tolerance(self, others, tolerance=None): + """ + Negotiate which tolerance to use. Use the passed in + tolerance if not None. Next, use the greater tolerance + of the indexers' tolerances, discarding None tolerances. + If all else fails, then return None. + + others is a list/tuple of Indexes. + """ + if tolerance is not None: + return tolerance + + # Cast to tuple + others = tuple(others) + + all_non_none_tolerances = [i.tolerance for i in (self,) + others + if i.tolerance is not None] + if len(all_non_none_tolerances) == 0: + return None + else: + return max(all_non_none_tolerances) + + def equals(self, other, tolerance=None): """ Determines if two Index objects contain the same elements. """ @@ -2362,13 +2413,16 @@ def equals(self, other): if not isinstance(other, Index): return False + tolerance = self._choose_tolerance([other], tolerance=tolerance) + if is_object_dtype(self) and not is_object_dtype(other): # if other is not object, use other's logic for coercion - return other.equals(self) + return other.equals(self, tolerance=tolerance) try: return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) + com.values_from_object(other), + tolerance=tolerance) except Exception: return False @@ -2679,7 +2733,7 @@ def _get_consensus_name(self, other): return self._shallow_copy(name=name) return self - def union(self, other): + def union(self, other, tolerance=None): """ Form the union of two Index objects and sorts if possible. @@ -2703,11 +2757,16 @@ def union(self, other): self._assert_can_do_setop(other) other = ensure_index(other) - if len(other) == 0 or self.equals(other): - return self._get_consensus_name(other) + tolerance = self._choose_tolerance([other], tolerance=tolerance) + if len(other) == 0 or self.equals(other, tolerance=tolerance): + new_index = self._get_consensus_name(other) + new_index.tolerance = tolerance + return new_index if len(self) == 0: - return other._get_consensus_name(self) + new_index = other._get_consensus_name(self) + new_index.tolerance = tolerance + return new_index # TODO: is_dtype_union_equal is a hack around # 1. buggy set ops with duplicates (GH #13432) @@ -2716,7 +2775,7 @@ def union(self, other): if not is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') - return this.union(other) + return this.union(other, tolerance=tolerance) # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): @@ -2729,6 +2788,7 @@ def union(self, other): rvals = other._values if self.is_monotonic and other.is_monotonic: + # FIXME intolerant try: result = self._outer_indexer(lvals, rvals)[0] except TypeError: @@ -2739,7 +2799,7 @@ def union(self, other): value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) else: - indexer = self.get_indexer(other) + indexer = self.get_indexer(other, tolerance=tolerance) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: @@ -2770,13 +2830,13 @@ def union(self, other): stacklevel=3) # for subclasses - return self._wrap_union_result(other, result) + return self._wrap_union_result(other, result, tolerance) - def _wrap_union_result(self, other, result): + def _wrap_union_result(self, other, result, tolerance=None): name = self.name if self.name == other.name else None - return self.__class__(result, name=name) + return self.__class__(result, name=name, tolerance=tolerance) - def intersection(self, other): + def intersection(self, other, tolerance=None): """ Form the intersection of two Index objects. @@ -2803,13 +2863,17 @@ def intersection(self, other): self._assert_can_do_setop(other) other = ensure_index(other) - if self.equals(other): - return self._get_consensus_name(other) + tolerance = self._choose_tolerance([other], tolerance=tolerance) + + if self.equals(other, tolerance=tolerance): + new_index = self._get_consensus_name(other) + new_index.tolerance = tolerance + return new_index if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') - return this.intersection(other) + return this.intersection(other, tolerance=tolerance) # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self): @@ -2822,17 +2886,19 @@ def intersection(self, other): rvals = other._values if self.is_monotonic and other.is_monotonic: + # FIXME: intolerant try: result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_union_result(other, result) + return self._wrap_union_result(other, result, tolerance) except TypeError: pass try: - indexer = Index(rvals).get_indexer(lvals) + indexer = Index(rvals).get_indexer(lvals, tolerance=tolerance) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates + # FIXME: get_indexer_non_unique() is intolerant indexer = algos.unique1d( Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] @@ -2842,7 +2908,7 @@ def intersection(self, other): taken.name = None return taken - def difference(self, other): + def difference(self, other, tolerance=None): """ Return a new Index with elements from the index that are not in `other`. @@ -2868,15 +2934,15 @@ def difference(self, other): """ self._assert_can_do_setop(other) - - if self.equals(other): - return self._shallow_copy([]) - other, result_name = self._convert_can_do_setop(other) + tolerance = self._choose_tolerance([other], tolerance=tolerance) + + if self.equals(other, tolerance=tolerance): + return self._shallow_copy([], tolerance=tolerance) this = self._get_unique_index() - indexer = this.get_indexer(other) + indexer = this.get_indexer(other, tolerance=tolerance) indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, @@ -2887,9 +2953,10 @@ def difference(self, other): except TypeError: pass - return this._shallow_copy(the_diff, name=result_name, freq=None) + return this._shallow_copy(the_diff, name=result_name, freq=None, + tolerance=tolerance) - def symmetric_difference(self, other, result_name=None): + def symmetric_difference(self, other, result_name=None, tolerance=None): """ Compute the symmetric difference of two Index objects. It's sorted if sorting is possible. @@ -2927,9 +2994,11 @@ def symmetric_difference(self, other, result_name=None): if result_name is None: result_name = result_name_update + tolerance = self._choose_tolerance([other], tolerance=tolerance) + this = self._get_unique_index() other = other._get_unique_index() - indexer = this.get_indexer(other) + indexer = this.get_indexer(other, tolerance=tolerance) # {this} minus {other} common_indexer = indexer.take((indexer != -1).nonzero()[0]) @@ -2951,6 +3020,7 @@ def symmetric_difference(self, other, result_name=None): attribs['name'] = result_name if 'freq' in attribs: attribs['freq'] = None + attribs['tolerance'] = tolerance return self._shallow_copy_with_infer(the_diff, **attribs) def _get_unique_index(self, dropna=False): @@ -2990,7 +3060,8 @@ def _get_unique_index(self, dropna=False): ---------- key : label method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. + * default: exact matches only (unless tolerance is + greater than zero) * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied @@ -2999,6 +3070,9 @@ def _get_unique_index(self, dropna=False): Maximum distance from index value for inexact matches. The value of the index at the matching location most satisfy the equation ``abs(index[loc] - key) <= tolerance``. + If not specified, then the tolerance already assigned to + this indexer is used. + FIXME: what to do if the method does not support tolerances? Tolerance may be a scalar value, which applies the same tolerance to all values, or @@ -3007,6 +3081,7 @@ def _get_unique_index(self, dropna=False): the index and its dtype must exactly match the index's type. .. versionadded:: 0.21.0 (list-like tolerance) + .. versionadded:: ??? (default to tolerance attribute) Returns ------- @@ -3029,10 +3104,12 @@ def _get_unique_index(self, dropna=False): @Appender(_index_shared_docs['get_loc']) def get_loc(self, key, method=None, tolerance=None): + if tolerance is None: + tolerance = self.tolerance + if tolerance is not None and method is None and tolerance > 0: + # Force nearest matches + method = 'nearest' if method is None: - if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') try: return self._engine.get_loc(key) except KeyError: @@ -3214,7 +3291,8 @@ def droplevel(self, level=0): ---------- target : %(target_klass)s method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. + * default: exact matches only (unless tolerance is + greater than zero) * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied @@ -3226,6 +3304,8 @@ def droplevel(self, level=0): Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + If not specified, then the tolerance assigned to this indexer is + used. Tolerance may be a scalar value, which applies the same tolerance to all values, or list-like, which applies variable tolerance per @@ -3234,6 +3314,7 @@ def droplevel(self, level=0): index's type. .. versionadded:: 0.21.0 (list-like tolerance) + .. versionadded:: ???? (defer to tolerance attribute) Returns ------- @@ -3255,10 +3336,15 @@ def droplevel(self, level=0): @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): + if tolerance is None: + tolerance = self.tolerance method = missing.clean_reindex_fill_method(method) target = ensure_index(target) if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) + if tolerance is not None and method is None and tolerance > 0: + # Force nearest matches + method = 'nearest' # Treat boolean labels passed to a numeric index as not found. Without # this fix False and True would be treated as 0 and 1 respectively. @@ -3649,7 +3735,7 @@ def reindex(self, target, method=None, level=None, limit=None, _, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) else: - if self.equals(target): + if self.equals(target, tolerance=tolerance): indexer = None else: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d76a7ef00f625..bd6a3d463837d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -73,13 +73,12 @@ class CategoricalIndex(Index, accessor.PandasDelegate): _typ = 'categoricalindex' _engine_type = libindex.Int64Engine - _attributes = ['name'] + _attributes = ['name', 'tolerance'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, - copy=False, name=None, fastpath=False): - + copy=False, name=None, fastpath=False, tolerance=None): if fastpath: - return cls._simple_new(data, name=name, dtype=dtype) + return cls._simple_new(data, name=name, dtype=dtype, tolerance=tolerance) if name is None and hasattr(data, 'name'): name = data.name @@ -105,7 +104,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if copy: data = data.copy() - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, tolerance=tolerance) def _create_from_codes(self, codes, categories=None, ordered=None, name=None): @@ -176,9 +175,13 @@ def _create_categorical(cls, data, categories=None, ordered=None, @classmethod def _simple_new(cls, values, name=None, categories=None, ordered=None, - dtype=None, **kwargs): + dtype=None, tolerance=None, **kwargs): result = object.__new__(cls) + if tolerance is not None: + raise ValueError("CategoricalIndex does not support non-None" + " tolerances!") + values = cls._create_categorical(values, categories, ordered, dtype=dtype) result._data = values @@ -243,7 +246,7 @@ def _is_dtype_compat(self, other): return other - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two CategorialIndex objects contain the same elements. """ @@ -253,6 +256,11 @@ def equals(self, other): if not isinstance(other, Index): return False + tolerance = self._choose_tolerance([other], tolerance=tolerance) + if tolerance is not None: + raise ValueError("CategoricalIndex does not support a non-None" + " tolerance") + try: other = self._is_dtype_compat(other) return array_equivalent(self._data, other) @@ -401,7 +409,7 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype('object') - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get integer location, slice or boolean mask for requested label. @@ -429,6 +437,9 @@ def get_loc(self, key, method=None): >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ + if tolerance is not None: + raise ValueError("CategoricalIndex does not support a non-None" + " tolerance") codes = self.categories.get_loc(key) if (codes == -1): raise KeyError(key) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8f05a9a887830..7b16dd52cbefd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -243,7 +243,7 @@ class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget) resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two Index objects contain the same elements. """ @@ -269,7 +269,12 @@ def equals(self, other): if self.freq != other.freq: return False - return np.array_equal(self.asi8, other.asi8) + tolerance = self._choose_tolerance([other], tolerance) + if tolerance is None: + return np.array_equal(self.asi8, other.asi8) + else: + # FIXME: Probably need to convert tolerance? + return np.allclose(self.asi8, other.asi8, atol=tolerance) @staticmethod def _join_i8_wrapper(joinf, dtype, with_indexers=True): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 933e7406b5af3..855961b164207 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -251,7 +251,7 @@ def _add_comparison_methods(cls): tz = None _freq = None _comparables = ['name', 'freqstr', 'tz'] - _attributes = ['name', 'freq', 'tz'] + _attributes = ['name', 'freq', 'tz', 'tolerance'] # define my properties & methods for delegation _bool_ops = ['is_month_start', 'is_month_end', @@ -279,7 +279,8 @@ def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, tz=None, normalize=False, closed=None, ambiguous='raise', dayfirst=False, yearfirst=False, dtype=None, - copy=False, name=None, verify_integrity=True): + copy=False, name=None, verify_integrity=True, + tolerance=None): # This allows to later ensure that the 'copy' parameter is honored: if isinstance(data, Index): @@ -303,7 +304,8 @@ def __new__(cls, data=None, periods = dtl.validate_periods(periods) return cls._generate_range(start, end, periods, name, freq, tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) + closed=closed, ambiguous=ambiguous, + tolerance=tolerance) if not isinstance(data, (np.ndarray, Index, ABCSeries)): if is_scalar(data): @@ -356,7 +358,8 @@ def __new__(cls, data=None, data = data.astype(np.int64, copy=False) subarr = data.view(_NS_DTYPE) - subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) + subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz, + tolerance=tolerance) if dtype is not None: if not is_dtype_equal(subarr.dtype, dtype): # dtype must be coerced to DatetimeTZDtype above @@ -376,7 +379,8 @@ def __new__(cls, data=None, @classmethod def _generate_range(cls, start, end, periods, name, freq, tz=None, - normalize=False, ambiguous='raise', closed=None): + normalize=False, ambiguous='raise', closed=None, + tolerance=None): if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') @@ -501,7 +505,8 @@ def _generate_range(cls, start, end, periods, name, freq, tz=None, if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index.values, name=name, freq=freq, tz=tz) + index = cls._simple_new(index.values, name=name, freq=freq, tz=tz, + tolerance=tolerance) return index @@ -698,6 +703,7 @@ def __setstate__(self, state): data = np.empty(state) np.ndarray.__setstate__(data, state) + self.tolerance = None self._data = data self._reset_identity() @@ -855,7 +861,8 @@ def to_period(self, freq=None): freq = get_period_alias(freq) - return PeriodIndex(self.values, name=self.name, freq=freq, tz=self.tz) + return PeriodIndex(self.values, name=self.name, freq=freq, tz=self.tz, + tolerance=self.tolerance) def snap(self, freq='S'): """ @@ -892,9 +899,9 @@ def unique(self, level=None): naive = self result = super(DatetimeIndex, naive).unique(level=level) return self._simple_new(result.values, name=self.name, tz=self.tz, - freq=self.freq) + freq=self.freq, tolerance=self.tolerance) - def union(self, other): + def union(self, other, tolerance=None): """ Specialized union for DatetimeIndex objects. If combine overlapping ranges with the same DateOffset, will be much @@ -908,6 +915,7 @@ def union(self, other): ------- y : Index or DatetimeIndex """ + # FIXME: intolerant self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: @@ -917,10 +925,12 @@ def union(self, other): this, other = self._maybe_utc_convert(other) + tolerance = this._choose_tolerance([other], tolerance=tolerance) + if this._can_fast_union(other): - return this._fast_union(other) + return this._fast_union(other, tolerance) else: - result = Index.union(this, other) + result = Index.union(this, other, tolerance=tolerance) if isinstance(result, DatetimeIndex): result._tz = timezones.tz_standardize(this.tz) if (result.freq is None and @@ -1009,17 +1019,17 @@ def _maybe_utc_convert(self, other): other = other.tz_convert('UTC') return this, other - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): name = self.name if self.name == other.name else None if (isinstance(other, DatetimeIndex) and self.freq == other.freq and self._can_fast_union(other)): - joined = self._shallow_copy(joined) + joined = self._shallow_copy(joined, tolerance=tolerance) joined.name = name return joined else: tz = getattr(other, 'tz', None) - return self._simple_new(joined, name, tz=tz) + return self._simple_new(joined, name, tz=tz, tolerance=tolerance) def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): @@ -1054,7 +1064,8 @@ def _can_fast_union(self, other): # this will raise return False - def _fast_union(self, other): + def _fast_union(self, other, tolerance): + # FIXME: intolerant if len(other) == 0: return self.view(type(self)) @@ -1076,7 +1087,7 @@ def _fast_union(self, other): loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] dates = _concat._concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + return self._shallow_copy(dates, tolerance=tolerance) else: return left else: @@ -1084,13 +1095,14 @@ def _fast_union(self, other): end=max(left_end, right_end), freq=left.freq) - def _wrap_union_result(self, other, result): + def _wrap_union_result(self, other, result, tolerance): name = self.name if self.name == other.name else None if not timezones.tz_compare(self.tz, other.tz): raise ValueError('Passed item and index have different timezone') - return self._simple_new(result, name=name, freq=None, tz=self.tz) + return self._simple_new(result, name=name, freq=None, tz=self.tz, + tolerance=tolerance) - def intersection(self, other): + def intersection(self, other, tolerance=None): """ Specialized intersection for DatetimeIndex objects. May be much faster than Index.intersection diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 246bd3d541b72..78d649a8bfc99 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -53,6 +53,10 @@ name : object, optional to be stored in the index. """), + tolerance=textwrap.dedent("""\ + tolerance : optional + default tolerance, if any, to apply to indexing operations. + """), )) @@ -111,6 +115,7 @@ def _new_IntervalIndex(cls, d): klass="IntervalIndex", summary="Immutable index of intervals that are closed on the same side.", name=_index_doc_kwargs['name'], + tolerance=_index_doc_kwargs['tolerance'], versionadded="0.20.0", extra_methods="contains\n", examples=textwrap.dedent("""\ @@ -137,7 +142,7 @@ def _new_IntervalIndex(cls, d): class IntervalIndex(IntervalMixin, Index): _typ = 'intervalindex' _comparables = ['name'] - _attributes = ['name', 'closed'] + _attributes = ['name', 'closed', 'tolerance'] # we would like our indexing holder to defer to us _defer_to_indexing = True @@ -146,10 +151,11 @@ class IntervalIndex(IntervalMixin, Index): _mask = None def __new__(cls, data, closed=None, dtype=None, copy=False, - name=None, fastpath=False, verify_integrity=True): + name=None, fastpath=False, verify_integrity=True, + tolerance=None): if fastpath: - return cls._simple_new(data, name) + return cls._simple_new(data, name, tolerance=tolerance) if name is None and hasattr(data, 'name'): name = data.name @@ -159,10 +165,10 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, fastpath=fastpath, verify_integrity=verify_integrity) - return cls._simple_new(array, name) + return cls._simple_new(array, name, tolerance=tolerance) @classmethod - def _simple_new(cls, array, name, closed=None): + def _simple_new(cls, array, name, closed=None, tolerance=None): """ Construct from an IntervalArray @@ -176,6 +182,7 @@ def _simple_new(cls, array, name, closed=None): """ result = IntervalMixin.__new__(cls) result._data = array + result.tolerance = tolerance result.name = name result._reset_identity() return result @@ -256,25 +263,25 @@ def contains(self, key): @classmethod @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): + dtype=None, tolerance=None): with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, dtype=dtype) - return cls._simple_new(array, name=name) + return cls._simple_new(array, name=name, tolerance=tolerance) @classmethod @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): + dtype=None, tolerance=None): with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray.from_arrays(left, right, closed, copy=copy, dtype=dtype) - return cls._simple_new(array, name=name) + return cls._simple_new(array, name=name, tolerance=tolerance) @classmethod @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): + dtype=None, tolerance=None): msg = ('IntervalIndex.from_intervals is deprecated and will be ' 'removed in a future version; Use IntervalIndex(...) instead') warnings.warn(msg, FutureWarning, stacklevel=2) @@ -284,16 +291,16 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, if name is None and isinstance(data, cls): name = data.name - return cls._simple_new(array, name=name) + return cls._simple_new(array, name=name, tolerance=tolerance) @classmethod @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): + dtype=None, tolerance=None): with rewrite_exception("IntervalArray", cls.__name__): arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) - return cls._simple_new(arr, name=name) + return cls._simple_new(arr, name=name, tolerance=tolerance) @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", @@ -607,7 +614,7 @@ def _find_non_overlapping_monotonic_bounds(self, key): stop = self._searchsorted_monotonic(key, 'right') return start, stop - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. Parameters @@ -643,6 +650,7 @@ def get_loc(self, key, method=None): >>> overlapping_index.get_loc(1.5) array([0, 1], dtype=int64) """ + # FIXME: mostly intolerant self._check_method(method) original_key = key @@ -652,7 +660,7 @@ def get_loc(self, key, method=None): if isinstance(key, Interval): left = self._maybe_cast_slice_bound(key.left, 'left', None) right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed) + key = Interval(left, right, key.closed, tolerance=tolerance) else: key = self._maybe_cast_slice_bound(key, 'left', None) @@ -959,7 +967,7 @@ def _format_space(self): def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two IntervalIndex objects contain the same elements """ @@ -971,10 +979,12 @@ def equals(self, other): if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False + # FIXME: '.values'??? other = Index(getattr(other, '.values', other)) + tolerance = self._choose_tolerance([other], tolerance) - return (self.left.equals(other.left) and - self.right.equals(other.right) and + return (self.left.equals(other.left, tolerance=tolerance) and + self.right.equals(other.right, tolerance=tolerance) and self.closed == other.closed) def _setop(op_name): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7d24a901382bb..65112c3d6e366 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -211,7 +211,8 @@ class MultiIndex(Index): def __new__(cls, levels=None, labels=None, sortorder=None, names=None, dtype=None, copy=False, name=None, - verify_integrity=True, _set_identity=True): + verify_integrity=True, _set_identity=True, + tolerance=None): # compat with Index if name is not None: @@ -222,12 +223,15 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, raise ValueError('Length of levels and labels must be the same.') if len(levels) == 0: raise ValueError('Must pass non-zero number of levels/labels') + if tolerance is not None: + raise ValueError("MultiIndex does not support non-None tolerances yet") result = object.__new__(MultiIndex) # we've already validated levels and labels, so shortcut here result._set_levels(levels, copy=copy, validate=False) result._set_labels(labels, copy=copy, validate=False) + result.tolerance = tolerance if names is not None: # handles name validation @@ -521,7 +525,8 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, labels = self.labels return MultiIndex(levels=levels, labels=labels, names=names, sortorder=self.sortorder, verify_integrity=False, - _set_identity=_set_identity) + _set_identity=_set_identity, + tolerance=self.tolerance) def __array__(self, dtype=None): """ the array interface, return my values """ @@ -1235,7 +1240,8 @@ def lexsort_depth(self): return 0 @classmethod - def from_arrays(cls, arrays, sortorder=None, names=None): + def from_arrays(cls, arrays, sortorder=None, names=None, + tolerance=None): """ Convert arrays to MultiIndex @@ -1281,10 +1287,11 @@ def from_arrays(cls, arrays, sortorder=None, names=None): names = [getattr(arr, "name", None) for arr in arrays] return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names, verify_integrity=False) + names=names, verify_integrity=False, + tolerance=tolerance) @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): + def from_tuples(cls, tuples, sortorder=None, names=None, tolerance=None): """ Convert list of tuples to MultiIndex @@ -1332,10 +1339,12 @@ def from_tuples(cls, tuples, sortorder=None, names=None): else: arrays = lzip(*tuples) - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names, + tolerance=tolerance) @classmethod - def from_product(cls, iterables, sortorder=None, names=None): + def from_product(cls, iterables, sortorder=None, names=None, + tolerance=None): """ Make a MultiIndex from the cartesian product of multiple iterables @@ -1378,7 +1387,8 @@ def from_product(cls, iterables, sortorder=None, names=None): labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) + return MultiIndex(levels, labels, sortorder=sortorder, names=names, + tolerance=tolerance) def _sort_levels_monotonic(self): """ @@ -1440,7 +1450,8 @@ def _sort_levels_monotonic(self): return MultiIndex(new_levels, new_labels, names=self.names, sortorder=self.sortorder, - verify_integrity=False) + verify_integrity=False, + tolerance=tolerance) def remove_unused_levels(self): """ @@ -1539,7 +1550,8 @@ def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], labels=[label for label in self.labels], - sortorder=self.sortorder, names=list(self.names)) + sortorder=self.sortorder, names=list(self.names), + tolerance=self.tolerance) return ibase._new_Index, (self.__class__, d), None def __setstate__(self, state): @@ -1550,6 +1562,7 @@ def __setstate__(self, state): labels = state.get('labels') sortorder = state.get('sortorder') names = state.get('names') + tolerance = state.get('tolerance', None) elif isinstance(state, tuple): @@ -1560,6 +1573,7 @@ def __setstate__(self, state): self._set_labels(labels) self._set_names(names) self.sortorder = sortorder + self.tolerance = None self._verify_integrity() self._reset_identity() @@ -2134,7 +2148,7 @@ def _partial_tup_index(self, tup, side='left'): else: return start + section.searchsorted(idx, side=side) - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels as an integer, slice or boolean mask. @@ -2177,6 +2191,8 @@ def get_loc(self, key, method=None): if method is not None: raise NotImplementedError('only the default get_loc method is ' 'currently supported for MultiIndex') + if tolerance is not None: + raise NotImplementedError("Tolerance is not supported for MultiIndex") def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" @@ -2612,7 +2628,7 @@ def truncate(self, before=None, after=None): return MultiIndex(levels=new_levels, labels=new_labels, verify_integrity=False) - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -2621,6 +2637,9 @@ def equals(self, other): -------- equal_levels """ + if tolerance is not None: + raise NotImplementedError("Tolerance is not supported for" + " MultiIndex") if self.is_(other): return True @@ -2675,7 +2694,7 @@ def equal_levels(self, other): return False return True - def union(self, other): + def union(self, other, tolerance=None): """ Form the union of two MultiIndex objects, sorting if possible @@ -2689,6 +2708,10 @@ def union(self, other): >>> index.union(index2) """ + if tolerance is not None: + raise NotImplementedError("Tolerance is not supported for" + " MultiIndex") + self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) @@ -2700,7 +2723,7 @@ def union(self, other): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def intersection(self, other): + def intersection(self, other, tolerance=None): """ Form the intersection of two MultiIndex objects, sorting if possible @@ -2712,6 +2735,10 @@ def intersection(self, other): ------- Index """ + if tolerance is not None: + raise NotImplementedError("Tolerance is not supported for" + " MultiIndex") + self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) @@ -2729,7 +2756,7 @@ def intersection(self, other): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def difference(self, other): + def difference(self, other, tolerance=None): """ Compute sorted set difference of two MultiIndex objects @@ -2737,6 +2764,10 @@ def difference(self, other): ------- diff : MultiIndex """ + if tolerance is not None: + raise NotImplementedError("Tolerance is not supported for" + " MultiIndex") + self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ea392d0b93377..3101e5f9ce1ac 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -33,10 +33,10 @@ class NumericIndex(Index): _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False): + fastpath=False, tolerance=None): if fastpath: - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, tolerance=tolerance) # is_scalar, generators handled in coerce_to_ndarray data = cls._coerce_to_ndarray(data) @@ -52,7 +52,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if name is None and hasattr(data, 'name'): name = data.name - return cls._simple_new(subarr, name=name) + return cls._simple_new(subarr, name=name, tolerance=tolerance) @Appender(_index_shared_docs['_maybe_cast_slice_bound']) def _maybe_cast_slice_bound(self, label, side, kind): @@ -360,7 +360,7 @@ def get_value(self, series, key): return new_values - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two Index objects contain the same elements. """ @@ -370,6 +370,8 @@ def equals(self, other): if not isinstance(other, Index): return False + + # need to compare nans locations and make sure that they are the same # since nans don't compare equal this is a bit tricky try: @@ -378,8 +380,13 @@ def equals(self, other): if (not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape): return False + tolerance = self._choose_tolerance([other], tolerance) left, right = self._ndarray_values, other._ndarray_values - return ((left == right) | (self._isnan & other._isnan)).all() + if tolerance is None: + return ((left == right) | (self._isnan & other._isnan)).all() + else: + return ((np.abs(left - right) <= tolerance) | + (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b315e3ec20830..8b602bcdde3de 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -155,7 +155,7 @@ class PeriodIndex(PeriodArrayMixin, DatelikeOps, DatetimeIndexOpsMixin, TimedeltaIndex : Index of timedelta64 data """ _typ = 'periodindex' - _attributes = ['name', 'freq'] + _attributes = ['name', 'freq', 'tolerance'] # define my properties & methods for delegation _other_ops = [] @@ -177,7 +177,7 @@ class PeriodIndex(PeriodArrayMixin, DatelikeOps, DatetimeIndexOpsMixin, def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, periods=None, tz=None, dtype=None, copy=False, name=None, - **fields): + tolerance=None, **fields): valid_field_set = {'year', 'month', 'day', 'quarter', 'hour', 'minute', 'second'} @@ -217,7 +217,8 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, else: data, freq = cls._generate_range(start, end, periods, freq, fields) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._from_ordinals(data, name=name, freq=freq, + tolerance=tolerance) if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change @@ -228,7 +229,8 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, base2, _ = _gfc(freq) data = period.period_asfreq_arr(data._ndarray_values, base1, base2, 1) - return cls._simple_new(data, name=name, freq=freq) + return cls._simple_new(data, name=name, freq=freq, + tolerance=tolerance) # not array / index if not isinstance(data, (np.ndarray, PeriodIndex, @@ -245,7 +247,8 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, # datetime other than period if is_datetime64_dtype(data.dtype): data = dt64arr_to_periodarr(data, freq, tz) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._from_ordinals(data, name=name, freq=freq, + tolerance=tolerance) # check not floats if infer_dtype(data) == 'floating' and len(data) > 0: @@ -256,7 +259,8 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, data = ensure_object(data) freq = freq or period.extract_freq(data) data = period.extract_ordinals(data, freq) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._from_ordinals(data, name=name, freq=freq, + tolerance=tolerance) @cache_readonly def _engine(self): @@ -277,14 +281,16 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): return cls._from_ordinals(values, name, freq, **kwargs) @classmethod - def _from_ordinals(cls, values, name=None, freq=None, **kwargs): + def _from_ordinals(cls, values, name=None, freq=None, + tolerance=None, **kwargs): """ Values should be int ordinals - `__new__` & `_simple_new` cooerce to ordinals and call this method + `__new__` & `_simple_new` coerce to ordinals and call this method """ result = super(PeriodIndex, cls)._from_ordinals(values, freq) result.name = name + result.tolerance = tolerance result._reset_identity() return result @@ -570,11 +576,16 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, target.freqstr) raise IncompatibleFrequency(msg) + # This needs to be done before target.asi8 because + # _choose_tolerance expects Indexes, not numpy arrays + tolerance = self._choose_tolerance([target], tolerance) + if isinstance(target, PeriodIndex): target = target.asi8 if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) + return Index.get_indexer(self._int64index, target, method, limit, tolerance) @@ -596,6 +607,7 @@ def get_loc(self, key, method=None, tolerance=None): loc : int """ try: + # FIXME: intolerant return self._engine.get_loc(key) except KeyError: if is_integer(key): @@ -619,6 +631,8 @@ def get_loc(self, key, method=None, tolerance=None): if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + else: + tolerance = getattr(self, 'tolerance', None) return self._int64index.get_loc(ordinal, method, tolerance) except KeyError: @@ -743,10 +757,11 @@ def _assert_can_do_setop(self, other): msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - def _wrap_union_result(self, other, result): + def _wrap_union_result(self, other, result, tolerance): name = self.name if self.name == other.name else None result = self._apply_meta(result) result.name = name + result.tolerance = tolerance return result def _apply_meta(self, rawarr): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 939ec0b79ac6b..de161e16b86f5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -66,10 +66,12 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine def __new__(cls, start=None, stop=None, step=None, - dtype=None, copy=False, name=None, fastpath=False): + dtype=None, copy=False, name=None, fastpath=False, + tolerance=None): if fastpath: - return cls._simple_new(start, stop, step, name=name) + return cls._simple_new(start, stop, step, name=name, + tolerance=tolerance) cls._validate_dtype(dtype) @@ -115,7 +117,7 @@ def ensure_int(value, field): else: step = ensure_int(step, 'step') - return cls._simple_new(start, stop, step, name) + return cls._simple_new(start, stop, step, name, tolerance=tolerance) @classmethod def from_range(cls, data, name=None, dtype=None, **kwargs): @@ -312,7 +314,7 @@ def argsort(self, *args, **kwargs): else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other): + def equals(self, other, tolerance=None): """ Determines if two Index objects contain the same elements. """ @@ -326,7 +328,7 @@ def equals(self, other): self._start == other._start and self._step == other._step) - return super(RangeIndex, self).equals(other) + return super(RangeIndex, self).equals(other, tolerance=tolerance) def intersection(self, other): """ @@ -341,6 +343,7 @@ def intersection(self, other): ------- intersection : Index """ + # FIXME: intolerant if not isinstance(other, RangeIndex): return super(RangeIndex, self).intersection(other) @@ -420,6 +423,7 @@ def union(self, other): ------- union : Index """ + # FIXME: intolerant self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other): return self diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 006758f276f87..c148d6756a802 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -167,7 +167,7 @@ def _add_comparison_methods(cls): _engine_type = libindex.TimedeltaEngine _comparables = ['name', 'freq'] - _attributes = ['name', 'freq'] + _attributes = ['name', 'freq', 'tolerance'] _is_numeric_dtype = True _infer_as_myclass = True @@ -175,7 +175,8 @@ def _add_comparison_methods(cls): def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, periods=None, closed=None, dtype=None, copy=False, - name=None, verify_integrity=True): + name=None, verify_integrity=True, + tolerance=None): if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: @@ -192,7 +193,7 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, 'supplied') periods = dtl.validate_periods(periods) return cls._generate_range(start, end, periods, name, freq, - closed=closed) + closed=closed, tolerance=tolerance) if unit is not None: data = to_timedelta(data, unit=unit, box=False) @@ -209,7 +210,8 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, elif copy: data = np.array(data, copy=True) - subarr = cls._simple_new(data, name=name, freq=freq) + subarr = cls._simple_new(data, name=name, freq=freq, + tolerance=tolerance) # check that we are matching freqs if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: @@ -223,13 +225,15 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, return subarr @classmethod - def _generate_range(cls, start, end, periods, name, freq, closed=None): + def _generate_range(cls, start, end, periods, name, freq, closed=None, + tolerance=None): # TimedeltaArray gets `name` via **kwargs, so we need to explicitly # override it if name is passed as a positional argument return super(TimedeltaIndex, cls)._generate_range(start, end, periods, freq, name=name, - closed=closed) + closed=closed, + tolerance=tolerance) @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): @@ -322,7 +326,7 @@ def astype(self, dtype, copy=True): return Index(result.astype('i8'), name=self.name) return super(TimedeltaIndex, self).astype(dtype, copy=copy) - def union(self, other): + def union(self, other, tolerance=None): """ Specialized union for TimedeltaIndex objects. If combine overlapping ranges with the same DateOffset, will be much @@ -336,6 +340,7 @@ def union(self, other): ------- y : Index or TimedeltaIndex """ + # FIXME: intolerant self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: @@ -344,10 +349,12 @@ def union(self, other): pass this, other = self, other + tolerance = this._choose_tolerance([other], tolerance=tolerance) + if this._can_fast_union(other): - return this._fast_union(other) + return this._fast_union(other, tolerance) else: - result = Index.union(this, other) + result = Index.union(this, other, tolerance=tolerance) if isinstance(result, TimedeltaIndex): if result.freq is None: result.freq = to_offset(result.inferred_freq) @@ -368,14 +375,14 @@ def join(self, other, how='left', level=None, return_indexers=False, return_indexers=return_indexers, sort=sort) - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): name = self.name if self.name == other.name else None if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and self._can_fast_union(other)): - joined = self._shallow_copy(joined, name=name) + joined = self._shallow_copy(joined, name=name, tolerance=tolerance) return joined else: - return self._simple_new(joined, name) + return self._simple_new(joined, name, tolerance=tolerance) def _can_fast_union(self, other): if not isinstance(other, TimedeltaIndex): @@ -404,7 +411,7 @@ def _can_fast_union(self, other): # Only need to "adjoin", not overlap return (right_start == left_end + freq) or right_start in left - def _fast_union(self, other): + def _fast_union(self, other, tolerance): if len(other) == 0: return self.view(type(self)) @@ -420,20 +427,22 @@ def _fast_union(self, other): left_end = left[-1] right_end = right[-1] + # FIXME: intolerant # concatenate if left_end < right_end: loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] dates = _concat._concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + return self._shallow_copy(dates, tolerance=tolerance) else: return left - def _wrap_union_result(self, other, result): + def _wrap_union_result(self, other, result, tolerance): name = self.name if self.name == other.name else None - return self._simple_new(result, name=name, freq=None) + return self._simple_new(result, name=name, freq=None, + tolerance=tolerance) - def intersection(self, other): + def intersection(self, other, tolerance=None): """ Specialized intersection for TimedeltaIndex objects. May be much faster than Index.intersection @@ -446,14 +455,17 @@ def intersection(self, other): ------- y : Index or TimedeltaIndex """ + # FIXME: intolerant self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) except (TypeError, ValueError): pass - result = Index.intersection(self, other) + tolerance = self._choose_tolerance([other], tolerance=tolerance) + result = Index.intersection(self, other, tolerance=tolerance) return result + tolerance = self._choose_tolerance([other], tolerance=tolerance) if len(self) == 0: return self @@ -469,11 +481,12 @@ def intersection(self, other): start = right[0] if end < start: - return type(self)(data=[]) + return type(self)(data=[], tolerance=tolerance) else: + # FIXME: intolerant lslice = slice(*left.slice_locs(start, end)) left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) + return self._shallow_copy(left_chunk, tolerance=tolerance) def _maybe_promote(self, other): if other.inferred_type == 'timedelta': @@ -529,6 +542,8 @@ def get_loc(self, key, method=None, tolerance=None): # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + else: + tolerance = getattr(self, 'tolerance', None) if _is_convertible_to_td(key): key = Timedelta(key) @@ -756,7 +771,7 @@ def _is_convertible_to_index(other): def timedelta_range(start=None, end=None, periods=None, freq=None, - name=None, closed=None): + name=None, closed=None, tolerance=None): """ Return a fixed frequency TimedeltaIndex, with day as the default frequency @@ -826,4 +841,5 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, freq = 'D' return TimedeltaIndex(start=start, end=end, periods=periods, - freq=freq, name=name, closed=closed) + freq=freq, name=name, closed=closed, + tolerance=tolerance) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 754703dfc4bee..75f9d3088637e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1264,8 +1264,8 @@ def test_get_indexer_invalid(self): # GH10411 index = Index(np.arange(10)) - with tm.assert_raises_regex(ValueError, 'tolerance argument'): - index.get_indexer([1, 0], tolerance=1) + #with tm.assert_raises_regex(ValueError, 'tolerance argument'): + # index.get_indexer([1, 0], tolerance=1) with tm.assert_raises_regex(ValueError, 'limit argument'): index.get_indexer([1, 0], limit=1) @@ -1408,10 +1408,10 @@ def test_get_loc_bad_tolerance_raises(self): with tm.assert_raises_regex(ValueError, 'must be numeric'): index.get_loc(1.1, 'nearest', tolerance='invalid') - def test_get_loc_tolerance_no_method_raises(self): - index = pd.Index([0, 1, 2]) - with tm.assert_raises_regex(ValueError, 'tolerance .* valid if'): - index.get_loc(1.1, tolerance=1) + #def test_get_loc_tolerance_no_method_raises(self): + # index = pd.Index([0, 1, 2]) + # with tm.assert_raises_regex(ValueError, 'tolerance .* valid if'): + # index.get_loc(1.1, tolerance=1) def test_get_loc_raises_missized_tolerance(self): index = pd.Index([0, 1, 2]) From 9701987f3166b1a1140d13f24b68a3b3de172d26 Mon Sep 17 00:00:00 2001 From: Benjamin Root Date: Tue, 24 Jul 2018 20:35:07 -0400 Subject: [PATCH 2/5] Fixing several problems revealed by CI --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 3 ++- pandas/core/indexes/datetimes.py | 9 +++++---- pandas/core/indexes/multi.py | 6 ++++-- pandas/core/indexes/range.py | 22 +++++++++++++++------- 5 files changed, 27 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 48c312cba9ecc..d417c63ffecc8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -446,7 +446,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, **kwargs) elif inferred == 'period': try: - return PeriodIndex(subarr, name=name, + return PeriodIndex(subarr, name=name, tolerance=tolerance, **kwargs) except IncompatibleFrequency: pass diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index bd6a3d463837d..7554d2ae1d522 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -78,7 +78,8 @@ class CategoricalIndex(Index, accessor.PandasDelegate): def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, tolerance=None): if fastpath: - return cls._simple_new(data, name=name, dtype=dtype, tolerance=tolerance) + return cls._simple_new(data, name=name, dtype=dtype, + tolerance=tolerance) if name is None and hasattr(data, 'name'): name = data.name diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 855961b164207..d4bfd93e9daad 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -955,15 +955,16 @@ def to_perioddelta(self, freq): return to_timedelta(self.asi8 - self.to_period(freq) .to_timestamp().asi8) - def union_many(self, others): + def union_many(self, others, tolerance=None): """ A bit of a hack to accelerate unioning a collection of indexes """ this = self + tolerance = self._choose_tolerance(others, tolerance=tolerance) for other in others: if not isinstance(this, DatetimeIndex): - this = Index.union(this, other) + this = Index.union(this, other, tolerance=tolerance) continue if not isinstance(other, DatetimeIndex): @@ -975,10 +976,10 @@ def union_many(self, others): this, other = this._maybe_utc_convert(other) if this._can_fast_union(other): - this = this._fast_union(other) + this = this._fast_union(other, tolerance) else: tz = this.tz - this = Index.union(this, other) + this = Index.union(this, other, tolerance=tolerance) if isinstance(this, DatetimeIndex): this._tz = timezones.tz_standardize(tz) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 65112c3d6e366..824c44e575ee5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -224,7 +224,8 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, if len(levels) == 0: raise ValueError('Must pass non-zero number of levels/labels') if tolerance is not None: - raise ValueError("MultiIndex does not support non-None tolerances yet") + raise ValueError("MultiIndex does not support non-None" + " tolerances yet") result = object.__new__(MultiIndex) @@ -2192,7 +2193,8 @@ def get_loc(self, key, method=None, tolerance=None): raise NotImplementedError('only the default get_loc method is ' 'currently supported for MultiIndex') if tolerance is not None: - raise NotImplementedError("Tolerance is not supported for MultiIndex") + raise NotImplementedError("Tolerance is not supported for" + " MultiIndex") def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index de161e16b86f5..368a00c18497b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -411,7 +411,7 @@ def _extended_gcd(self, a, b): old_t, t = t, old_t - quotient * t return old_r, old_s, old_t - def union(self, other): + def union(self, other, tolerance=None): """ Form the union of two Index objects and sorts if possible @@ -425,11 +425,15 @@ def union(self, other): """ # FIXME: intolerant self._assert_can_do_setop(other) - if len(other) == 0 or self.equals(other): + tolerance = self._choose_tolerance([other], tolerance=tolerance) + if len(other) == 0 or self.equals(other, tolerance=tolerance): + # FIXME: intolerant return self if len(self) == 0: + # FIXME: intolerant return other if isinstance(other, RangeIndex): + # FIXME: intolerant (how should I implement this?) start_s, step_s = self._start, self._step end_s = self._start + self._step * (len(self) - 1) start_o, step_o = other._start, other._step @@ -450,23 +454,27 @@ def union(self, other): if ((start_s - start_o) % step_s == 0 and (start_s - end_o) <= step_s and (start_o - end_s) <= step_s): - return RangeIndex(start_r, end_r + step_s, step_s) + return RangeIndex(start_r, end_r + step_s, step_s, + tolerance=tolerance) if ((step_s % 2 == 0) and (abs(start_s - start_o) <= step_s / 2) and (abs(end_s - end_o) <= step_s / 2)): - return RangeIndex(start_r, end_r + step_s / 2, step_s / 2) + return RangeIndex(start_r, end_r + step_s / 2, step_s / 2, + tolerance=tolerance) elif step_o % step_s == 0: if ((start_o - start_s) % step_s == 0 and (start_o + step_s >= start_s) and (end_o - step_s <= end_s)): - return RangeIndex(start_r, end_r + step_s, step_s) + return RangeIndex(start_r, end_r + step_s, step_s, + tolerance=tolerance) elif step_s % step_o == 0: if ((start_s - start_o) % step_o == 0 and (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): - return RangeIndex(start_r, end_r + step_o, step_o) + return RangeIndex(start_r, end_r + step_o, step_o, + tolerance=tolerance) - return self._int64index.union(other) + return self._int64index.union(other, tolerance=tolerance) @Appender(_index_shared_docs['join']) def join(self, other, how='left', level=None, return_indexers=False, From 67e476f1d33f348fa8a3c593e0bb627b076c2f50 Mon Sep 17 00:00:00 2001 From: Benjamin Root Date: Wed, 25 Jul 2018 10:41:33 -0400 Subject: [PATCH 3/5] Fix a typo and did some rearranging --- pandas/core/indexes/api.py | 4 ++-- pandas/core/indexes/multi.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index ffb7b4ffddb2d..88f5399346f3e 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -101,10 +101,9 @@ def conv(i): return Index( lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) - tolerance = indexes[0]._choose_tolerance(indexes[1:], tolerance) if kind == 'special': result = indexes[0] - + tolerance = result._choose_tolerance(indexes[1:], tolerance=tolerance) if hasattr(result, 'union_many'): return result.union_many(indexes[1:], tolerance=tolerance) else: @@ -113,6 +112,7 @@ def conv(i): return result elif kind == 'array': index = indexes[0] + tolerance = index._choose_tolerance(indexes[1:], tolerance=tolerance) for other in indexes[1:]: if not index.equals(other, tolerance=tolerance): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 824c44e575ee5..9f42b21626751 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1452,7 +1452,7 @@ def _sort_levels_monotonic(self): return MultiIndex(new_levels, new_labels, names=self.names, sortorder=self.sortorder, verify_integrity=False, - tolerance=tolerance) + tolerance=self.tolerance) def remove_unused_levels(self): """ From 623038e2cd9295628668d1a1d5497aedb36a38ed Mon Sep 17 00:00:00 2001 From: Benjamin Root Date: Wed, 25 Jul 2018 11:52:57 -0400 Subject: [PATCH 4/5] More plumbing --- pandas/core/indexes/base.py | 26 +++++++++++++++++--------- pandas/core/indexes/interval.py | 4 ++-- pandas/core/indexes/multi.py | 4 ++-- pandas/core/indexes/numeric.py | 8 ++++---- pandas/core/indexes/range.py | 19 +++++++++++-------- 5 files changed, 36 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d417c63ffecc8..532591bc33b01 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3983,7 +3983,8 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - + tolerance = self._choose_tolerance([other]) + # FIXME: intolerant left_idx, right_idx = _get_join_indexers([self._ndarray_values], [other._ndarray_values], how=how, @@ -3996,7 +3997,7 @@ def _join_non_unique(self, other, how='left', return_indexers=False): mask = left_idx == -1 np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) - join_index = self._wrap_joined_index(join_index, other) + join_index = self._wrap_joined_index(join_index, other, tolerance) if return_indexers: return join_index, left_idx, right_idx @@ -4133,9 +4134,12 @@ def _get_leaf_sorter(labels): else: return join_index - def _join_monotonic(self, other, how='left', return_indexers=False): - if self.equals(other): + def _join_monotonic(self, other, how='left', return_indexers=False, + tolerance=None): + tolerance = self._choose_tolerance([other], tolerance=tolerance) + if self.equals(other, tolerance=tolerance): ret_index = other if how == 'right' else self + # FIXME: intolerant, need to set the used tolerance if return_indexers: return ret_index, None, None else: @@ -4145,6 +4149,7 @@ def _join_monotonic(self, other, how='left', return_indexers=False): ov = other._ndarray_values if self.is_unique and other.is_unique: + # FIXME: intolerant # We can perform much better than the general case if how == 'left': join_index = self @@ -4156,11 +4161,14 @@ def _join_monotonic(self, other, how='left', return_indexers=False): ridx = None elif how == 'inner': join_index, lidx, ridx = self._inner_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + join_index = self._wrap_joined_index(join_index, other, + tolerance) elif how == 'outer': join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + join_index = self._wrap_joined_index(join_index, other, + tolerance) else: + # FIXME: intolerant if how == 'left': join_index, lidx, ridx = self._left_indexer(sv, ov) elif how == 'right': @@ -4169,7 +4177,7 @@ def _join_monotonic(self, other, how='left', return_indexers=False): join_index, lidx, ridx = self._inner_indexer(sv, ov) elif how == 'outer': join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + join_index = self._wrap_joined_index(join_index, other, tolerance) if return_indexers: lidx = None if lidx is None else ensure_platform_int(lidx) @@ -4178,9 +4186,9 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return join_index - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): name = self.name if self.name == other.name else None - return Index(joined, name=name) + return Index(joined, name=name, tolerance=tolerance) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 78d649a8bfc99..67640864670b0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -650,7 +650,7 @@ def get_loc(self, key, method=None, tolerance=None): >>> overlapping_index.get_loc(1.5) array([0, 1], dtype=int64) """ - # FIXME: mostly intolerant + # FIXME: intolerant self._check_method(method) original_key = key @@ -660,7 +660,7 @@ def get_loc(self, key, method=None, tolerance=None): if isinstance(key, Interval): left = self._maybe_cast_slice_bound(key.left, 'left', None) right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed, tolerance=tolerance) + key = Interval(left, right, key.closed) else: key = self._maybe_cast_slice_bound(key, 'left', None) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9f42b21626751..b3c6a3e8c8af3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2892,9 +2892,9 @@ def _bounds(self): return self.__bounds - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): names = self.names if self.names == other.names else None - return MultiIndex.from_tuples(joined, names=names) + return MultiIndex.from_tuples(joined, names=names, tolerance=tolerance) @Appender(Index.isin.__doc__) def isin(self, values, level=None): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 3101e5f9ce1ac..63cdb2119c54f 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -186,9 +186,9 @@ def _convert_scalar_indexer(self, key, kind=None): return (super(Int64Index, self) ._convert_scalar_indexer(key, kind=kind)) - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): name = self.name if self.name == other.name else None - return Int64Index(joined, name=name) + return Int64Index(joined, name=name, tolerance=tolerance) @classmethod def _assert_safe_casting(cls, data, subarr): @@ -263,9 +263,9 @@ def _convert_index_indexer(self, keyarr): return keyarr.astype(np.uint64) return keyarr - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index(self, joined, other, tolerance): name = self.name if self.name == other.name else None - return UInt64Index(joined, name=name) + return UInt64Index(joined, name=name, tolerance=tolerance) @classmethod def _assert_safe_casting(cls, data, subarr): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 368a00c18497b..0f17990b2f582 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -330,7 +330,7 @@ def equals(self, other, tolerance=None): return super(RangeIndex, self).equals(other, tolerance=tolerance) - def intersection(self, other): + def intersection(self, other, tolerance=None): """ Form the intersection of two Index objects. Sortedness of the result is not guaranteed @@ -343,22 +343,25 @@ def intersection(self, other): ------- intersection : Index """ - # FIXME: intolerant if not isinstance(other, RangeIndex): - return super(RangeIndex, self).intersection(other) + return super(RangeIndex, self).intersection(other, + tolerance=tolerance) + tolerance = self._choose_tolerance([other], tolerance=tolerance) if not len(self) or not len(other): - return RangeIndex._simple_new(None) + return RangeIndex._simple_new(None, tolerance=tolerance) first = self[::-1] if self._step < 0 else self second = other[::-1] if other._step < 0 else other + # FIXME: intolerant + # check whether intervals intersect # deals with in- and decreasing ranges int_low = max(first._start, second._start) int_high = min(first._stop, second._stop) if int_high <= int_low: - return RangeIndex._simple_new(None) + return RangeIndex._simple_new(None, tolerance=tolerance) # Method hint: linear Diophantine equation # solve intersection problem @@ -368,14 +371,15 @@ def intersection(self, other): # check whether element sets intersect if (first._start - second._start) % gcd: - return RangeIndex._simple_new(None) + return RangeIndex._simple_new(None, tolerance=tolerance) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds tmp_start = first._start + (second._start - first._start) * \ first._step // gcd * s new_step = first._step * second._step // gcd - new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True, + tolerance=tolerance) # adjust index to limiting interval new_index._start = new_index._min_fitting_element(int_low) @@ -423,7 +427,6 @@ def union(self, other, tolerance=None): ------- union : Index """ - # FIXME: intolerant self._assert_can_do_setop(other) tolerance = self._choose_tolerance([other], tolerance=tolerance) if len(other) == 0 or self.equals(other, tolerance=tolerance): From c3e583cc4a618f58b96d26bd3f39bd92a2ca430b Mon Sep 17 00:00:00 2001 From: Benjamin Root Date: Wed, 25 Jul 2018 15:19:09 -0400 Subject: [PATCH 5/5] More plumbing and fixing * took care of wrappers in datetimes and interval * fix tolerance handling in extended dtype index construction * fix unpickling of old pickles and a bug in numeric index unpickling * fix tolerance for constructor delegation in `__new__`. --- pandas/core/indexes/base.py | 19 ++++++++++++++----- pandas/core/indexes/datetimes.py | 4 ++-- pandas/core/indexes/interval.py | 9 ++++++--- pandas/core/indexes/multi.py | 3 ++- pandas/core/indexes/period.py | 1 + pandas/core/indexes/range.py | 16 +++++++++------- 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 532591bc33b01..351d52db93b96 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -293,7 +293,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # coerce to the object dtype data = data.astype(object) return Index(data, dtype=object, copy=copy, name=name, - **kwargs) + tolerance=tolerance, **kwargs) # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -306,7 +306,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, dtype=dtype, tolerance=tolerance, **kwargs) if dtype is not None and is_dtype_equal(_o_dtype, dtype): - return Index(result.to_pydatetime(), dtype=_o_dtype) + return Index(result.to_pydatetime(), dtype=_o_dtype, + tolerance=tolerance) else: return result @@ -316,7 +317,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, result = TimedeltaIndex(data, copy=copy, name=name, tolerance=tolerance, **kwargs) if dtype is not None and _o_dtype == dtype: - return Index(result.to_pytimedelta(), dtype=_o_dtype) + return Index(result.to_pytimedelta(), dtype=_o_dtype, + tolerance=tolerance) else: return result @@ -468,7 +470,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # 10697 from .multi import MultiIndex return MultiIndex.from_tuples( - data, names=name or kwargs.get('names')) + data, names=name or kwargs.get('names'), + tolerance=tolerance) # other iterable of some kind subarr = com.asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, @@ -1925,6 +1928,7 @@ def __setstate__(self, state): if isinstance(state, dict): self._data = state.pop('data') + self.tolerance = state.pop('tolerance', None) for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1940,6 +1944,7 @@ def __setstate__(self, state): data = np.empty(state) np.ndarray.__setstate__(data, state) + self.tolerance = None self._data = data self._reset_identity() else: @@ -2430,6 +2435,10 @@ def identical(self, other): """Similar to equals, but check that other comparable attributes are also equal """ + # FIXME: Should two indexes with different tolerances but are + # otherwise identical be considered identical? + # FIXME: Should this equality check take into account tolerances, + # or should it be forced to be strict (tolerance=0)? return (self.equals(other) and all((getattr(self, c, None) == getattr(other, c, None) for c in self._comparables)) and @@ -4563,7 +4572,7 @@ def unique(self, level=None): if level is not None: self._validate_index_level(level) result = super(Index, self).unique() - return self._shallow_copy(result) + return self._shallow_copy(result, tolerance=self.tolerance) def drop_duplicates(self, keep='first'): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d4bfd93e9daad..677db0337f1fc 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -64,7 +64,7 @@ def f(self): result = fget(self) if is_bool_dtype(result): return result - return Index(result, name=self.name) + return Index(result, name=self.name, tolerance=self.tolerance) f.__name__ = name f.__doc__ = fget.__doc__ @@ -76,7 +76,7 @@ def _wrap_in_index(name): def func(self, *args, **kwargs): result = meth(self, *args, **kwargs) - return Index(result, name=self.name) + return Index(result, name=self.name, tolerance=self.tolerance) func.__doc__ = meth.__doc__ func.__name__ = name diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 67640864670b0..2fab7d0105db4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -988,7 +988,7 @@ def equals(self, other, tolerance=None): self.closed == other.closed) def _setop(op_name): - def func(self, other): + def func(self, other, tolerance=None): other = self._as_like_interval_index(other) # GH 19016: ensure set op will not return a prohibited dtype @@ -999,8 +999,10 @@ def func(self, other): 'objects that have compatible dtypes') raise TypeError(msg.format(op=op_name)) - result = getattr(self._multiindex, op_name)(other._multiindex) + result = getattr(self._multiindex, op_name)(other._multiindex, + tolerance=tolerance) result_name = self.name if self.name == other.name else None + result_tolerance = result.tolerance # GH 19101: ensure empty results have correct dtype if result.empty: @@ -1009,7 +1011,8 @@ def func(self, other): result = result.values return type(self).from_tuples(result, closed=self.closed, - name=result_name) + name=result_name, + tolerance=result_tolerance) return func union = _setop('union') diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b3c6a3e8c8af3..3b7f9ceddb6e9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1569,12 +1569,13 @@ def __setstate__(self, state): nd_state, own_state = state levels, labels, sortorder, names = own_state + tolerance = None self._set_levels([Index(x) for x in levels], validate=False) self._set_labels(labels) self._set_names(names) self.sortorder = sortorder - self.tolerance = None + self.tolerance = tolerance self._verify_integrity() self._reset_identity() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8b602bcdde3de..92058f62abcd5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -811,6 +811,7 @@ def __setstate__(self, state): np.ndarray.__setstate__(self, state) self._data = data + self.tolerance = None else: raise Exception("invalid pickle state") diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0f17990b2f582..576ba463039da 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -17,7 +17,7 @@ import pandas.core.common as com from pandas.core import ops -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.util._decorators import Appender, cache_readonly import pandas.core.dtypes.concat as _concat import pandas.core.indexes.base as ibase @@ -80,7 +80,8 @@ def __new__(cls, start=None, stop=None, step=None, if name is None: name = start.name return cls._simple_new(name=name, - **dict(start._get_data_as_items())) + **dict(start._get_data_as_items(), + tolerance=tolerance)) # validate the arguments def ensure_int(value, field): @@ -174,7 +175,8 @@ def _data(self): @cache_readonly def _int64index(self): - return Int64Index(self._data, name=self.name, fastpath=True) + return Int64Index(self._data, name=self.name, fastpath=True, + tolerance=self.tolerance) def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ @@ -321,6 +323,7 @@ def equals(self, other, tolerance=None): if isinstance(other, RangeIndex): ls = len(self) lo = len(other) + # FIXME: intolerant return (ls == lo == 0 or ls == lo == 1 and self._start == other._start or @@ -428,13 +431,12 @@ def union(self, other, tolerance=None): union : Index """ self._assert_can_do_setop(other) + other = ensure_index(other) tolerance = self._choose_tolerance([other], tolerance=tolerance) if len(other) == 0 or self.equals(other, tolerance=tolerance): - # FIXME: intolerant - return self + return self._shallow_copy(tolerance=tolerance) if len(self) == 0: - # FIXME: intolerant - return other + return other._shallow_copy(tolerance=tolerance) if isinstance(other, RangeIndex): # FIXME: intolerant (how should I implement this?) start_s, step_s = self._start, self._step